In [8]:
import pandas as pd

# Compare one submitted prediction per sample (file1) against the spread of
# several predictions per sample (file2).
# NOTE(review): the rows of the two files are paired purely by position. In the
# displayed data df1['sampleId'] holds array IDs while the first column of df2
# holds GSM accessions, so nothing here verifies that row i of both files is
# the same sample -- TODO confirm both files share the same row order.

# Read the first CSV file (columns used below: sampleId, predictedAge)
file1 = '/home/seth/longevity/data/challenge_submissions/June/LucasCamillo.csv'
df1 = pd.read_csv(file1, delimiter=',')

# Read the second CSV file (first column is an ID; every later column is
# treated as a prediction for that sample)
file2 = '/home/seth/longevity/boa_challenge/phase1/lucas_submission/tutorials/files/all_challenge_preds.csv'
df2 = pd.read_csv(file2, delimiter=',')

# Pin down the prediction columns once, BEFORE any derived column is appended.
# Slicing df2.iloc[:, 1:] again after adding the average would feed the mean
# into max/min; floating-point rounding can put a mean marginally outside
# [min, max], which would inflate the range.
prediction_cols = df2.columns[1:]
predictions = df2[prediction_cols]

# Calculate the average predicted age for each sample in the second file
df2['average_predicted_age'] = predictions.mean(axis=1)

# Calculate the range (max - min) of predicted ages for each sample in the second file
df2['predicted_age_range'] = predictions.max(axis=1) - predictions.min(axis=1)

# Ensure both dataframes have the same length
assert len(df1) == len(df2), "The number of samples in both files must be the same"

# Merge the dataframes by their order (both carry the default 0..n-1 index
# from read_csv, so index alignment here is positional)
merged_df = pd.DataFrame({
    'sampleId': df1['sampleId'],
    'predictedAge': df1['predictedAge'],
    'average_predicted_age': df2['average_predicted_age'],
    'predicted_age_range': df2['predicted_age_range']
})

# Signed difference: submitted prediction minus the average of the second file
merged_df['difference'] = merged_df['predictedAge'] - merged_df['average_predicted_age']

# Difference expressed in units of the per-sample prediction range.
# A zero range yields inf (or NaN for 0/0) rather than raising.
merged_df['difference_vs_range'] = merged_df['difference'] / merged_df['predicted_age_range']

# Column order shared by the printed report and the saved CSV
report_cols = ['sampleId', 'predictedAge', 'average_predicted_age', 'difference', 'predicted_age_range', 'difference_vs_range']

# Display the results
print(merged_df[report_cols])

# Optionally, save the results to a new CSV file
output_file = 'predicted_age_comparison.csv'
merged_df[report_cols].to_csv(output_file, index=False)


                sampleId  predictedAge  average_predicted_age  difference  \
0    207700470022_R08C01         33.00              32.222053    0.777947   
1    207686150104_R02C01         80.60              80.153782    0.446218   
2    207700460080_R08C01         61.88              60.035631    1.844369   
3    207700470049_R01C01         56.88              56.038372    0.841628   
4    207805820146_R07C01         25.84              24.873471    0.966529   
..                   ...           ...                    ...         ...   
495  207700460135_R07C01         21.66              21.842174   -0.182174   
496  207686140150_R01C01         59.40              53.971309    5.428691   
497  207700460101_R06C01         57.10              55.131277    1.968723   
498  207700460136_R04C01         82.60              83.805941   -1.205941   
499  207700460034_R08C01         80.60              82.177492   -1.577492   

     predicted_age_range  difference_vs_range  
0               0.471745   

In [5]:
# Display df1 (the submission frame; columns sampleId and predictedAge) for a visual check.
df1

Unnamed: 0,sampleId,predictedAge
0,207700470022_R08C01,33.00
1,207686150104_R02C01,80.60
2,207700460080_R08C01,61.88
3,207700470049_R01C01,56.88
4,207805820146_R07C01,25.84
...,...,...
495,207700460135_R07C01,21.66
496,207686140150_R01C01,59.40
497,207700460101_R06C01,57.10
498,207700460136_R04C01,82.60


In [6]:
# Display df2 for a visual check. The captured output includes the derived
# average_predicted_age / predicted_age_range columns, so this was presumably
# run after a comparison cell had added them -- verify execution order.
df2

Unnamed: 0,sampleId,predictedAge0,predictedAge1,predictedAge2,predictedAge3,predictedAge4,average_predicted_age,predicted_age_range
0,GSM7866964,32.252290,32.086660,32.260048,32.019760,32.491505,32.222053,0.471745
1,GSM7866965,80.248360,79.978240,80.223330,80.327730,79.991250,80.153782,0.349490
2,GSM7866966,60.288220,59.892582,60.026993,59.951366,60.018993,60.035631,0.395638
3,GSM7866967,55.995674,56.009296,56.036877,56.068558,56.081455,56.038372,0.085781
4,GSM7866968,24.643246,24.984373,24.596039,25.120522,25.023174,24.873471,0.524483
...,...,...,...,...,...,...,...,...
495,GSM7867459,21.972443,21.795639,21.837929,21.609898,21.994960,21.842174,0.385062
496,GSM7867460,54.116062,53.935585,53.942493,53.896156,53.966250,53.971309,0.219906
497,GSM7867461,55.135906,55.037730,55.174360,55.309580,54.998810,55.131277,0.310770
498,GSM7867462,83.769530,83.752266,83.787810,83.952660,83.767440,83.805941,0.200394


In [2]:
# Build a lookup table from two tab-separated rows of the GEO series-matrix file.
geo_matrix_file = '/home/seth/longevity/boa_challenge/GSE246337_series_matrix.txt'

# Start from an empty mapping; it is replaced below once both rows are parsed
geo_dict = {}


def _unquoted_fields(raw_line):
    """Split a tab-separated row, drop its first column, and strip double quotes from the ends of each remaining field."""
    return [field.strip('"') for field in raw_line.strip().split('\t')[1:]]


# Load every line of the file; only two rows are needed, addressed by position
with open(geo_matrix_file, 'r') as file:
    lines = file.readlines()

# Row 31 of the file (index 30) supplies the values and row 49 (index 48) the keys.
# In the printed result the keys look like array IDs and the values like GSM accessions.
values_line = _unquoted_fields(lines[30])
keys_line = _unquoted_fields(lines[48])

# Pair keys with values position by position
geo_dict = dict(zip(keys_line, values_line))

# Print the dictionary to verify
print(geo_dict)



{'207700470022_R08C01': 'GSM7866964', '207686150104_R02C01': 'GSM7866965', '207700460080_R08C01': 'GSM7866966', '207700470049_R01C01': 'GSM7866967', '207805820146_R07C01': 'GSM7866968', '207700470041_R01C01': 'GSM7866969', '207700470049_R03C01': 'GSM7866970', '207700470024_R05C01': 'GSM7866971', '207705770040_R06C01': 'GSM7866972', '207686140150_R03C01': 'GSM7866973', '207700470041_R08C01': 'GSM7866974', '207700460101_R02C01': 'GSM7866975', '207805820117_R07C01': 'GSM7866976', '207700470036_R08C01': 'GSM7866977', '207686140125_R08C01': 'GSM7866978', '207705770089_R06C01': 'GSM7866979', '207700460084_R05C01': 'GSM7866980', '207700470022_R01C01': 'GSM7866981', '207700470011_R08C01': 'GSM7866982', '207700460115_R08C01': 'GSM7866983', '207805820161_R01C01': 'GSM7866984', '207700460111_R08C01': 'GSM7866985', '207700460115_R02C01': 'GSM7866986', '207700470010_R04C01': 'GSM7866987', '207700460136_R06C01': 'GSM7866988', '207700460101_R04C01': 'GSM7866989', '207700460019_R04C01': 'GSM7866990', 

In [16]:
# Spot-check a single array ID. Plain dict indexing raises KeyError when the ID
# is not a key of geo_dict, which is what the captured output below shows.
geo_dict["207700460115_R07C01"]

KeyError: '207700460115_R07C01'

In [4]:
import pandas as pd
# Compare the final submission (file1) against the spread of several
# predictions per sample (file2), pairing rows by sample ID.
# Requires `geo_dict` (array ID -> ID used in file2) to be defined by an
# earlier cell.

# Read the first CSV file (columns used below: sampleId, predictedAge)
file1 = '/home/seth/longevity/data/challenge/final_submissions/lucascamillo.csv'
df1 = pd.read_csv(file1, delimiter=',')

# Read the second CSV file (first column is an ID; every later column is
# treated as a prediction for that sample)
file2 = '/home/seth/longevity/boa_challenge/phase1/lucas_submission/tutorials/files/all_challenge_preds.csv'
df2 = pd.read_csv(file2, delimiter=',')

# Map the sample IDs in df1 to the sample IDs in df2 using the dictionary.
# IDs absent from geo_dict map to NaN and are reported as missing below.
df1['mapped_sampleId'] = df1['sampleId'].map(geo_dict)

# Find the sample IDs that do not exist in df2
missing_samples = df1[~df1['mapped_sampleId'].isin(df2.iloc[:, 0])]
num_missing_samples = len(missing_samples)

# Output the number of missing samples and their IDs
print(f"Number of missing samples: {num_missing_samples}")
print("Missing sample IDs:")
print(missing_samples[['sampleId', 'mapped_sampleId']])

# If no samples are missing, proceed with the calculations
if num_missing_samples == 0:
    # Reorder df2 based on the mapped sample IDs in df1
    df2 = df2.set_index(df2.columns[0]).loc[df1['mapped_sampleId']].reset_index()

    # Duplicate IDs in df2 would make .loc return extra rows, which would
    # silently misalign the index-based merge below -- fail loudly instead.
    assert len(df1) == len(df2), "The number of samples in both files must be the same"

    # Pin down the prediction columns once, BEFORE any derived column is
    # appended. Slicing df2.iloc[:, 1:] again after adding the average would
    # feed the mean into max/min; floating-point rounding can put a mean
    # marginally outside [min, max], which would inflate the range.
    prediction_cols = df2.columns[1:]
    predictions = df2[prediction_cols]

    # Calculate the average predicted age for each sample in the second file
    df2['average_predicted_age'] = predictions.mean(axis=1)

    # Calculate the range (max - min) of predicted ages for each sample in the second file
    df2['predicted_age_range'] = predictions.max(axis=1) - predictions.min(axis=1)

    # Merge the dataframes by their order (df2 was reordered to follow df1
    # and both carry a 0..n-1 index, so index alignment is positional)
    merged_df = pd.DataFrame({
        'sampleId': df1['sampleId'],
        'predictedAge': df1['predictedAge'],
        'average_predicted_age': df2['average_predicted_age'],
        'predicted_age_range': df2['predicted_age_range']
    })

    # Signed difference: submitted prediction minus the average of the second file
    merged_df['difference'] = merged_df['predictedAge'] - merged_df['average_predicted_age']

    # Difference expressed in units of the per-sample prediction range.
    # A zero range yields inf (or NaN for 0/0) rather than raising.
    merged_df['difference_vs_range'] = merged_df['difference'] / merged_df['predicted_age_range']

    # Column order shared by the printed report and the saved CSV
    report_cols = ['sampleId', 'predictedAge', 'average_predicted_age', 'difference', 'predicted_age_range', 'difference_vs_range']

    # Display the results
    print(merged_df[report_cols])

    # Optionally, save the results to a new CSV file
    output_file = 'predicted_age_comparison.csv'
    merged_df[report_cols].to_csv(output_file, index=False)


Number of missing samples: 0
Missing sample IDs:
Empty DataFrame
Columns: [sampleId, mapped_sampleId]
Index: []
                sampleId  predictedAge  average_predicted_age  difference  \
0    207700470022_R08C01         32.16              32.222053   -0.062053   
1    207686150104_R02C01         80.20              80.153782    0.046218   
2    207700460080_R08C01         59.97              60.035631   -0.065631   
3    207700470049_R01C01         56.00              56.038372   -0.038372   
4    207805820146_R07C01         25.10              24.873471    0.226529   
..                   ...           ...                    ...         ...   
495  207700460135_R07C01         21.70              21.842174   -0.142174   
496  207686140150_R01C01         54.00              53.971309    0.028691   
497  207700460101_R06C01         55.25              55.131277    0.118723   
498  207700460136_R04C01         83.75              83.805941   -0.055941   
499  207700460034_R08C01         82.10   

In [6]:
import pandas as pd

# Compare the ZetaPartition final submission with the phase-1 submission file.

# Load the first file and give its prediction column a file-specific name
file1 = '/home/seth/longevity/data/challenge/final_submissions/ZetaPartition.csv'
df1 = pd.read_csv(file1).rename(columns={'predictedAge': 'predicted_age_file1'})

# Load the second file and do the same for its prediction column
file2 = '/home/seth/longevity/boa_challenge/phase1/zeta_partition/submission/submission.csv'
df2 = pd.read_csv(file2).rename(columns={'predicted_age': 'predicted_age_file2'})

# Join on sampleId. The default join is inner, so only samples present in
# both files are compared.
merged_df = df1.merge(df2, on='sampleId')

# Signed per-sample difference (file1 minus file2)
merged_df['difference'] = merged_df['predicted_age_file1'] - merged_df['predicted_age_file2']

# Summary statistics of the signed difference. These are NOT absolute
# differences, so positive and negative deviations can offset each other in
# the mean, and the maximum is the largest positive deviation.
average_difference = merged_df['difference'].mean()
max_difference = merged_df['difference'].max()
median_difference = merged_df['difference'].median()

# Report the statistics
print(f"Average difference: {average_difference}")
print(f"Maximum difference: {max_difference}")
print(f"Median difference: {median_difference}")

# Persist the joined table, including the difference column
output_file = 'predicted_age_differences.csv'
merged_df.to_csv(output_file, index=False)


Average difference: 0.0005657920000001297
Maximum difference: 0.0131149999999991
Median difference: 0.0002935000000050536
