In [24]:
# Merge the two per-allele columns of each variant in genotypes.txt into a
# single genotype call (e.g. "C" + "T" -> "CT") and write the result to
# genotype_merged.txt, one row per sample.

# Read the input genotypes.txt file
with open("genotypes.txt", "r") as input_file:
    lines = input_file.readlines()

if not lines:
    raise ValueError("genotypes.txt is empty: expected a header line")

# Extract header line
header = lines[0].strip().split()

# Process each line of data and merge alleles into genotype calls
genotype_calls = []
for line in lines[1:]:
    # Remove only the line terminator. A bare strip() also removes trailing
    # tabs, which drops empty (missing) allele fields at the end of a row and
    # can leave the final allele without a partner.
    record = line.rstrip("\r\n")
    if not record.strip():
        continue  # ignore blank lines such as a trailing newline at end of file
    parts = record.split("\t")
    iid = parts[0].strip()
    alleles = [allele.strip() for allele in parts[1:]]
    if len(alleles) % 2:
        # Unpaired final allele: treat its partner as missing rather than
        # failing with IndexError.
        alleles.append("")
    # Pair consecutive allele columns (1st with 2nd, 3rd with 4th, ...)
    genotypes = [first + second for first, second in zip(alleles[0::2], alleles[1::2])]
    genotype_calls.append([iid] + genotypes)

# Modify the header to remove "_1" and "_2" suffixes
# (every second header field, starting at index 1, names one variant)
modified_header = ["#IID"] + [rs.split("_")[0] for rs in header[1::2]]

# Write the merged genotype calls to genotype_merged.txt
with open("genotype_merged.txt", "w") as output_file:
    output_file.write("\t".join(modified_header) + "\n")
    for call in genotype_calls:
        output_file.write("\t".join(call) + "\n")

print("Genotype calls merged and saved as genotype_merged.txt")

Genotype calls merged and saved as genotype_merged.txt


In [25]:
import pandas as pd

# Load the tab-delimited phenotype table into a DataFrame
phenotypes = pd.read_csv("phenotypes.txt", delimiter="\t")

In [26]:
# Build one boolean mask per criterion, then combine them.
# Gender == 0 is taken to mean female, as the original selection assumed.
is_female = phenotypes["Gender"] == 0
is_under_60 = phenotypes["Age"] < 60
has_type2_diabetes = phenotypes["Type.II.Diabetes"] == 1

# Female patients under 60 with Type II Diabetes
filtered_phenotypes = phenotypes.loc[is_female & is_under_60 & has_type2_diabetes]

# Patient IDs of the selected cohort, as a plain Python list
selected_ids = filtered_phenotypes.loc[:, "Arb_ID"].tolist()

In [27]:
# Display the selected phenotype rows to sanity-check the cohort filter
filtered_phenotypes

Unnamed: 0,Arb_ID,Age,Gender,BMI,Type.II.Diabetes
4,55979726895,38.0,0.0,60.235667,1.0
32,59135084867,27.0,0.0,59.559074,1.0
33,5654467294,41.0,0.0,,1.0
36,75132640171,51.0,0.0,60.543456,1.0
42,31073992919,46.0,0.0,,1.0
47,76985507846,50.0,0.0,60.890339,1.0


In [28]:
# Load the merged genotype calls (tab-delimited, written by the merge step)
genotypes = pd.read_csv("genotype_merged.txt", delimiter="\t")

# Show the table as the cell result
genotypes

Unnamed: 0,#IID,rs1421085,rs7074440,rs9273367,rs9275530,rs9368219
0,1534210213,TT,GG,AT,GG,CC
1,1549214769,CC,AA,AT,GG,CC
2,3669281929,TT,GG,AT,GG,CT
3,5654467294,TT,AG,AT,GG,CC
4,7514162311,CT,AA,AT,GG,CT
5,7718430122,CC,GG,AT,GG,CC
6,8094374023,CT,GG,AA,GG,CC
7,9156030645,CT,AA,AT,GG,CT
8,11445637833,CT,GG,AA,GG,
9,14998017455,TT,GG,TT,GG,TT


In [30]:
# Keep only the genotype rows whose sample ID belongs to the selected cohort
is_selected_sample = genotypes["#IID"].isin(selected_ids)
selected_genotypes = genotypes.loc[is_selected_sample]

# Show the matching rows as the cell result
selected_genotypes

Unnamed: 0,#IID,rs1421085,rs7074440,rs9273367,rs9275530,rs9368219
3,5654467294,TT,AG,AT,GG,CC
18,31073992919,CC,GG,AT,GC,CC
30,55979726895,CC,AG,AT,GG,CC
32,59135084867,CC,GG,,GG,CC
43,75132640171,CC,GG,TT,GG,CT
44,76985507846,CT,AG,AT,CC,CC


In [31]:
# Question 3
# Save one bar chart per variant showing how often each genotype call occurs
# among the selected patients (files are named "<variant>_plot.png").
import matplotlib.pyplot as plt

# Remove IID column from selected_genotypes.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop() would raise KeyError.
selected_genotypes = selected_genotypes.drop(columns=["#IID"], errors="ignore")

# Plot genotype distribution for each variant
for col in selected_genotypes.columns:
    # value_counts() leaves out missing calls by default
    genotype_counts = selected_genotypes[col].value_counts()
    if genotype_counts.empty:
        # Nothing to draw for a variant with no non-missing calls; skip it
        # instead of letting the plot call fail and abort the whole loop.
        continue

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        genotype_counts.plot(kind="bar", ax=ax, rot=0)
        ax.set_title(f"Genotype Distribution for {col}")
        ax.set_xlabel("Genotype")
        ax.set_ylabel("Frequency")
        fig.tight_layout()
        fig.savefig(f"{col}_plot.png")
    finally:
        # Always release the figure, even if plotting or saving fails
        plt.close(fig)

In [47]:
#Q4

import pandas as pd
import numpy as np

# Reload both tables from disk for this question
phenotypes = pd.read_csv("phenotypes.txt", sep="\t")
genotype_merged = pd.read_csv("genotype_merged.txt", sep="\t")

# Male patients (Gender == 1 is taken to mean male) and the "older than 35" mask
male_patients = phenotypes.loc[phenotypes["Gender"] == 1]
age_condition = male_patients["Age"] > 35

# BMI quartile cut points, computed over all patients (not only males)
bmi_quartiles = phenotypes["BMI"].quantile([0.25, 0.5, 0.75])
third_quartile_bmi = bmi_quartiles.loc[0.75]

# Mask of male patients whose BMI is at or above the 75th-percentile cut point.
# NOTE(review): this selects values >= Q3 (the top quarter); if "3rd quartile"
# is meant as the band between the median and Q3, this mask needs changing - confirm.
bmi_3rd_quartile_condition = male_patients["BMI"] >= third_quartile_bmi

# IDs of genotype rows that contain at least one missing value
has_missing_call = genotype_merged.isnull().any(axis=1)
missing_genotype_patients = genotype_merged.loc[has_missing_call, "#IID"]

# Combine all three conditions on the male subset
in_missing_set = male_patients["Arb_ID"].isin(missing_genotype_patients)
filtered_patients = male_patients.loc[age_condition & bmi_3rd_quartile_condition & in_missing_set]

# Report how many patients satisfy every condition
num_patients = filtered_patients.shape[0]
print(f"Number of male patients over 35 with 3rd quartile BMI and missing genotypes: {num_patients}")


Number of male patients over 35 with 3rd quartile BMI and missing genotypes: 0


In [48]:
# Debug check: list every row of the male subset before the age/BMI masks are applied
print(male_patients)

         Arb_ID   Age  Gender        BMI  Type.II.Diabetes
1   34610262659   NaN     1.0  61.033551               1.0
3   68012749485   NaN     1.0  61.980743               1.0
8   37662218447  39.0     1.0  59.214100               0.0
9    7718430122  23.0     1.0  60.845449               NaN
12  23302765165  48.0     1.0  59.833891               1.0
13  11445637833  33.0     1.0  60.064419               0.0
17  21010920759  72.0     1.0        NaN               1.0
20  61755432346   NaN     1.0  59.792605               0.0
24  50061718679   NaN     1.0  60.280996               0.0
28  77669633619   NaN     1.0  59.635639               1.0
31  49262217585  35.0     1.0  60.528419               0.0
34  28397619565  36.0     1.0  61.690349               1.0
38  49800253445  35.0     1.0  61.374589               1.0
39  17045850406  78.0     1.0        NaN               1.0
40  34051766148  47.0     1.0  60.699727               NaN
41  77805252824  62.0     1.0  60.502584               0

In [49]:
# Debug check: rows of the male subset that pass the age mask
print(male_patients[age_condition])

         Arb_ID   Age  Gender        BMI  Type.II.Diabetes
8   37662218447  39.0     1.0  59.214100               0.0
12  23302765165  48.0     1.0  59.833891               1.0
17  21010920759  72.0     1.0        NaN               1.0
34  28397619565  36.0     1.0  61.690349               1.0
39  17045850406  78.0     1.0        NaN               1.0
40  34051766148  47.0     1.0  60.699727               NaN
41  77805252824  62.0     1.0  60.502584               0.0
49  19572843351  61.0     1.0  59.828946               0.0


In [52]:
# Recompute the BMI quartile cut points over all patients, then print the
# three cut points followed by the 75th-percentile value on its own.
bmi_quartiles = phenotypes["BMI"].quantile([0.25, 0.5, 0.75])
third_quartile_bmi = bmi_quartiles.loc[0.75]
print(bmi_quartiles)
print(third_quartile_bmi)

0.25    59.632061
0.50    60.310507
0.75    61.115141
Name: BMI, dtype: float64
61.115140543388904


In [50]:
# Debug check: rows of the male subset that pass the BMI mask
print(male_patients[bmi_3rd_quartile_condition])

         Arb_ID   Age  Gender        BMI  Type.II.Diabetes
3   68012749485   NaN     1.0  61.980743               1.0
34  28397619565  36.0     1.0  61.690349               1.0
38  49800253445  35.0     1.0  61.374589               1.0
48  99526743621  23.0     1.0  61.359909               0.0


In [54]:
# Debug check: rows passing both the age and BMI masks (genotype filter not applied here)
male_patients[age_condition & bmi_3rd_quartile_condition]

Unnamed: 0,Arb_ID,Age,Gender,BMI,Type.II.Diabetes
34,28397619565,36.0,1.0,61.690349,1.0


In [53]:
# Debug check: the sample IDs collected in missing_genotype_patients
print(missing_genotype_patients)

8     11445637833
12    21010920759
29    50061718679
32    59135084867
39    68012749485
41    73387314734
Name: #IID, dtype: int64
