In [1]:
import pandas as pd

In [2]:
def aggregate_grs_scores(base_filename, date, excluded_chromosomes=None):
    """Sum per-chromosome score files into one genome-wide score table.

    For each chromosome 1-22 that is not excluded, the tab-separated file
    ``{base_filename}_chr{N}_{date}.sscore`` is read. The first file that can
    be read becomes the result frame; for every later file the ``SCORE1_SUM``
    and ``ALLELE_CT`` columns are added to it, matching samples by ``IID``.

    All other columns (for example ``SCORE1_AVG``) are carried over unchanged
    from the first file read, so they describe that chromosome only.

    Parameters
    ----------
    base_filename : str
        Path prefix shared by all per-chromosome files.
    date : str
        Date stamp that appears in the file names.
    excluded_chromosomes : iterable of int, optional
        Chromosome numbers to skip. Defaults to skipping none.

    Returns
    -------
    pandas.DataFrame or None
        The aggregated frame, or ``None`` when no file could be read.
        Missing files are reported with ``print`` and skipped.

    Raises
    ------
    ValueError
        If a later file does not contain the same samples as the first one.
    """
    excluded = set(excluded_chromosomes) if excluded_chromosomes is not None else set()
    summed_columns = ['SCORE1_SUM', 'ALLELE_CT']
    total_score = None  # Becomes the frame of the first file that is read

    for chr_num in range(1, 23):
        if chr_num in excluded:
            continue

        filename = f"{base_filename}_chr{chr_num}_{date}.sscore"

        # Only the read itself is allowed to fail with a missing file.
        try:
            current_grs = pd.read_csv(filename, sep='\t')
        except FileNotFoundError:
            print(f"File {filename} not found. Skipping this chromosome.")
            continue

        if total_score is None:
            total_score = current_grs
            continue

        # Put the rows in the same sample order as the running total, then add
        # positionally. A plain `+=` would match on the row index, which is the
        # row position in the file rather than the sample identity.
        aligned = _align_to_reference(current_grs, total_score, filename)
        for column in summed_columns:
            total_score[column] = (
                total_score[column].to_numpy() + aligned[column].to_numpy()
            )

    return total_score


def _align_to_reference(current, reference, filename):
    """Return ``current`` with its rows ordered like ``reference`` (matched on IID)."""
    id_column = 'IID'

    if id_column not in reference.columns or id_column not in current.columns:
        # No sample ID to match on: rows can only be paired by position.
        if len(current) != len(reference):
            raise ValueError(
                f"{filename} has {len(current)} rows but {len(reference)} were expected."
            )
        return current

    reference_ids = reference[id_column].reset_index(drop=True)
    current_ids = current[id_column].reset_index(drop=True)

    # Common case: identical samples in identical order, nothing to do.
    if reference_ids.equals(current_ids):
        return current

    same_samples = (
        reference_ids.is_unique
        and current_ids.is_unique
        and set(reference_ids) == set(current_ids)
    )
    if not same_samples:
        raise ValueError(
            f"{filename} does not contain the same set of unique {id_column} "
            "values as the first score file; cannot sum scores per sample."
        )

    return current.set_index(id_column).loc[reference_ids.to_numpy()].reset_index()


In [3]:
# Aggregate the per-chromosome psoriasis score files stamped with date 020724.
base_filename = 'Psoriasis_GRS'
total_grs_scores = aggregate_grs_scores(base_filename,
                                        date='020724')

# aggregate_grs_scores returns None when none of the files could be read.
# Stop here with a clear message instead of failing later with an
# AttributeError on None.
if total_grs_scores is None:
    raise FileNotFoundError(
        f"No {base_filename}_chr*_020724.sscore files were found; nothing to aggregate."
    )

In [4]:
# Rename the sample ID column to 'n_eid' and the summed score to 'Psoriasis_GRS'
# (modifies total_grs_scores in place).
total_grs_scores.rename(
    columns={'IID': 'n_eid', 'SCORE1_SUM': 'Psoriasis_GRS'},
    inplace=True,
)

In [5]:
def z_normalize_column(df, column_name):
    """Standardise one column of ``df`` to zero mean and unit standard deviation.

    The column is overwritten in ``df`` itself; the same frame object is
    returned for convenience. The standard deviation is the pandas default
    (sample standard deviation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    column_name : str
        Name of the column to standardise.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in.
    """
    values = df[column_name]
    centred = values - values.mean()
    df[column_name] = centred / values.std()
    return df

# Columns to standardise (zero mean, unit sample standard deviation)
cols_to_normalise = ['Psoriasis_GRS']

# Rebind total_grs_scores to the returned frame so the following cells do not
# depend on z_normalize_column modifying its argument in place.
for col in cols_to_normalise:
    total_grs_scores = z_normalize_column(total_grs_scores, col)

# `df` is kept as an alias of the normalised frame in case other cells read it.
# TODO: remove once confirmed unused.
df = total_grs_scores

In [6]:
# Summary statistics of the score column as a sanity check on the normalisation.
total_grs_scores['Psoriasis_GRS'].describe()

count    4.874090e+05
mean     2.681615e-16
std      1.000000e+00
min     -3.736035e+00
25%     -6.836425e-01
50%     -8.658954e-02
75%      5.933268e-01
max      5.474866e+00
Name: Psoriasis_GRS, dtype: float64

In [7]:
# Drop the family ID, allele count, dosage sum and per-allele average columns
# in place; the remaining columns are what gets exported.
# Note: re-running this cell raises KeyError because the columns are already gone.
total_grs_scores.drop(
    columns=['#FID', 'ALLELE_CT', 'NAMED_ALLELE_DOSAGE_SUM', 'SCORE1_AVG'],
    inplace=True,
)

In [8]:
# Write the final table to a Stata .dta file.
# NOTE(review): to_stata writes the DataFrame index as an extra column by
# default (write_index=True) — confirm the downstream Stata code expects it.
# NOTE(review): the output name is stamped 030724 while the score files were
# read with date='020724' — confirm this is intended.
total_grs_scores.to_stata(
    '/slade/home/pl450/Uveitis/GRS/Psoriasis/psoriasis_grs_030724.dta'
)