In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures

In [None]:
# Covariate table (one CSV) and cardiac-index table (another CSV).
demog_df = pd.read_csv("~/01_repos/CardiacGWAS/shiny/data/gwas_covariates_63k.csv")

# Drop the `*_mesh` variants of LVM / LVMVR and expose the `*_vox` variants
# under the plain names `LVM` and `LVMVR`.
cardiac_indices = (
    pd.read_csv("~/01_repos/CardiacSegmentation/data/transforms/LVED_cardiac_indices.csv")
    .drop(columns=["LVMVR_mesh", "LVM_mesh"])
    .rename(columns={"LVM_vox": "LVM", "LVMVR_vox": "LVMVR"})
)

In [None]:
# Join the two tables on whatever columns they share (no explicit key is given,
# so pandas uses every common column; the join is an inner join by default).
df = cardiac_indices.merge(demog_df)

# One-hot encode the categorical covariates; drop="first" omits the first
# category of each column, which therefore acts as the reference level.
columns_to_encode = ["alcohol_intake_freq", "smoking_status", "sex", "imaging_centre"]
onehot = OneHotEncoder(drop="first")
X_onehot = onehot.fit_transform(df[columns_to_encode])
encoded_names = onehot.get_feature_names_out(columns_to_encode)

In [None]:
# Numeric covariates that get a polynomial expansion.
float_cols = ["weight", "height", "bmi", "age", "body_surface_area", "adj_dbp", "adj_sbp"]

# Degree-2 expansion: the original columns, their squares and all pairwise
# products. include_bias=False means no constant column is generated here.
poly = PolynomialFeatures(degree=2, include_bias=False)
continuous = df[float_cols]
X_poly = poly.fit_transform(continuous)
poly_features = poly.get_feature_names_out()

In [None]:
# Response variables: one regression is fitted per entry.
phenotypes = [
    "LVEDV",
    "LVEDSph",
    "LVM",
    "LVMVR",
    "RVEDV",
]

In [None]:
# One-hot column names whose name contains "imaging".
imaging_centre_cols = [name for name in encoded_names if "imaging" in name]

In [None]:
# Display the one-hot column names that were selected for containing "imaging".
imaging_centre_cols

In [None]:
# Assemble the modelling table: polynomial covariates, one-hot covariates and
# the phenotype columns, side by side.
#
# pd.concat(axis=1) aligns on the index, so the two freshly built frames are
# given df's index explicitly. Without that they get a default RangeIndex and
# would be silently misaligned / NaN-padded if df's index were anything else.
# .toarray() returns a plain ndarray (.todense() returns the legacy np.matrix).
df = pd.concat(
    [
        pd.DataFrame(X_poly, columns=poly_features, index=df.index),
        pd.DataFrame(X_onehot.toarray(), columns=encoded_names, index=df.index),
        df[phenotypes],
    ],
    axis=1,
)

In [None]:
# Regressor names: every polynomial feature followed by every one-hot column.
independent_vars = [*poly_features.tolist(), *encoded_names.tolist()]

In [None]:
# Columns to leave out of the design matrix when the per-phenotype models are fitted.
# NOTE(review): the following cell reassigns `variables_to_rm`, so when the
# notebook is run top to bottom this value is never used — confirm which is intended.
variables_to_rm = [ "sex_MALE" ]

In [None]:
# Leave the imaging-centre one-hot columns out of the design matrix.
# NOTE(review): this replaces the `[ "sex_MALE" ]` assigned in the previous cell,
# and it is an alias of `imaging_centre_cols`, not a copy.
variables_to_rm = imaging_centre_cols

In [None]:
# Design matrix: all polynomial and one-hot covariates plus an intercept column.
X = sm.add_constant(df[independent_vars])

# The excluded columns do not depend on the phenotype, so the reduced design
# matrix is built once here instead of on every loop iteration.
X_ = X.drop(variables_to_rm, axis=1)

# phenotype name -> fitted OLS results, and phenotype name -> residuals.
results, residues = {}, {}

for phenotype in phenotypes:
    y = np.asarray(df[phenotype])

    model = sm.OLS(y, X_)
    results[phenotype] = model.fit()

    # Residual = observed phenotype minus the model's in-sample prediction.
    yhat = results[phenotype].predict(X_)
    residues[phenotype] = y - yhat

# To access the results
# for phenotype, result in results.items():
#    print(f"Results for {phenotype}:\n", result.summary(), "\n")

In [None]:
# Mean RVEDV within each level of the sex_MALE indicator.
df["RVEDV"].groupby(df["sex_MALE"]).mean()

In [None]:
# Univariate check: LVEDSph regressed on bmi, with an intercept.
bmi_design = sm.add_constant(df["bmi"])
kk = sm.OLS(df["LVEDSph"], bmi_design).fit()
print(kk.pvalues)
kk.summary()

In [None]:
import seaborn as sns

In [None]:
# Histogram of LVEDV (trailing semicolon hides the Axes repr).
df["LVEDV"].hist();

In [None]:
# One LVEDV box per distinct value of `age`.
sns.boxplot(x="age", y="LVEDV", data=df);

In [None]:
# LVEDV ~ const + age + age*sex_MALE.
# The product Series would otherwise inherit the name "age", giving the design
# matrix two columns called "age" and two indistinguishable rows in the
# summary; name it explicitly so each coefficient can be identified.
# NOTE(review): the model has an age-by-sex term but no sex_MALE main effect —
# confirm that is intended.
age_by_sex = (df["age"] * df["sex_MALE"]).rename("age:sex_MALE")
design = sm.add_constant(pd.concat([df["age"], age_by_sex], axis=1))
model = sm.OLS(df["LVEDV"], design)
kk = model.fit()
kk.summary()

In [None]:
# Scratch arithmetic.
# NOTE(review): -0.9552 looks like a coefficient copied by hand from a model
# summary and 7.5 like a span of years; neither is defined anywhere in the
# notebook — confirm, and derive them from named values instead.
-0.9552 * 7.5

In [None]:
# LVEDV ~ const + age, restricted to rows with sex_MALE == 1.
# statsmodels' OLS does not add an intercept by itself; without add_constant
# this would be a regression through the origin, unlike the other models in
# this notebook, and its slope, p-values and R^2 would not be comparable.
males = df.query("sex_MALE == 1")
kk = sm.OLS(males["LVEDV"], sm.add_constant(males["age"])).fit()
print(kk.pvalues)
kk.summary()