In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import sys
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
import datetime
from sklearn.metrics import roc_auc_score
import seaborn as sns
sns.set_style()
import pandas as pd
%matplotlib inline

In [None]:
# Read the split-assignment file and keep only the join key and the split label.
datasplit_df = pd.read_csv(
    'data/processed/train_test_valid_id_split.csv'
)[['videoid', 'dataset']]
# Convert every videoid to int before it is used as the merge key.
datasplit_df['videoid'] = datasplit_df['videoid'].map(int)
# Inner join: rows of the processed table without a split assignment are dropped.
alldata_processed = pd.merge(
    pd.read_csv("./data/processed/alldata_processed.csv"),
    datasplit_df,
    how='inner',
    on=['videoid'],
)

In [None]:
# Re-read the processed table from disk; this replaces whatever `alldata_processed`
# held before this cell ran.
alldata_processed = pd.read_csv(
    filepath_or_buffer="./data/processed/alldata_processed.csv"
)

In [None]:
# Attach the `dataset` split label to every row and add the regression intercept column.
datasplit_df = pd.read_csv('data/processed/train_test_valid_id_split.csv')[['videoid','dataset']]
datasplit_df['videoid'] = datasplit_df['videoid'].apply(int)
# Drop any `dataset`/`const` columns left over from a previous run of this cell first.
# Without this, re-running the cell makes merge() emit `dataset_x`/`dataset_y` and every
# later `alldata_processed['dataset']` lookup fails with a KeyError.
alldata_processed = (
    alldata_processed
    .drop(columns=['dataset', 'const'], errors='ignore')
    .merge(right=datasplit_df, on=['videoid'], how='inner')
)
# Constant column used as the intercept term in the regressions fitted later.
alldata_processed['const'] = 1

In [None]:
# Cap age to the range [0, 20] years. Per the original author's note, the relationship
# between age and SEMLS beyond 20 years is non-linear and noisier.
alldata_processed['age_truncated'] = alldata_processed['age'].clip(lower=0, upper=20)

In [None]:
#fill in missing mass and height with regression predictions
#(original author's note: there are only 6 instances where these values are missing)
def _predict_from_train_ols(frame, target_col, feature_cols):
    """Fit OLS of `target_col` on `feature_cols` using training rows; predict for all rows.

    Parameters
    ----------
    frame : DataFrame containing `target_col`, `feature_cols` and a `dataset` column.
    target_col : name of the response column.
    feature_cols : list of regressor column names (include "const" for an intercept).

    Returns
    -------
    Array of fitted values, one per row of `frame` (not only the training rows).
    """
    # Only rows labelled 'train' are used to estimate the coefficients.
    train_rows = frame[frame['dataset'] == 'train']
    design = train_rows[feature_cols].values
    response = train_rows[target_col].values.reshape(-1,1)
    # missing='drop' makes statsmodels discard training rows containing NaNs before fitting.
    fitted = OLS(response, design, missing='drop').fit()
    return fitted.predict(frame[feature_cols].values)


#mass: regress on truncated age, then keep the observed value wherever one exists
alldata_processed["predicted_mass"] = _predict_from_train_ols(
    alldata_processed, "mass", ["age_truncated","const"])
alldata_processed["mass_interpolated"] = np.where(
    alldata_processed["mass"].isnull(),
    alldata_processed["predicted_mass"],
    alldata_processed["mass"])

# NOTE(review): np.log returns NaN/-inf if a filled-in mass is <= 0 -- confirm the
# predictions stay positive for the rows that were missing.
alldata_processed["log_interpolated_mass"] = np.log(alldata_processed["mass_interpolated"])

#height: regress on log(mass), then keep the observed value wherever one exists
alldata_processed["predicted_height"] = _predict_from_train_ols(
    alldata_processed, "height", ["log_interpolated_mass","const"])
alldata_processed["height_interpolated"] = np.where(
    alldata_processed["height"].isnull(),
    alldata_processed["predicted_height"],
    alldata_processed["height"])

In [None]:
# Coarse bins for the graphs: round each value down to its bin edge, then clamp the
# edge into a fixed range so sparse extremes share one bin.
# mass: 5-wide bins, clamped to [1, 80] (edges below 1 are raised to 1)
alldata_processed['mass_buckets'] = (
    np.floor(alldata_processed['mass_interpolated'] / 5).mul(5).clip(lower=1, upper=80)
)

# height: 10-wide bins, clamped to [90, 180]
alldata_processed['height_buckets'] = (
    np.floor(alldata_processed['height_interpolated'] / 10).mul(10).clip(lower=90, upper=180)
)

# age: whole-number bins from the untruncated age, clamped to [0, 20]
alldata_processed['age_buckets'] = (
    np.floor(alldata_processed['age']).clip(lower=0, upper=20)
)

In [None]:
# Mean SEMLS per height bucket (seaborn point plot), training rows only.
sns.pointplot(
    data=alldata_processed.loc[alldata_processed['dataset'] == 'train'],
    x='height_buckets',
    y='SEMLS',
)

In [None]:
# Mean SEMLS per mass bucket (seaborn point plot), training rows only.
sns.pointplot(
    data=alldata_processed.loc[alldata_processed['dataset'] == 'train'],
    x='mass_buckets',
    y='SEMLS',
)

In [None]:
# Mean SEMLS per age bucket (seaborn point plot), training rows only.
sns.pointplot(
    data=alldata_processed.loc[alldata_processed['dataset'] == 'train'],
    x='age_buckets',
    y='SEMLS',
)

In [None]:
# Quadratic terms for the regression, each divided by 100 to rescale the squared values.
alldata_processed['mass_interpolated2'] = (
    np.square(alldata_processed['mass_interpolated']).div(100)
)
alldata_processed['age_truncated2'] = (
    np.square(alldata_processed['age_truncated']).div(100)
)
alldata_processed['height_interpolated2'] = (
    np.square(alldata_processed['height_interpolated']).div(100)
)

In [None]:
# OLS and add_constant are already imported in the first cell; only Logit is new here.
from statsmodels.discrete.discrete_model import Logit

# Regressors: linear and squared terms for mass, age and height, the
# isPostSurgGaitVisit column, and the constant column as intercept.
Xcols = ["mass_interpolated","mass_interpolated2","age_truncated2","age_truncated",
         "height_interpolated","height_interpolated2",
         "isPostSurgGaitVisit","const"]

# Select the training rows once and reuse them for both the design matrix and the target.
train_df = alldata_processed[alldata_processed['dataset'] == 'train']
X_train = train_df[Xcols]
y_train = train_df["SEMLS"]

# Full-table arrays (all splits). `X`, `y` and `lm` keep their original names because
# later cells read them.
X = alldata_processed[Xcols].values
y = alldata_processed["SEMLS"].values

# Coefficients are estimated on training rows only...
lm = Logit(y_train,X_train).fit()

# ...and then used to predict a probability for every row of the table.
alldata_processed['predicted_SEMLS'] = lm.predict(X)

In [None]:
# Display the statsmodels summary (summary2 format) of the Logit model fitted above.
lm.summary2()

In [None]:
def compute_deviance_residual(y,predicted_prob):
    """Signed deviance residual of a binary outcome given predicted probabilities.

    For each element the residual is ``+sqrt(-2*log(p))`` when ``y == 1`` and
    ``-sqrt(-2*log(1 - p))`` otherwise, where ``p`` is ``predicted_prob``.

    Parameters
    ----------
    y : array-like of observed outcomes; elements equal to 1 are treated as
        positives, everything else as negatives.
    predicted_prob : array-like of predicted probabilities of ``y == 1``,
        broadcastable against ``y``.

    Returns
    -------
    numpy.ndarray of residuals with the broadcast shape of the inputs.
    """
    # Coerce up front so plain lists work; `[1, 0] == 1` on a list is a scalar False.
    y = np.asarray(y)
    predicted_prob = np.asarray(predicted_prob, dtype=float)
    is_positive = (y == 1)
    # Probability the model assigned to the class that was actually observed.
    # Taking the log of this single quantity avoids evaluating log(0) on the branch
    # np.where would discard anyway (e.g. y == 1 with p == 1), which used to raise a
    # spurious divide-by-zero RuntimeWarning.
    prob_observed = np.where(is_positive, predicted_prob, 1.0 - predicted_prob)
    magnitude = np.sqrt(-2.0*np.log(prob_observed))
    return np.where(is_positive, magnitude, -magnitude)

In [None]:
# ROC AUC of the predicted probabilities on the training rows.
roc_auc_score(
    y_true=alldata_processed.loc[alldata_processed['dataset'] == 'train', 'SEMLS'],
    y_score=alldata_processed.loc[alldata_processed['dataset'] == 'train', 'predicted_SEMLS'],
)

In [None]:
# ROC AUC of the predicted probabilities on the validation rows.
roc_auc_score(
    y_true=alldata_processed.loc[alldata_processed['dataset'] == 'validation', 'SEMLS'],
    y_score=alldata_processed.loc[alldata_processed['dataset'] == 'validation', 'predicted_SEMLS'],
)

In [None]:
# Deviance residual of SEMLS for every row (all splits).
# Both inputs are read from the frame's own columns instead of the notebook globals
# `y`, `X` and `lm`. Earlier cells rebind those names to other data, so using them here
# breaks as soon as a cell is re-run out of order. The probabilities are the ones
# already stored in `predicted_SEMLS`, so nothing is predicted a second time.
alldata_processed['SEMLS_dev_residual'] = compute_deviance_residual(
    alldata_processed['SEMLS'].values,
    alldata_processed['predicted_SEMLS'].values,
)

In [None]:
# Persist the table, including the derived columns added in this notebook, without
# writing the index as a column.
alldata_processed.to_csv(
    path_or_buf="./data/processed/alldata_processed_with_dev_residual.csv",
    index=False,
)

In [None]:
# Standard deviation of the deviance residuals over all rows of the table (every split).
alldata_processed['SEMLS_dev_residual'].std()