In [21]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import pickle
from utils.data_process import mean_normalize

# Final Model Predictions

In [3]:
# Directory with the raw input files; the test set CSV is read from here.
raw_data_dir = '../data/raw/'
test_set = pd.read_csv(os.path.join(raw_data_dir, '38_sick_test.csv'))
test_set.head()

Unnamed: 0,d3mIndex,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,Class
0,1,23.0,F,f,f,f,f,f,f,f,...,t,102.0,f,,f,,f,,other,
1,4,70.0,F,f,f,f,f,f,f,f,...,t,61.0,t,0.87,t,70.0,f,,SVI,
2,7,80.0,F,f,f,f,f,f,f,f,...,t,80.0,t,0.7,t,115.0,f,,SVI,
3,12,71.0,F,f,f,f,t,f,f,f,...,t,171.0,t,1.13,t,151.0,f,,other,
4,17,63.0,F,f,f,f,f,f,f,f,...,t,117.0,t,0.96,t,121.0,f,,SVI,


In [4]:
# Check Sparsity of data
# Count the NaNs of every column in one pass, then report only the columns
# that actually have missing entries (count, and share of all rows in percent).
missing_counts = test_set.isna().sum()
total_rows = test_set.shape[0]
for col, n_missing in missing_counts[missing_counts > 0].items():
    print('{} - dtype: {}'.format(col, test_set.dtypes.loc[col]))
    print("missing: {}".format(n_missing))
    print("percent: {:1.4f}".format((n_missing/total_rows) * 100))

sex - dtype: object
missing: 25
percent: 3.3156
TSH - dtype: float64
missing: 74
percent: 9.8143
T3 - dtype: float64
missing: 165
percent: 21.8833
TT4 - dtype: float64
missing: 47
percent: 6.2334
T4U - dtype: float64
missing: 61
percent: 8.0902
FTI - dtype: float64
missing: 61
percent: 8.0902
TBG - dtype: float64
missing: 754
percent: 100.0000
Class - dtype: float64
missing: 754
percent: 100.0000


In [7]:
# Columns used as model inputs. The same list is applied to the training
# frame later, so it also fixes the column order seen by the classifier.
features = ['FTI', 'T3', 'TT4', 'TSH']
test_set_filered = test_set.loc[:, features]
test_set_filered.head()

Unnamed: 0,FTI,T3,TT4,TSH
0,,2.0,102.0,4.1
1,70.0,1.2,61.0,0.72
2,115.0,0.6,80.0,2.2
3,151.0,3.8,171.0,0.03
4,121.0,1.2,117.0,1.5


In [9]:
# Normalize while keeping NaN values.
# mean_normalize comes from utils.data_process; only the test feature frame is
# passed in.
# NOTE(review): the training features are loaded already normalized from disk -
# confirm that both sets are scaled with the same statistics.
normalized_values = test_set_filered.pipe(mean_normalize)
normalized_values.head()

Unnamed: 0,FTI,T3,TT4,TSH
0,,-0.023526,-0.186567,0.052316
1,-1.273653,-0.970981,-1.346836,-0.242529
2,0.159052,-1.681573,-0.80915,-0.113425
3,1.305215,2.10825,1.766081,-0.302719
4,0.350079,-0.970981,0.237922,-0.174488


# Load training data

In [12]:
# Directory holding the intermediate train/dev split files.
temp_data_dir = '../data/temp/'
# List the temp directory itself (not the raw one) to see which split files
# are available to load.
for i, k in enumerate(os.listdir(temp_data_dir)):
    print(i, k)

0 X_dev.knn.normalized.csv
1 .DS_Store
2 X_dev.csv
3 y_dev.csv
4 X_train.knn.normalized.csv
5 X_train.zeros.csv
6 X_train.pca.knn.normalized.csv
7 X_dev.zeros.csv
8 X_dev.pca.knn.normalized.csv
9 X_dev.normalized.csv
10 X_train.csv
11 X_train.normalized.csv
12 y_train.csv


In [13]:
# Read the four split files from the temp directory in a fixed order; each one
# uses its first CSV column as the row index.
X_train, y_train, X_dev, y_dev = (
    pd.read_csv(os.path.join(temp_data_dir, split_file), index_col=0)
    for split_file in (
        'X_train.normalized.csv',
        'y_train.csv',
        'X_dev.normalized.csv',
        'y_dev.csv',
    )
)

In [14]:
# Preview the first rows of the training features read from X_train.normalized.csv.
X_train.head()

Unnamed: 0,age,M,F,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,TSH,T3,TT4,T4U,FTI,STMW,SVHC,SVHD,SVI,other
1753,0.499352,1,0,0,0,0,0,0,0,0,...,-0.155552,,-0.436236,-0.276865,-0.381828,0,0,0,1,0
1754,-0.712752,1,0,0,0,0,0,0,0,0,...,-0.193954,-0.629861,-0.055767,-1.038689,0.731375,0,0,0,1,0
1050,-0.765452,1,0,0,0,0,0,0,0,0,...,-0.188718,-0.022597,-0.728905,0.7389,-1.185807,0,0,0,0,1
439,-1.134354,0,1,0,0,0,0,0,0,0,...,-0.131114,,-0.289902,-0.022924,-0.350905,0,0,0,0,1
2538,1.079055,0,1,0,0,0,0,0,0,0,...,-0.14857,-0.751314,0.207635,-0.124501,0.236619,0,1,0,0,0


In [15]:
# Preview the first rows of the dev features read from X_dev.normalized.csv.
X_dev.head()

Unnamed: 0,age,M,F,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,TSH,T3,TT4,T4U,FTI,STMW,SVHC,SVHD,SVI,other
2916,-0.732342,0,1,0,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
1812,-1.098253,0,1,0,0,0,0,0,0,0,...,-0.079667,,-0.74228,-0.247995,-0.75295,0,0,0,0,1
877,0.156299,0,1,1,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
973,0.574483,0,1,0,0,0,0,0,0,0,...,-0.23524,-0.223618,-0.063958,-0.747758,0.379719,0,0,0,0,1
1060,-1.202799,1,0,0,0,0,0,0,0,0,...,-0.193639,2.201121,0.420558,0.701553,0.010943,0,1,0,0,0


In [16]:
# Join dev set and train set 
X_train_full = pd.concat([X_train, X_dev])
print(X_train_full.shape)
X_train_full.head()

(3017, 27)


Unnamed: 0,age,M,F,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,...,TSH,T3,TT4,T4U,FTI,STMW,SVHC,SVHD,SVI,other
1753,0.499352,1,0,0,0,0,0,0,0,0,...,-0.155552,,-0.436236,-0.276865,-0.381828,0,0,0,1,0
1754,-0.712752,1,0,0,0,0,0,0,0,0,...,-0.193954,-0.629861,-0.055767,-1.038689,0.731375,0,0,0,1,0
1050,-0.765452,1,0,0,0,0,0,0,0,0,...,-0.188718,-0.022597,-0.728905,0.7389,-1.185807,0,0,0,0,1
439,-1.134354,0,1,0,0,0,0,0,0,0,...,-0.131114,,-0.289902,-0.022924,-0.350905,0,0,0,0,1
2538,1.079055,0,1,0,0,0,0,0,0,0,...,-0.14857,-0.751314,0.207635,-0.124501,0.236619,0,1,0,0,0


In [22]:
# Keep only the model input columns, in the order given by `features`, so the
# training frame lines up with the test feature frame.
X_train_full = X_train_full.loc[:, features]
X_train_full.head()

Unnamed: 0,FTI,T3,TT4,TSH
1753,-0.381828,,-0.436236,-0.155552
1754,0.731375,-0.629861,-0.055767,-0.193954
1050,-1.185807,-0.022597,-0.728905,-0.188718
439,-0.350905,,-0.289902,-0.131114
2538,0.236619,-0.751314,0.207635,-0.14857


In [23]:
# Labels for train followed by dev - the same order used when the feature
# frames were concatenated.
y_train_full = pd.concat([y_train, y_dev], axis=0)
print(y_train_full.shape)
y_train_full.head()

(3017, 1)


Unnamed: 0,Class
1753,negative
1754,negative
1050,negative
439,negative
2538,negative


In [24]:
# Structure data for Xgboost
# Pull the 'Class' column out of the label frame as a flat array for fitting.
# This rebinds y_train_full from a DataFrame to an array; the guard keeps the
# cell re-runnable, because on a second run the name already holds the array
# and indexing it with 'Class' would raise.
if isinstance(y_train_full, pd.DataFrame):
    y_train_full = y_train_full['Class'].values

In [26]:
# Train Classifier
# Boosted-tree classifier fit on the combined train + dev rows, restricted to
# the columns listed in `features`.
model = xgb.XGBClassifier(objective='binary:logistic',
                          n_estimators=40,
                          max_depth=4,
                          learning_rate=0.1,
                          gamma=2,
                          colsample_bytree=0.6)

model.fit(X_train_full, y_train_full)

# save model to file
# The context manager closes the file handle even if pickling raises, so the
# pickle is fully flushed to disk before the notebook moves on.
with open("../data/output/thyroid_classifier.p", "wb") as model_file:
    pickle.dump(model, model_file)

# Make Predictions
# normalized_values carries the same feature columns, in the same order, as
# the frame the model was fit on.
predictions = model.predict(normalized_values)

In [34]:
# Wrap the predicted labels in a single-column frame (default integer index)
# so they can be inspected and written out.
output = pd.DataFrame(data=predictions)
output.head()

Unnamed: 0,0
0,negative
1,negative
2,sick
3,negative
4,negative


In [38]:
# Write the predictions without a header row; each line is "<index>,<label>".
# NOTE(review): the index written here is the frame's default row position,
# not the test set's d3mIndex column - confirm which one the consumer expects.
output.to_csv('../data/output/84887758_predictions.csv', header=False)