In [1]:
%matplotlib inline
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.impute import KNNImputer

np.random.seed(109)

In [2]:
X_data = pd.read_csv('NHANESI_X.csv')
y_data = pd.read_csv('NHANESI_y.csv')

X_data = X_data.drop('Unnamed: 0', axis = 1)
y_data = y_data.drop('Unnamed: 0', axis = 1)

In [3]:
X_data.head()


Unnamed: 0,Age,Diastolic BP,Poverty index,Race,Red blood cells,Sedimentation rate,Serum Albumin,Serum Cholesterol,Serum Iron,Serum Magnesium,Serum Protein,Sex,Systolic BP,TIBC,TS,White blood cells,BMI,Pulse pressure
0,35.0,92.0,126.0,2.0,77.7,12.0,5.0,165.0,135.0,1.37,7.6,2.0,142.0,323.0,41.8,5.8,31.109434,50.0
1,71.0,78.0,210.0,2.0,77.7,37.0,4.0,298.0,89.0,1.38,6.4,2.0,156.0,331.0,26.9,5.3,32.362572,78.0
2,74.0,86.0,999.0,2.0,77.7,31.0,3.8,222.0,115.0,1.37,7.4,2.0,170.0,299.0,38.5,8.1,25.388497,84.0
3,64.0,92.0,385.0,1.0,77.7,30.0,4.3,265.0,94.0,1.97,7.3,2.0,172.0,349.0,26.9,6.7,26.44661,80.0
4,32.0,70.0,183.0,2.0,77.7,18.0,5.0,203.0,192.0,1.35,7.3,1.0,128.0,386.0,49.7,8.1,20.354684,58.0


In [4]:
y_data.head()

Unnamed: 0,y
0,15.274658
1,11.586073
2,8.149087
3,-21.094292
4,-0.0


In [5]:
print("X_data # of null values for each predictor:")
print(X_data[X_data.isnull().any(axis=1)].count())
print("\ny_data # of null values for the prediction:")
print(y_data[y_data.isnull().any(axis=1)].count())

print(X_data.columns)
print(y_data.columns)



X_data # of null values for each predictor:
Age                   1339
Diastolic BP          1281
Poverty index         1339
Race                  1339
Red blood cells       1339
Sedimentation rate     513
Serum Albumin         1339
Serum Cholesterol     1339
Serum Iron            1339
Serum Magnesium       1339
Serum Protein         1339
Sex                   1339
Systolic BP           1282
TIBC                  1339
TS                    1339
White blood cells      298
BMI                   1339
Pulse pressure        1280
dtype: int64

y_data # of null values for the prediction:
y    0
dtype: int64
Index(['Age', 'Diastolic BP', 'Poverty index', 'Race', 'Red blood cells',
       'Sedimentation rate', 'Serum Albumin', 'Serum Cholesterol',
       'Serum Iron', 'Serum Magnesium', 'Serum Protein', 'Sex', 'Systolic BP',
       'TIBC', 'TS', 'White blood cells', 'BMI', 'Pulse pressure'],
      dtype='object')
Index(['y'], dtype='object')


In [6]:
#X_data_dropped = X_data.dropna()
#y_data_dropped = y_data.dropna()

#print(combined_dropped.shape)


#knn imputer.fit
#choose number of transmitter


In [7]:
imputer = KNNImputer(n_neighbors=1)
imputer_result = imputer.fit_transform(X_data)
print(type(imputer_result))
combined = pd.concat([X_data, y_data], axis = 1)
print(combined.shape)
combined_dropped = combined.dropna()

# turn it into a dataframe, rename columns
X_data_imputed = pd.DataFrame(imputer_result, columns = X_data.columns)
X_data_imputed.head()

<class 'numpy.ndarray'>
(9932, 19)


Unnamed: 0,Age,Diastolic BP,Poverty index,Race,Red blood cells,Sedimentation rate,Serum Albumin,Serum Cholesterol,Serum Iron,Serum Magnesium,Serum Protein,Sex,Systolic BP,TIBC,TS,White blood cells,BMI,Pulse pressure
0,35.0,92.0,126.0,2.0,77.7,12.0,5.0,165.0,135.0,1.37,7.6,2.0,142.0,323.0,41.8,5.8,31.109434,50.0
1,71.0,78.0,210.0,2.0,77.7,37.0,4.0,298.0,89.0,1.38,6.4,2.0,156.0,331.0,26.9,5.3,32.362572,78.0
2,74.0,86.0,999.0,2.0,77.7,31.0,3.8,222.0,115.0,1.37,7.4,2.0,170.0,299.0,38.5,8.1,25.388497,84.0
3,64.0,92.0,385.0,1.0,77.7,30.0,4.3,265.0,94.0,1.97,7.3,2.0,172.0,349.0,26.9,6.7,26.44661,80.0
4,32.0,70.0,183.0,2.0,77.7,18.0,5.0,203.0,192.0,1.35,7.3,1.0,128.0,386.0,49.7,8.1,20.354684,58.0


In [8]:
#linear regresion, knn imputation, mean imputation, then transform the dataset 

# split beforehand

# baseline with drop model
# 2nd model, with imputation ()
X_data_dropped = combined_dropped[['Age', 'Diastolic BP', 'Poverty index', 'Race', 'Red blood cells',
       'Sedimentation rate', 'Serum Albumin', 'Serum Cholesterol',
       'Serum Iron', 'Serum Magnesium', 'Serum Protein', 'Sex', 'Systolic BP',
       'TIBC', 'TS', 'White blood cells', 'BMI', 'Pulse pressure']]
y_data_dropped = combined_dropped[['y']]

print(X_data_dropped.shape)
print(y_data_dropped.shape)


# 

(8593, 18)
(8593, 1)


In [9]:
#Check data types of columns and recast if needed
print(X_data.dtypes)
print(y_data.dtypes)
print("All columns are floats, which means we are good to go!")

Age                   float64
Diastolic BP          float64
Poverty index         float64
Race                  float64
Red blood cells       float64
Sedimentation rate    float64
Serum Albumin         float64
Serum Cholesterol     float64
Serum Iron            float64
Serum Magnesium       float64
Serum Protein         float64
Sex                   float64
Systolic BP           float64
TIBC                  float64
TS                    float64
White blood cells     float64
BMI                   float64
Pulse pressure        float64
dtype: object
y    float64
dtype: object
All columns are floats, which means we are good to go!


In [10]:
# See groupby statistics and see any problems
X_train, X_test, y_train, y_test = train_test_split(X_data_dropped,y_data_dropped, test_size =0.2,random_state=109)


# groupby for demographics
X_train[['Age','Poverty index','Race','Sex','BMI']].describe()


Unnamed: 0,Age,Poverty index,Race,Sex,BMI
count,6874.0,6874.0,6874.0,6874.0,6874.0
mean,49.320919,289.161914,1.1897,1.613326,25.641083
std,15.884999,222.026227,0.421078,0.487023,5.106703
min,25.0,2.0,1.0,1.0,12.585333
25%,35.0,135.0,1.0,1.0,22.108762
50%,48.0,233.0,1.0,2.0,24.93623
75%,66.0,373.0,1.0,2.0,28.312195
max,74.0,999.0,3.0,2.0,58.818811


Here, we see a problem with Sex and Race the minimum is 1 and the max is 3 or 2 respectively, so we need to change that!

In [11]:
X_train['Sex'] = X_train['Sex'].subtract(1)
X_test['Sex'] = X_test['Sex'].subtract(1)

X_train['Race'] = X_train['Race'].subtract(1)
X_test['Race'] = X_test['Race'].subtract(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [12]:
X_train.head()


Unnamed: 0,Age,Diastolic BP,Poverty index,Race,Red blood cells,Sedimentation rate,Serum Albumin,Serum Cholesterol,Serum Iron,Serum Magnesium,Serum Protein,Sex,Systolic BP,TIBC,TS,White blood cells,BMI,Pulse pressure
5684,71.0,116.0,75.0,1.0,44.0,11.0,4.0,237.0,57.0,1.6,7.4,1.0,200.0,329.0,17.3,8.7,22.854203,84.0
5510,33.0,80.0,292.0,0.0,77.7,8.0,4.7,216.0,102.0,1.7,6.9,1.0,112.0,471.0,21.7,8.6,22.255359,32.0
7912,69.0,100.0,316.0,0.0,77.7,26.0,4.2,197.0,65.0,1.49,7.5,0.0,165.0,298.0,21.8,8.8,22.129018,65.0
7599,35.0,66.0,384.0,0.0,77.7,14.0,4.3,251.0,116.0,1.78,6.6,1.0,114.0,325.0,35.7,8.3,21.032693,48.0
213,68.0,84.0,217.0,0.0,77.7,4.0,4.4,219.0,93.0,1.87,6.4,1.0,160.0,299.0,31.1,4.5,18.772688,76.0


In [13]:
# Statistics of blood information
X_train[['Diastolic BP','Red blood cells','Sedimentation rate','Systolic BP','TIBC','Pulse pressure','TS','White blood cells']].describe()

Unnamed: 0,Diastolic BP,Red blood cells,Sedimentation rate,Systolic BP,TIBC,Pulse pressure,TS,White blood cells
count,6874.0,6874.0,6874.0,6874.0,6874.0,6874.0,6874.0,6874.0
mean,83.249054,51.988667,16.013529,134.626564,363.351178,51.377509,28.496872,7.47895
std,13.30393,11.805827,11.414022,24.902325,58.728982,18.262595,11.214908,2.352901
min,38.0,29.3,1.0,80.0,196.0,10.0,3.2,2.1
25%,74.0,44.7,7.0,118.0,323.0,40.0,21.0,6.0
50%,82.0,48.2,14.0,130.0,356.0,48.0,27.2,7.2
75%,90.0,53.2,22.0,150.0,397.0,60.0,34.5,8.6
max,180.0,88.8,72.0,270.0,717.0,150.0,100.0,56.0


In [14]:
# Statistics of Serum contents
X_train[['Serum Albumin', 'Serum Cholesterol',
       'Serum Iron', 'Serum Magnesium', 'Serum Protein']].describe()

Unnamed: 0,Serum Albumin,Serum Cholesterol,Serum Iron,Serum Magnesium,Serum Protein
count,6874.0,6874.0,6874.0,6874.0,6874.0
mean,4.370338,221.872141,101.170643,1.682633,7.108859
std,0.331533,50.111946,37.473088,0.14626,0.5066
min,2.7,53.0,17.0,0.82,4.8
25%,4.2,187.0,75.0,1.59,6.8
50%,4.4,217.0,96.0,1.68,7.1
75%,4.6,251.0,122.0,1.77,7.4
max,5.6,793.0,396.0,2.7,11.5


In [15]:
# Groupby sex then describe demographics, blood, and serum levels:
X_train.groupby(by='Sex')[['Age','Poverty index']].describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0.0,2658.0,52.360045,15.529646,25.0,39.0,54.0,67.0,74.0,2658.0,304.525959,228.038992,2.0,148.25,250.0,385.0,999.0
1.0,4216.0,47.404886,15.809286,25.0,33.0,44.0,65.0,74.0,4216.0,279.475569,217.620715,5.0,128.0,221.0,357.0,999.0


In [16]:
X_train.groupby(by='Sex')[['Race','BMI']].describe()

Unnamed: 0_level_0,Race,Race,Race,Race,Race,Race,Race,Race,BMI,BMI,BMI,BMI,BMI,BMI,BMI,BMI
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0.0,2658.0,0.179458,0.41762,0.0,0.0,0.0,0.0,2.0,2658.0,25.667069,4.14776,14.779363,22.894745,25.502982,27.855563,49.645994
1.0,4216.0,0.196157,0.423166,0.0,0.0,0.0,0.0,2.0,4216.0,25.624701,5.62833,12.585333,21.651138,24.441123,28.621443,58.818811


In [21]:
X_train.groupby(by='Sex')[['Diastolic BP','Red blood cells','Sedimentation rate','Systolic BP','TIBC','Pulse pressure','TS','White blood cells']].describe()

Unnamed: 0_level_0,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Red blood cells,Red blood cells,...,TS,TS,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0.0,2658.0,85.5,12.984495,48.0,78.0,84.0,92.0,180.0,2658.0,54.167871,...,36.5,100.0,2658.0,7.499059,2.211513,2.9,6.0,7.3,8.7,51.2
1.0,4216.0,81.829934,13.308664,38.0,72.0,80.0,90.0,170.0,4216.0,50.614777,...,33.0,97.0,4216.0,7.466271,2.438002,2.1,6.0,7.2,8.525,56.0


In [None]:
X_train.groupby(by='Sex')[['Serum Albumin', 'Serum Cholesterol',
       'Serum Iron', 'Serum Magnesium', 'Serum Protein']].describe()

In [19]:
X_train.groupby(by='Race')[['Age','Poverty index']].describe()

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index,Poverty index
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Race,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0.0,5651.0,49.375155,15.932727,25.0,35.0,48.0,66.0,74.0,5651.0,305.542205,218.249165,2.0,156.0,254.0,385.0,999.0
1.0,1142.0,49.337128,15.726497,25.0,35.0,47.0,66.0,74.0,1142.0,206.995622,223.12395,5.0,74.0,129.0,250.0,999.0
2.0,81.0,45.308642,14.364576,25.0,35.0,41.0,57.0,73.0,81.0,304.82716,213.138558,13.0,168.0,257.0,373.0,999.0


In [20]:
X_train.groupby(by='Race')[['Sex','BMI']].describe()

Unnamed: 0_level_0,Sex,Sex,Sex,Sex,Sex,Sex,Sex,Sex,BMI,BMI,BMI,BMI,BMI,BMI,BMI,BMI
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Race,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0.0,5651.0,0.60768,0.488311,0.0,0.0,1.0,1.0,1.0,5651.0,25.360962,4.79948,12.585333,22.035736,24.72821,27.864672,54.276992
1.0,1142.0,0.645359,0.478614,0.0,0.0,1.0,1.0,1.0,1142.0,27.146299,6.250685,15.06235,22.811234,26.185541,30.434025,58.818811
2.0,81.0,0.555556,0.5,0.0,0.0,1.0,1.0,1.0,81.0,23.96219,3.952742,15.549184,21.494238,23.230662,26.405415,36.220412


In [22]:
X_train.groupby(by='Sex')[['Diastolic BP','Red blood cells','Sedimentation rate','Systolic BP','TIBC','Pulse pressure','TS','White blood cells']].describe()

Unnamed: 0_level_0,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Diastolic BP,Red blood cells,Red blood cells,...,TS,TS,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells,White blood cells
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0.0,2658.0,85.5,12.984495,48.0,78.0,84.0,92.0,180.0,2658.0,54.167871,...,36.5,100.0,2658.0,7.499059,2.211513,2.9,6.0,7.3,8.7,51.2
1.0,4216.0,81.829934,13.308664,38.0,72.0,80.0,90.0,170.0,4216.0,50.614777,...,33.0,97.0,4216.0,7.466271,2.438002,2.1,6.0,7.2,8.525,56.0


In [17]:
# Baseline linear regression model on dropped data
baseline_dropped = LinearRegression().fit(X_train,y_train)
y_pred = baseline_dropped.predict(X_test)

MSE_baseline_dropped = mean_squared_error(y_pred,y_test)
print(MSE_baseline_dropped)

142.24285468467096


In [18]:
# Baseline linear regression model on KNN imputed data
X_train_impute, X_test_impute, y_train_impute, y_test_impute = train_test_split(X_data_imputed, y_data,test_size=0.2,random_state=109)

baseline_impute = LinearRegression().fit(X_train_impute,y_train_impute)
y_pred_impute = baseline_impute.predict(X_test_impute)

MSE_baseline_impute = mean_squared_error(y_pred_impute,y_test_impute)
print(MSE_baseline_impute)

143.48928744765016
