In [2]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler


In [3]:
df = pd.read_csv('/Users/shimriteliezer/Documents/testProjects/CKDanalysis/CKD.csv', index_col = 0)
print(df.shape)
df.info()
df.drop_duplicates(inplace=True)
print(df.shape)
print(df.columns)



(400, 25)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 399
Data columns (total 25 columns):
age               391 non-null float64
bp                388 non-null float64
sg                353 non-null float64
al                354 non-null float64
su                351 non-null float64
rbc               248 non-null object
pc                335 non-null object
pcc               396 non-null object
ba                396 non-null object
bgr               356 non-null float64
bu                381 non-null float64
sc                383 non-null float64
sod               313 non-null float64
pot               312 non-null float64
hemo              348 non-null float64
pcv               330 non-null object
wc                295 non-null object
rc                270 non-null object
htn               398 non-null object
dm                398 non-null object
cad               398 non-null object
appet             399 non-null object
pe                399 non-null object


In [4]:
# extract labels col
y_data = df['classification']
x_data = df.drop(columns = ['classification'])
# split data
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.33, random_state=48)
print(x_train.shape)

(268, 24)


In [6]:
# -- handle train data --
# imputation
x_train_val = x_train.dropna(axis=1, thresh=x_train.shape[0]*0.75)
print(x_train_val.shape)
train_val_columns = x_train_val.columns
# clean data
# impute categorical val
cat_values = {} # create categorical common values dictionary - per att its most common value (top)
cat_col_list = x_train_val.select_dtypes(include=np.object).columns.tolist()
for key in cat_col_list:
    x_train_val[key] = x_train_val[key].str.strip()
    x_train_val[key] = x_train_val[key].str.strip('?')
    x_train_val[key] = pd.to_numeric(x_train_val[key], errors='ignore') # e.g. pcv
    if x_train_val[key].dtype == object:
        cat_values[key] = x_train_val[key].describe().top
cat_col_list = x_train_val.select_dtypes(include=np.object).columns.tolist() # after removal of numerical columns
x_train_val.fillna(value=cat_values, inplace=True)
# impute numerical val (mean/median/corr)
num_desc = x_train_val.describe() # note, this is executed only on numerical data
mean_num_values = num_desc.loc['mean']
x_train_val.fillna(value=mean_num_values, inplace=True)

# scale train data
x_train_val_scale = x_train_val
mms = MinMaxScaler() # fit numerical data
num_col_list = x_train_val.select_dtypes(include=np.number).columns.tolist()
x_train_val_scale[num_col_list] = mms.fit_transform(x_train_val_scale[num_col_list])
cat2num_map = {}
for key in cat_col_list: # create categorical values dictionary - per att its optional values and their numerical replacement (top)
    cat2num_map[key] = {x_train_val[key].value_counts().keys()[0]: 0, x_train_val[key].value_counts().keys()[1]: 1}
x_train_val_scale.replace(cat2num_map, inplace=True) # scale categorical data (TBD - onehot!)

x_train_val_scale.info()



(268, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 60 to 0
Data columns (total 21 columns):
age      268 non-null float64
bp       268 non-null float64
sg       268 non-null float64
al       268 non-null float64
su       268 non-null float64
pc       268 non-null int64
pcc      268 non-null int64
ba       268 non-null int64
bgr      268 non-null float64
bu       268 non-null float64
sc       268 non-null float64
sod      268 non-null float64
pot      268 non-null float64
hemo     268 non-null float64
pcv      268 non-null float64
htn      268 non-null int64
dm       268 non-null int64
cad      268 non-null int64
appet    268 non-null int64
pe       268 non-null int64
ane      268 non-null int64
dtypes: float64(12), int64(9)
memory usage: 46.1 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pa

In [7]:
# -- handle test data -- get only valid columns, fill NA, scale/fit
x_test_val = x_test[train_val_columns]
# clean data
cat_col_list_test = x_test_val.select_dtypes(include=np.object).columns.tolist()
for key in cat_col_list_test:
    x_test_val[key] = x_test_val[key].str.strip() # note, str can be triggered only per colums (cant be performed on the all data frame)
    x_test_val[key] = x_test_val[key].str.strip('?')
    x_test_val[key] = pd.to_numeric(x_test_val[key], errors='ignore')  # e.g. pcv
cat_col_list_test = x_test_val.select_dtypes(include=np.object).columns.tolist() # after removal of numeric
print(cat_col_list_test)
print(cat_col_list) # check the same
# fill NA
x_test_val.info()
x_test_val.fillna(value=mean_num_values, inplace=True)
x_test_val.fillna(value=cat_values, inplace=True)
# scale
x_test_val_scale = x_test_val
x_test_val_scale.replace(cat2num_map, inplace=True)
x_test_val_scale[num_col_list] = mms.fit_transform(x_test_val_scale[num_col_list])
x_test_val_scale.info()


['pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
['pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 150 to 71
Data columns (total 21 columns):
age      129 non-null float64
bp       129 non-null float64
sg       119 non-null float64
al       118 non-null float64
su       116 non-null float64
pc       113 non-null object
pcc      132 non-null object
ba       132 non-null object
bgr      116 non-null float64
bu       128 non-null float64
sc       128 non-null float64
sod      107 non-null float64
pot      107 non-null float64
hemo     113 non-null float64
pcv      107 non-null float64
htn      131 non-null object
dm       131 non-null object
cad      131 non-null object
appet    132 non-null object
pe       132 non-null object
ane      132 non-null object
dtypes: float64(12), object(9)
memory usage: 22.7+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 132 entries, 150 to 71
Data columns (total 21

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [8]:
# scale labels to 0/1
y_cat2num_map={}
# for key in y_train.keys():
y_train = y_train.str.strip()
y_test = y_test.str.strip()
y_cat2num_map = {y_train.value_counts().keys()[0]: 0, y_train.value_counts().keys()[1]: 1}
y_train.replace(y_cat2num_map, inplace=True)
y_test.replace(y_cat2num_map, inplace=True)


In [9]:
# feature selection
# method 1 - K best by chi^2 statistic
test = SelectKBest(score_func=chi2, k=10)
fit = test.fit(x_train_val_scale, y_train)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(x_train_val_scale)
# summarize selected features
print(features[0:15,:]) # print data of the K best features
print(np.argsort(fit.scores_)[-10:]) # best K features indexes (10)
print(x_train_val_scale.values[:15,np.argsort(fit.scores_)[-10:]]) # print data of the K best features


[ 0.6    1.415 15.481 29.158 13.851 28.363 17.585  8.509  4.026  4.275
  3.777  0.139  0.11   7.344  5.723 55.023 55.591 14.749 30.064 27.795
 22.69 ]
[[0.75  0.2   1.    1.    1.    0.    0.    0.    0.    0.   ]
 [0.25  0.204 0.    0.    0.    1.    0.    0.    0.    0.   ]
 [0.25  0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.5   0.4   1.    0.    0.    1.    0.    0.    1.    0.   ]
 [0.616 0.204 0.    0.    1.    1.    0.    0.    0.    1.   ]
 [1.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [1.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [1.    0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.75  0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [1.    0.4   1.    1.    0.    0.    0.    0.    0.    0.   ]
 [0.5   0.6   1.    0.    1.    1.    1.    1.    1.    0.   ]
 [0.616 0.204 0.    0.    1.    0.    0.    0.    1.    0.   ]
 [0.75  0.    0.    0.    0.    0.    0.    0.    0.    0.   ]
 [0.616 0.204 0.    0.    1.  