In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns 
from warnings import filterwarnings as filt

filt('ignore')
plt.rcParams['figure.figsize'] = (12,6)
plt.style.use('Solarize_Light2')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
base_dir = '/kaggle/input/predict-the-genetic-disorders-datasetof-genomes/'
traindf = pd.read_csv(f'{base_dir}train_genetic_disorders.csv')
testdf = pd.read_csv(f'{base_dir}test_genetic_disorders.csv')
traindf.head()

In [None]:
traindf.shape, testdf.shape

In [None]:
traindf.info()

In [None]:
traindf.select_dtypes(exclude = 'object').describe()

In [None]:
traindf.select_dtypes(include = 'object').describe()

In [None]:
traindf = traindf.dropna(subset= ['Genetic Disorder', 'Disorder Subclass'])
target1 = traindf['Genetic Disorder']
target2 = traindf['Disorder Subclass']
print(f"target 1 ==> {target1.unique()}")
print()
print(f"target 2 ==> {target2.unique()}")

In [None]:
# Keep the row labels of each split. NOTE(review): trainIdx/testIdx are not
# referenced again in the visible part of the notebook.
trainIdx, testIdx = traindf.index, testdf.index
# Stack the training features (targets removed) on top of the test features so
# the cleaning steps below are applied to both at once.
df = pd.concat([traindf.drop(['Genetic Disorder', 'Disorder Subclass'], axis = 1), testdf])
# 'Patient Id' is dropped from the feature frame.
df = df.drop(['Patient Id'], axis = 1)
df.head()

### Handling null values

In [None]:
def null(df):
    """Summarise the columns of ``df`` that contain missing values.

    Returns a DataFrame indexed by column name, ordered by descending missing
    count, with three columns:

    * ``nans``      - number of missing entries,
    * ``nans %``    - missing entries divided by the row count (a 0-1 fraction),
    * ``data_type`` - the column's dtype name.

    Columns without any missing value are left out.
    """
    missing = df.isnull().sum()
    report = missing.to_frame(name = 'nans').sort_values('nans', ascending = False)
    report['nans %'] = missing / df.shape[0]
    report['data_type'] = [df[col].dtypes.name for col in report.index]
    has_missing = report['nans'] > 0
    return report[has_missing]

null(df)

In [None]:
df.select_dtypes(include = 'object').columns

In [None]:
df[null(df).index].head()

In [None]:
feats_to_drop = [c for c in null(df).index if 'name' in c.lower()] + ['Place of birth', 'Location of Institute']
feats_to_drop

In [None]:
df = df.drop(feats_to_drop, axis = 1)

In [None]:
# Split the columns that still contain missing values by dtype so numeric and
# categorical features can be inspected and imputed separately.
nan = null(df)
num_null = nan[nan.data_type != 'object']
# Fixed typo: this compared against 'onject', which never matches, so
# obj_null was always empty.
obj_null = nan[nan.data_type == 'object']
df[num_null.index].head()

In [None]:
tests = ['Test 1','Test 2','Test 3','Test 4','Test 5']
for col in tests:
    print()
    print(col.center(60,'='))
    print(df[col].unique())
    

In [None]:
df[tests][df['Test 1'] == -99].head()

In [None]:
# Fill the missing test results with 0.0 (not -99); the -99 entries already
# present in these columns are mapped to 0 in the data-cleaning section below.
df[tests] = df[tests].fillna(0.0)

In [None]:
from scipy.stats import skew, norm 

ax = sns.distplot(df['Patient Age'], color = 'red')
ax = sns.distplot(df['Blood cell count (mcL)'], color = 'blue')
ax = sns.distplot(df['White Blood cell count (thousand per microliter)'], color = 'black')
plt.legend(['Patient age','blood cell count','white blood cell count'])

In [None]:
sns.scatterplot(data = df, x = 'White Blood cell count (thousand per microliter)',y ='Blood cell count (mcL)')

* It looks like there are a lot of outliers in the white blood cell counts.
* A negative result means no white blood cells (leukocytes) were found in the sample. If you or your child's results were negative, the symptoms are probably not caused by an infection.

In [None]:
sns.scatterplot(data = df, x = 'Patient Age',y ='Blood cell count (mcL)')

In [None]:
sns.scatterplot(data = df, y = 'White Blood cell count (thousand per microliter)',x ='Patient Age')

In [None]:
df['White Blood cell count (thousand per microliter)'][df['White Blood cell count (thousand per microliter)'] == 0].unique()

In [None]:
# Mean-impute the remaining numeric measurements.
# The data-cleaning section below treats -99 as a placeholder rather than a
# real measurement, so it is masked before averaging; otherwise every -99
# still present in a column would drag the fill value down.
for col, round_down in [
    ('White Blood cell count (thousand per microliter)', False),
    ('Patient Age', True),
    ('Blood cell count (mcL)', False),
]:
    fill_value = df[col].replace(-99.0, np.nan).mean()
    if round_down:
        # Keep the imputed age a whole number, as before.
        fill_value = np.floor(fill_value)
    df[col] = df[col].fillna(fill_value)

In [None]:
sns.scatterplot(data = df, y = "Mother's age", x = "Father's age")

In [None]:
sns.distplot(df["Mother's age"], fit = norm)

In [None]:
sns.distplot(df["Father's age"], fit = norm)

In [None]:
parent_age = ["Mother's age", "Father's age"]
df[parent_age] = df[parent_age].fillna(np.floor(df[parent_age].mean()))

In [None]:
np.floor(df[parent_age].mean())

In [None]:
sns.distplot(df['No. of previous abortion'])

In [None]:
df['No. of previous abortion'].unique()

In [None]:
df[df['No. of previous abortion'] == -99.0]

In [None]:
sns.scatterplot(data = df, y = "Mother's age", x = 'No. of previous abortion')

Wait, what? A mother aged just 18 with 4 previous abortions?

In [None]:
plt.xticks(rotation = 90)
sns.countplot(df["Father's age"][df["Mother's age"] == 18])
plt.title("Father's age for 18 year old mother");

In [None]:
import plotly.express as px

# px.scatter_3d(data_frame = df,x = "Father's age", y = "Mother's age", z = 'No. of previous abortion')

In [None]:
# Impute missing abortion counts with the floored mean of the valid counts.
# -99 placeholders (replaced with 0 further down) are masked first: leaving
# them in can pull the mean below zero and produce a negative fill value.
abortions = df['No. of previous abortion']
abortion_fill = np.floor(abortions.replace(-99.0, np.nan).mean())
df['No. of previous abortion'] = abortions.fillna(abortion_fill)

In [None]:
nans = null(df)
obj_null = nans[nans.data_type == 'object']
obj_null

In [None]:
simps = [f'Symptom {i}' for i in range(1,6)]
df[simps].head()

In [None]:
df[simps] = df[simps].fillna(0.0)

In [None]:
df[null(df).index].head()

In [None]:
def plot(df, rc, kind = 'dist'):
    """Draw one plot per column of ``df`` on an ``rc[0]`` x ``rc[1]`` grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to visualise; they are placed in column order, filling the
        grid row by row.
    rc : sequence of two ints
        Number of grid rows and grid columns.
    kind : {'dist', 'bar'}, default 'dist'
        'dist' draws ``sns.distplot`` and 'bar' draws ``sns.countplot``.
        Any other value leaves the used axes empty.

    Grid cells beyond the number of columns in ``df`` are hidden. Columns
    beyond the grid capacity are not drawn.
    """
    n_rows, n_cols = rc
    # squeeze=False keeps ``axes`` two-dimensional even for a single row or
    # column, so a 1xN or Nx1 grid no longer breaks the indexing.
    fig, axes = plt.subplots(n_rows, n_cols, figsize = (13, 8), squeeze = False)
    cols = df.columns
    for ind, axis in enumerate(axes.flat):
        if ind >= len(cols):
            # Nothing left to draw: hide the spare panel instead of showing
            # an empty frame.
            axis.set_visible(False)
            continue
        x = df[cols[ind]]
        if kind == 'dist':
            sns.distplot(x, ax = axis)
        elif kind == 'bar':
            # Pass the data by keyword: the first positional argument of
            # countplot is not ``x`` in every seaborn release.
            sns.countplot(x = x, ax = axis)
    # Lay out after drawing so tick labels and axis labels are accounted for.
    fig.tight_layout()

In [None]:
plot(df[null(df).index[:10]], [2,5], 'bar')

In [None]:
# 'Parental consent' is dropped. The author's assumption: consent is present
# for most rows, and the tests would only have been run with the parents'
# consent anyway (a child would not go for a check-up on their own).

df = df.drop(['Parental consent'], axis = 1)

In [None]:
null(df)

In [None]:
for i in null(df).index:
    print(i.center(60,'='))
    print(df[i].unique())
    print()

In [None]:
nos = [c for c in null(df).index if 'No' in df[c].unique()]
plot(df[nos], [4,3], 'bar')

In [None]:
df[nos] = df[nos].fillna('No')

In [None]:
plot(df[null(df).index], [2,4], 'bar')

In [None]:
for i in null(df).index:
    df[i] = df[i].fillna(df[i].mode()[0])

In [None]:
null(df)

### Data cleaning 

In [None]:
df.head()

In [None]:
categorical_feats = [c for c in df.columns if df[c].dtypes == 'object']
numerical_feats = [c for c in df.columns if df[c].dtypes != 'object']
num99 = [c for c in numerical_feats if -99.0 in df[c].unique()]
obj99 = [c for c in categorical_feats if "-99" in df[c].unique()]

In [None]:
df[num99].head()

In [None]:
# Map the -99 placeholder to 0 for the test results and the abortion count.
zero_fill_cols = ['Test 1','Test 2','Test 3','Test 4','Test 5','No. of previous abortion']
df[zero_fill_cols] = df[zero_fill_cols].replace({-99.0 : 0})

# For the white blood cell count, -99 is replaced by the median. The median is
# taken over the valid values only; previously the -99 entries themselves were
# part of the median calculation.
wbc_col = 'White Blood cell count (thousand per microliter)'
wbc_median = df[wbc_col].replace(-99.0, np.nan).median()
df[wbc_col] = df[wbc_col].replace({-99.0 : wbc_median})

In [None]:
[c for c in numerical_feats if -99.0 in df[c].unique()]

In [None]:
df[obj99].head()

In [None]:
plot(df[obj99], [3,5], 'bar')

In [None]:
# Normal (30-60)
# Respiratory Rate (breaths/min)	Heart Rate (rates/min	Follow-up	Gender

df[['Respiratory Rate (breaths/min)','Heart Rate (rates/min','Follow-up','Gender']]

In [None]:
# Replace the '-99' placeholder in these categorical columns with the most
# frequent *valid* category. The mode is computed with '-99' excluded: if the
# placeholder were the most common value, replacing it with the plain mode
# would change nothing.
for feat in ['Respiratory Rate (breaths/min)','Heart Rate (rates/min','Follow-up','Gender']:
    valid_values = df[feat][df[feat] != '-99']
    if valid_values.empty:
        # No valid category to impute from; leave the column untouched.
        continue
    df[feat] = df[feat].replace({'-99' : valid_values.mode()[0]})

# Collapse the range annotation so the normal category is simply 'Normal'.
df['Respiratory Rate (breaths/min)'] = df['Respiratory Rate (breaths/min)'].replace({'Normal (30-60)' : 'Normal'})

In [None]:
df['Birth asphyxia'] = df['Birth asphyxia'].replace({
    'No record': 'Not available', 
    '-99' : 'Not available'})

In [None]:
appli = [c for c in obj99[5:] if 'Not applicable' in df[c].unique()]
df[appli] = df[appli].replace({
    '-99' : 'Not applicable',
    'None' : 'No',
    '-' : 'No'
})
for i in appli:
    print(df[i].unique())

In [None]:
obj99 = [c for c in categorical_feats if '-99' in df[c].unique()]
for obj in obj99:
    print(f"{obj} ===> {df[obj].unique()}")

In [None]:
# NOTE(review): the slices below depend on the column order of ``obj99``
# computed in the previous cell - confirm they still select the intended
# columns if the feature list changes.
# All but the last two remaining columns: treat '-99' as 'No'.
df[obj99[:-2]] = df[obj99[:-2]].replace({'-99' : 'No'})
# Last two columns: use the most frequent valid category. '-99' is excluded
# from the mode so the placeholder can never be chosen as its own replacement.
for obj in obj99[-2:]:
    valid_values = df[obj][df[obj] != '-99']
    if valid_values.empty:
        continue
    df[obj] = df[obj].replace({'-99': valid_values.mode()[0]})

In [None]:
print([c for c in categorical_feats if '-99' in df[c].unique()])
print([c for c in numerical_feats if -99.0 in df[c].unique()])

In [None]:
df.head()

In [None]:
categorical_feats = df.select_dtypes(include = 'object').columns

In [None]:
for s in simps: 
    df[s] = pd.to_numeric(df[s])
    
categorical_feats = df.select_dtypes(include = 'object').columns

In [None]:
df[categorical_feats].head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder as oe, LabelEncoder as le

In [None]:
encoder = oe()
df[categorical_feats] = encoder.fit_transform(df[categorical_feats])
df[categorical_feats].head()

In [None]:
# Undo the earlier train/test concatenation: the first traindf.shape[0] rows
# of the re-indexed frame are the training features, the rest are test rows.
n_train = traindf.shape[0]
features = df.reset_index(drop = True)
x, test_x = features.iloc[: n_train], features.iloc[n_train :]

# Encode the 'Disorder Subclass' labels as integers in a one-column frame.
target_encoder = le()
encoded_target = target_encoder.fit_transform(target2)
y_train = pd.DataFrame(encoded_target, columns = [target2.name])
x.shape, traindf.shape, test_x.shape, testdf.shape

In [None]:
x.head()

In [None]:
test_x.head()

In [None]:
fig, ax = plt.subplots(1,2)
# Left: class counts by subclass name. Right: the same counts by encoded label.
sns.countplot(x = target2.sort_values(ascending = True), ax = ax[0])
sns.countplot(x = y_train[target2.name], ax = ax[1])
# Rotate the class names only after plotting. The previous version set the
# tick labels beforehand from target2.unique(), whose order (first appearance)
# does not match the sorted bars, and countplot replaced them anyway.
ax[0].tick_params(axis = 'x', labelrotation = 90)
# Lay out last so the rotated labels are taken into account.
fig.tight_layout()

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.svm import SVC
import eli5
from eli5.sklearn import PermutationImportance
from pdpbox import pdp
import shap
from sklearn.feature_selection import mutual_info_classif

def permImp(val_x, val_y):
    """Show permutation importances of a random forest fitted on the given data.

    A 100-tree random forest is fitted on ``val_x``/``val_y``, permutation
    importance is computed on that same data, and the eli5 weights display is
    returned with the columns of ``val_x`` as feature names.
    """
    forest = rfc(n_estimators=100)
    forest.fit(val_x, val_y)
    importance = PermutationImportance(forest)
    importance.fit(val_x, val_y)
    return eli5.show_weights(importance, feature_names = val_x.columns.tolist())

def interact(cols, val_x, val_y):
    """Plot the partial-dependence interaction of the features in ``cols``.

    A 100-tree random forest is fitted on ``val_x``/``val_y``; pdpbox computes
    the interaction for ``cols`` on ``val_x`` and its plot result is returned.
    """
    forest = rfc(n_estimators=100)
    forest.fit(val_x, val_y)
    interaction = pdp.pdp_interact(
        forest,
        dataset = val_x,
        model_features = val_x.columns,
        features = cols,
    )
    return pdp.pdp_interact_plot(interaction, feature_names=cols)

def isolate(col, val_x, val_y):
    """Plot the partial dependence of the single feature ``col``.

    A 100-tree random forest is fitted on ``val_x``/``val_y``; pdpbox computes
    the isolated partial dependence of ``col`` on ``val_x`` and its plot
    result is returned.
    """
    forest = rfc(n_estimators=100)
    forest.fit(val_x, val_y)
    isolated = pdp.pdp_isolate(
        forest,
        dataset = val_x,
        model_features = val_x.columns,
        feature = col,
    )
    return pdp.pdp_plot(isolated, feature_name=col)

def forceplot(train_x, train_y, val_x):
    """Return a SHAP force plot for one randomly sampled row of ``val_x``.

    Parameters
    ----------
    train_x, train_y
        Data used to fit the 100-tree random forest that is explained.
    val_x : pandas.DataFrame
        Frame from which a single row is sampled for the explanation.

    The model is now fitted on the ``train_x``/``train_y`` arguments. The
    previous version ignored them and fitted on ``val_x`` together with a
    global ``val_y`` that was never passed in.
    """
    # ravel() accepts both a Series and a one-column DataFrame as target.
    model = rfc(n_estimators=100).fit(train_x, np.ravel(train_y))
    feats = val_x.sample(n = 1)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(feats)
    # Index 1 picks the second entry of the expected values / SHAP values.
    # NOTE(review): presumably the per-class output for class 1 - confirm
    # against the installed shap version.
    return shap.force_plot(explainer.expected_value[1],shap_values[1], feature_names = feats.columns, features = feats)

def train_val(x, y, test_size = 0.2, random_state = None):
    """Randomly split ``x`` and ``y`` into training and validation parts.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame.
    y : pandas.DataFrame or pandas.Series
        Targets sharing the index labels of ``x``.
    test_size : float, default 0.2
        Fraction of rows placed in the validation part.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``; set it for a reproducible
        split. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    train_x, val_x, train_y, val_y

    Notes
    -----
    Rows are selected by label (``.loc``) to match the label-based ``drop``.
    The previous positional ``.iloc`` lookup only agreed with ``drop`` when
    the index happened to be 0..n-1.
    """
    idx = x.sample(frac = test_size, random_state = random_state).index
    train_x, val_x = x.drop(idx), x.loc[idx]
    train_y, val_y = y.drop(idx), y.loc[idx]
    return train_x, val_x, train_y, val_y

def plot_mi(score):
    """Draw a horizontal bar chart of the ``mi_score`` column of ``score``.

    Rows are sorted by ascending ``mi_score`` before plotting; the index
    values are used as bar labels. Returns nothing.
    """
    ordered = score.sort_values('mi_score', ascending = True)
    plt.barh(ordered.index, ordered['mi_score'])

def mi_score(x, y):
    """Compute, plot and return mutual-information scores of ``x`` against ``y``.

    Scores come from ``mutual_info_classif`` with ``discrete_features=False``.
    They are plotted with ``plot_mi`` and returned as a one-column frame
    (``mi_score``) indexed by feature name, sorted in descending order.
    """
    mi_values = mutual_info_classif(x, y, discrete_features=False)
    score = pd.DataFrame(mi_values, index = x.columns, columns = ['mi_score'])
    plot_mi(score)
    ranked = score.sort_values('mi_score', ascending = False)
    return ranked

In [None]:
train_x, val_x, train_y, val_y = train_val(x, y_train)
train_x.shape, val_x.shape, train_y.shape, val_y.shape

In [None]:
sns.heatmap(train_x.corr(), cmap = 'icefire')

In [None]:
permImp(val_x, val_y)

In [None]:
mscore = mi_score(val_x, val_y)

In [None]:
isolate('Symptom 5', val_x, val_y);

In [None]:
shap.initjs()
forceplot(train_x, train_y, val_x)

In [None]:
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.neighbors import KNeighborsClassifier as knn
from xgboost import XGBRFClassifier as xgb 

from sklearn.model_selection import cross_val_score as cvs, GridSearchCV as gscv, StratifiedKFold as skf
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.preprocessing import StandardScaler as ss, RobustScaler as rs, MinMaxScaler as mms 

from sklearn.pipeline import Pipeline
from scipy.stats import skew, norm

In [None]:
def best_model(train_x, train_y):
    """Rank a fixed set of default classifiers by mean cross-validated accuracy.

    Each model is evaluated with the same 5-fold stratified, shuffled split
    (``random_state = 1``).

    Returns
    -------
    pandas.DataFrame
        One row per model name with a single ``accuracy`` column holding the
        mean fold score, sorted from best to worst.

    Notes
    -----
    The previous version built the result from ``score`` - the five fold
    scores of the *last* model - instead of the collected ``scores``, so the
    table showed xgboost's per-fold results labelled with the model names.
    """
    models = [SVC(), rfc(), knn(), gnb(), xgb()]
    names = ['svm','random forest clf', 'knn', 'naive bayes', 'xgboost']
    # One splitter shared by all models so they are compared on identical folds.
    cv = skf(n_splits = 5, shuffle = True, random_state = 1)
    # ravel() accepts both a Series and a one-column DataFrame as target.
    y = np.ravel(train_y)
    scores = [cvs(model, train_x, y, cv = cv, verbose = 1).mean() for model in models]
    return pd.DataFrame(scores, index = names, columns = ['accuracy']).sort_values('accuracy', ascending = False)

def gcv(train_x, train_y, model, params):
    """Grid-search ``params`` for ``model`` with 5-fold stratified CV.

    Parameters
    ----------
    train_x, train_y
        Training features and targets.
    model
        Estimator to tune.
    params : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    best_estimator, best_params, results
        ``results`` is a frame with the ``mean_test_score``,
        ``mean_train_score`` and ``params`` columns of ``cv_results_``.
    """
    cv = skf(n_splits = 5, shuffle = True, random_state = 1)
    # Pass the splitter explicitly: it was created but never used before, so
    # the search silently ran with GridSearchCV's default CV instead.
    clf = gscv(model, param_grid= params, cv = cv, verbose = 2, return_train_score=True, n_jobs = -1)
    clf.fit(train_x, train_y)
    results = pd.DataFrame(clf.cv_results_)
    results = results[['mean_test_score','mean_train_score','params']]
    return clf.best_estimator_, clf.best_params_, results

def get_score(yt, pred):
    """Print the classification report of predictions ``pred`` against ``yt``."""
    report = classification_report(yt, pred)
    print(report)

In [None]:
# best_model(train_x, train_y)

In [None]:
train_x = train_x.drop(tests, axis = 1)
val_x = val_x.drop(tests, axis = 1)
# clf, best_params, results = gcv(ss().fit_transform(train_x), train_y, SVC(), {
#     'C' : [1,10,40,50],
#     'kernel' : ['rbf','poly','sigmoid'],
#     'decision_function_shape' : ['ovo','ovr']
# })
# results.head()

In [None]:
# Fit the XGBoost random-forest classifier and evaluate it on the validation split.
# 'decision_function_shape' was removed from the constructor call: it is an
# SVC option, not a parameter of this xgboost estimator.
clf = xgb(max_depth = 15, learning_rate = 1.5, reg_lambda = 2, reg_alpha = 0.5)
clf.fit(train_x, train_y)
pred = clf.predict(val_x)
sns.heatmap(confusion_matrix(val_y, pred), fmt = '.1f', annot = True)
get_score(val_y, pred)

In [None]:
clf.score(train_x, train_y), clf.score(val_x, val_y)

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression as lrr

In [None]:
n = int(np.sqrt(train_x.shape[0]))
n += 1 if n % 2 == 0 else 0  
n

In [None]:

estimators = [
    ('svm' , SVC(C = 10)), 
    ('rfc' , rfc(n_estimators=100)), 
    ('knn' , knn(n_neighbors=n)), 
    ('gnb' , gnb()), 
    ('xgb' , xgb(n_estimators = 100))
    ]

clfs = StackingClassifier(estimators=estimators, final_estimator=lrr(solver='liblinear'), verbose= 1, n_jobs = -1 )
clfs.fit(train_x, train_y)
pred = clfs.predict(val_x)
clfs.score(train_x, train_y), clfs.score(val_x, val_y)

In [None]:
sns.heatmap(confusion_matrix(val_y, pred), fmt = '.1f', annot = True)
get_score(val_y, pred)

In [None]:
testing = val_x.sample(n = 1)
testing 

In [None]:
# val_y, 
val_y.loc[testing.index,], target_encoder.inverse_transform(val_y.loc[testing.index])[0]

In [None]:
clf.predict(testing)[0], clfs.predict(testing)[0]