In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns

from pylab import rcParams
from matplotlib import pyplot as plt

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials , space_eval

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.tree import DecisionTreeClassifier
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score, train_test_split

import statsmodels.api as sm
from scipy import stats as st 


pd.options.display.float_format = '{:,.2f}'.format
rcParams['figure.figsize'] = 10, 10

In [None]:
def from_pd_to_str(_df, _col, _val, _desc):
    """Render the ``describe()`` statistics of a filtered column as text.

    Rows of ``_df`` whose ``_col`` equals ``_val`` are selected and
    ``describe()`` is run on their ``_desc`` column. Every statistic becomes
    one line of the form ``NAME value`` (name upper-cased, value formatted
    with ``{:6.2f}``); the lines are joined with newlines.
    """
    selected = _df.loc[_df[_col] == _val, _desc]
    lines = []
    for stat_name, stat_value in selected.describe().items():
        lines.append('{} {:6.2f}'.format(stat_name.upper(), stat_value))
    return '\n'.join(lines)

In [None]:
df = pd.read_csv('/kaggle/input/memory-test-on-drugged-islanders-data/Islander_data.csv')
df.head()

In [None]:
df.last_name.value_counts()

In [None]:
df.groupby(['Drug', 'Dosage'])['last_name'].count()

In [None]:
sns.pairplot(df, hue="Happy_Sad_group")

In [None]:
print("""POTENTIAL IMPROVEMENTS FOR THE DATA:
- EXTRACT ETHNICITY BASED ON LAST_NAME
- EXTRACT SEX BASED ON FIRST NAME""")

In [None]:
### OUTLIER REMOVAL EXPERIMENTAL

In [None]:
clustering = DBSCAN(eps=10, min_samples=2).fit(df[['Mem_Score_Before', 'Mem_Score_After']])

In [None]:
df['Cluster_ID'] = clustering.labels_
sns.scatterplot(data = df, x = 'Mem_Score_Before', y = 'Mem_Score_After', hue = 'Cluster_ID');

In [None]:
df.query('Cluster_ID < 0')

In [None]:
df.Diff.describe()

In [None]:
df.groupby(['Drug', 'Dosage'])['age'].describe()

In [None]:
df.groupby('last_name')['age'].agg(['mean', 'median', 'count'])

In [None]:
# Per Happy_Sad_group value: mean of the score before and median of the score after.
# NOTE(review): the two columns use different statistics (mean vs. median) —
# confirm this mix is intentional before comparing them side by side.
df.groupby('Happy_Sad_group').agg({'Mem_Score_Before': 'mean', 
                                   'Mem_Score_After':'median'})

In [None]:
df['HS'] = df.Happy_Sad_group.map({'H': True, 'S': False})

In [None]:
# Summary statistics of the score change for every (Drug, Dosage) combination.
# `display` (the IPython built-in) is needed here: only the last expression of
# a cell is rendered automatically, so a bare `.describe()` on the first line
# would be computed and then silently discarded.
display(df.groupby(['Drug', 'Dosage'])['Diff'].describe())

# Kernel density estimate of the score change over all rows of df.
sns.distplot(df.Diff, hist = False, kde = True, norm_hist = True, kde_kws={'linestyle':':'}, bins = 50, color = 'red')
plt.title('Memory Score Difference Before and After the treatment \nBOTH DRUGS')
plt.text(20, 0.03, 'MULTIMODAL\nDISTRIBUTION');

#### ANOVA TESTS

In [None]:
st.kruskal(df[df.HS]['Diff'], df[~df.HS]['Diff'])

In [None]:
# Overlay the Diff density of the rows where HS is True (red) and of the
# rows where HS is False (blue).
colors = ['red', 'blue']
hs_mask = df['HS']
for line_color, subset in zip(colors, (df[hs_mask], df[~hs_mask])):
    sns.distplot(subset['Diff'], hist=False, kde=True, norm_hist=True,
                 kde_kws={'linestyle': ':'}, bins=50, color=line_color)

In [None]:
st.f_oneway(df[df.HS]['Diff'], df[~df.HS]['Diff'])

In [None]:

# Kendall rank correlation between the numeric and boolean columns only.
# String columns (names, group codes, drug letters) are dropped explicitly:
# recent pandas versions raise on non-numeric data in DataFrame.corr instead of
# silently skipping it, so calling corr on the full frame is not portable.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr = numeric_df.corr(method='kendall')
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="YlGnBu",
            annot=True)

In [None]:
# Density of the memory score before (dotted line) and after (dashed line).
score_styles = (('Mem_Score_Before', ':'), ('Mem_Score_After', '--'))
for score_column, line_style in score_styles:
    sns.distplot(df[score_column], hist=False, kde=True, norm_hist=True,
                 kde_kws={'linestyle': line_style});
plt.title('Memory Score Before and After the treatment \nBOTH DRUGS')
plt.text(100, 0.015, '--- AFTER\n... BEFORE');

In [None]:
# Density of Diff for the 'H' rows (orange) and the 'S' rows (blue), each
# annotated with the describe() statistics produced by from_pd_to_str.
group_colors = (('H', 'orange'), ('S', 'blue'))
for group_code, group_color in group_colors:
    in_group = df['Happy_Sad_group'] == group_code
    sns.distplot(df[in_group].Diff, hist=False, kde=True, norm_hist=True, color=group_color);
plt.title('Memory Score Diff \nBOTH DRUGS')
# (y position, text template, group code) of each statistics box.
annotations = ((0.015, 'HAPPY \n{}', 'H'), (0.03, 'SAD \n{}', 'S'))
for text_y, template, group_code in annotations:
    plt.text(20, text_y, template.format(from_pd_to_str(df, 'Happy_Sad_group', group_code, 'Diff')));

In [None]:
# Per-row average of the before/after memory scores, computed column-wise
# instead of through a row-by-row Python lambda. Despite the column name
# 'med_score' this is a mean, not a median; the name is kept because later
# cells read it. skipna=False keeps np.mean's behaviour of propagating NaN.
df['med_score'] = df[['Mem_Score_Before', 'Mem_Score_After']].mean(axis=1, skipna=False)
# Natural logarithms of age and of the averaged score (vectorized np.log).
df['lage'] = np.log(df['age'])
df['lscore'] = np.log(df['med_score'])

In [None]:
# Scatter of log(age) against log(averaged score), annotated with the
# correlation coefficient of the two columns truncated to six characters.
age_score_corr = df[["lage", "lscore"]].corr().values[-1][0]
plt.scatter(df['lage'], df['lscore']);
plt.text(4, 3.6, f'\nOLDER PEOPLE -> HIGHER SCORE [?]\nCORR COEFF: {str(age_score_corr)[:6]}');
plt.xlabel('LOG OF AGE')
plt.ylabel('LOG OF SCORE')
plt.title('TERRIBLE SCATTER');

In [None]:
sns.scatterplot(x="lage", y="lscore", hue="last_name", data=df, marker = 's');

In [None]:
# Regression joint plot of the score change against age. The columns are
# passed by keyword because recent seaborn releases no longer accept the x/y
# data positionally; the keyword form works on older releases as well.
age_diff_corr = df[["age", "Diff"]].corr().values[-1][0]
joint_grid = sns.jointplot(x='age', y='Diff', data=df, kind='reg');
# Write the annotation on the central (joint) axes explicitly rather than on
# whichever axes matplotlib happens to consider current.
joint_grid.ax_joint.text(60, 40, f'\nVERY SLIGHT MEMORY LOSS\nWITH AGE [?]\n\nCORR COEFF: {str(age_diff_corr)[:6]}');

In [None]:
sns.scatterplot(y="Diff", x="Mem_Score_Before", hue="Drug", data=df, style = 'Dosage');

In [None]:
df['cc'] = df.Diff.apply(lambda x: 'red' if x < 0 else 'blue')

In [None]:
DrugA = df[df.Drug == 'A'].sort_values(['Dosage', 'Diff']).reset_index()
DrugS = df[df.Drug == 'S'].sort_values(['Dosage', 'Diff']).reset_index()
DrugT = df[df.Drug == 'T'].sort_values(['Dosage', 'Diff']).reset_index()

In [None]:
# Drug A: one horizontal segment per row, starting at the median of med_score
# and extending by that row's Diff (colour taken from the `cc` column). The
# vertical position is the row's index plus an offset of 37 per dosage level.
dd = DrugA
med_val = dd['med_score'].median()
# Line drawn in white — apparently only there to set the axes extent.
a = np.linspace(-2, med_val * 2, 200)
plt.plot(a, 1.7 * a, linestyle=None, color='white')
for ix, row in dd.iterrows():
    segment_y = 5 + ix + row['Dosage'] * 37
    plt.annotate("", xytext=(med_val, segment_y), xy=(med_val + row['Diff'], segment_y),
                 arrowprops={'arrowstyle': '-', 'color': row['cc']})
# (y position, text template, dosage level) of each per-dosage statistics box.
dosage_labels = (
    (150, "\nDOSAGE 3\n{}\n", 3),
    (80, "\nDOSAGE 2\n{}\n", 2),
    (20, "\nDOSAGE 1\n{}\n", 1),
)
for label_y, template, dosage in dosage_labels:
    plt.text(10, label_y, template.format(from_pd_to_str(dd, 'Dosage', dosage, 'Diff')))
plt.title('DRUG A');

In [None]:
# Drug S: one horizontal segment per row, starting at the median of med_score
# and extending by that row's Diff (colour taken from the `cc` column). The
# vertical position is the row's index plus an offset of 37 per dosage level.
dd = DrugS
med_val = dd['med_score'].median()
# Line drawn in white — apparently only there to set the axes extent.
a = np.linspace(-2, 80, 100)
plt.plot(a, 2.4 * a, linestyle=None, color='white')
for ix, row in dd.iterrows():
    segment_y = 5 + ix + row['Dosage'] * 37
    plt.annotate("", xytext=(med_val, segment_y), xy=(med_val + row['Diff'], segment_y),
                 arrowprops={'arrowstyle': '-', 'color': row['cc']})
# (y position, text template, dosage level) of each per-dosage statistics box.
dosage_labels = (
    (150, "\nDOSAGE 3\n{}\n", 3),
    (80, "\nDOSAGE 2\n{}\n", 2),
    (20, "\nDOSAGE 1\n{}\n", 1),
)
for label_y, template, dosage in dosage_labels:
    plt.text(10, label_y, template.format(from_pd_to_str(dd, 'Dosage', dosage, 'Diff')))
plt.title('DRUG S');

In [None]:
# Drug T: one horizontal segment per row, starting at the median of med_score
# and extending by that row's Diff (colour taken from the `cc` column). The
# vertical position is the row's index plus an offset of 37 per dosage level,
# so higher dosages are drawn higher up.
dd = DrugT
med_val = dd.med_score.median()
# Line drawn in white — apparently only there to set the axes extent.
a = np.linspace(-2,80, 200)
plt.plot(a, 2.4*a, linestyle = None, color = 'white')
for ix, row in dd.iterrows():
    plt.annotate("", xytext = (med_val, 5+ix+row.Dosage * 37), xy = (med_val+row.Diff, 5+ix+row.Dosage * 37), arrowprops=dict(arrowstyle="-", color = row.cc))
# Each label is paired with the statistics of the dosage it names, matching
# the Drug A and Drug S cells (dosage 3 at the top, dosage 1 at the bottom).
plt.text(10, 150, "\nDOSAGE 3\n{}\n".format(from_pd_to_str(dd, 'Dosage', 3, 'Diff')))
plt.text(10, 80, "\nDOSAGE 2\n{}\n".format(from_pd_to_str(dd, 'Dosage', 2, 'Diff')))
plt.text(10, 20,  "\nDOSAGE 1\n{}\n".format(from_pd_to_str(dd, 'Dosage', 1, 'Diff')))
plt.title('DRUG T');

Let's do some exploration.

In [None]:
from sklearn.preprocessing import normalize

def plt_cos(_df, gr_cols: list, domain: list, norm: bool = False):
    """Show the row-by-row cosine-similarity matrix of ``_df`` as an image.

    If ``_df`` has a 'Drug' column, one indicator column per drug value is
    appended first, so those values can be listed in ``domain``. Rows are
    sorted by ``gr_cols`` and the similarity is computed between the rows of
    the ``domain`` columns. With ``norm=True`` the values are first passed
    through sklearn's ``normalize`` on the transposed matrix, i.e. each
    domain column is normalised rather than each row.

    Every group of ``gr_cols`` gets a red text label (group values joined by
    '_') placed at x = -20 and at the position of the group's last row.
    """
    if 'Drug' in _df.columns:
        drug_indicators = pd.get_dummies(_df['Drug'])
        _df = pd.concat([_df, drug_indicators], axis=1)

    # Two resets in a row: the second one stores the sorted row position in a
    # column named 'level_0' (the first already used the name 'index').
    ordered = _df.sort_values(gr_cols).reset_index().reset_index()
    group_bounds = ordered.groupby(gr_cols)['level_0'].agg(['min', 'max']).reset_index()

    features = ordered[domain].values
    if norm:
        features = normalize(features.T).T

    plt.imshow(cosine_similarity(features), cmap='Blues')
    for _, bounds_row in group_bounds.iterrows():
        group_label = '_'.join(str(value) for value in bounds_row[gr_cols].values)
        plt.text(-20, bounds_row['max'], group_label, color='red')
    plt.show()

In [None]:
plt_cos(df, ['Happy_Sad_group'], [ 'Diff', 'Mem_Score_Before', 'age'], True)

In [None]:
plt_cos(df, ['Happy_Sad_group'], [ 'Diff', 'Mem_Score_Before', 'age'], False)

In [None]:
plt_cos(df, ['Drug', 'Dosage'], ['A','S','T','Dosage', 'Diff'])

In [None]:
plt_cos(df, ['Drug', 'Dosage'], ['A','S','T','Dosage', 'Diff'], True)

In [None]:
plt_cos(df, ['Happy_Sad_group',], [ 'Diff', 'Mem_Score_Before'], True)

In [None]:
# 2-D t-SNE embedding of the score, age and dosage columns, coloured by
# Happy_Sad_group. t-SNE is stochastic, so a fixed random_state is passed to
# get the same embedding on every "Restart & Run All".
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300, random_state=2)
tsne_results = tsne.fit_transform(df[['Mem_Score_After', 'Mem_Score_Before', 'age', 'Dosage']])

# Keep both embedding coordinates on df so seaborn can address them by name.
df['tsne-2d-one'] = tsne_results[:,0]
df['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="Happy_Sad_group",
    palette=sns.color_palette("hls", 2),
    data=df,
    legend="full",
    alpha=0.3);

In [None]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes()

In [None]:
print(diabetes.DESCR)

In [None]:
# diabetes.feature_names

In [None]:
n = diabetes.data.shape[0]

data = diabetes.data
targets = diabetes.target
# Binarise the regression target: 1 when strictly above the median, else 0.
# The median is computed once up front instead of once per element.
median_target = np.median(targets)
binary_targets = (targets > median_target).astype(int)

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import mean_squared_error

# Seed shared by the split, the CV folds, the model and the search.
random_state=2
# Number of hyperopt evaluations.
n_iter=30

# Stratified 80/20 split on the binarised target.
X_train, X_test, y_train, y_test = train_test_split(data,binary_targets,stratify=binary_targets, test_size=0.20, shuffle=True,
                                                                      random_state=random_state)

num_folds=5
# random_state only takes effect together with shuffle=True; current
# scikit-learn raises when a seed is given without shuffling.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

def gb_mse_cv(params, random_state=random_state, cv=kf, X=X_train, y=y_train):
    """Hyperopt objective: cross-validated MSE of an LGBMClassifier.

    ``params`` must provide 'n_estimators', 'max_depth' and 'learning_rate';
    the first two are cast to ``int`` before being handed to the model.
    The defaults of ``random_state``, ``cv``, ``X`` and ``y`` are bound to the
    module-level objects that exist at the time this function is defined.

    Returns the mean over the folds of the "neg_mean_squared_error" score with
    its sign flipped, so that smaller is better.
    """
    model = LGBMClassifier(
        random_state=random_state,
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        learning_rate=params['learning_rate'],
    )
    fold_scores = cross_val_score(model, X, y, cv=cv,
                                  scoring="neg_mean_squared_error", n_jobs=-1)
    return -fold_scores.mean()


# Search space: integer-valued tree count and depth (quantised to step 1) and
# a learning rate drawn log-uniformly, i.e. exp(u) with u in [-5, 0].
space={'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
       'max_depth' : hp.quniform('max_depth', 2, 20, 1),
       'learning_rate': hp.loguniform('learning_rate', -5, 0)
      }


trials = Trials()

# NOTE(review): some hyperopt releases expect `rstate` to be a
# numpy.random.Generator (np.random.default_rng) rather than a RandomState —
# confirm against the installed version.
best=fmin(fn=gb_mse_cv,
          space=space,
          algo=tpe.suggest,
          max_evals=n_iter,
          trials=trials,
          rstate=np.random.RandomState(random_state))

# Refit on the full training split with the best hyper-parameters found.
model = LGBMClassifier(random_state=random_state, n_estimators=int(best['n_estimators']),
                      max_depth=int(best['max_depth']),learning_rate=best['learning_rate'])

model.fit(X_train, y_train)

# Predict once per split and reuse the result for both the report and the
# confusion matrix instead of running every prediction twice.
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

clf = pd.DataFrame(classification_report(y_test, test_pred, output_dict = True))
clf_train = pd.DataFrame(classification_report(y_train, train_pred, output_dict = True))

train_mtrx = confusion_matrix(y_train, train_pred)
test_mtrx = confusion_matrix(y_test, test_pred)

In [None]:
clf

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(Normalizer(), svm.SVC())
pipe.fit(X_train, y_train)

In [None]:
# Hyper-parameter candidates for the Normalizer -> SVC pipeline.
# 'precomputed' is deliberately not offered as a kernel: it expects X to be a
# square (n_samples, n_samples) kernel matrix, so every candidate that samples
# it fails to fit on the raw feature matrix used here.
param_grid = dict(normalizer__norm=['l1', 'l2', 'max'],
                  svc__kernel=['linear', 'poly', 'rbf', 'sigmoid'],
                  svc__C=[0.5, 1.0, 1.5])

# Seeded with the notebook-wide random_state so the sampled candidates are
# the same on every run.
search = RandomizedSearchCV(pipe, param_distributions=param_grid, random_state=random_state)

In [None]:
search.fit(X_train, y_train)

In [None]:
def compare_train_test(_clf, _x_train, _x_test, _y_train, _y_test):
    """Put the train and test classification metrics of class '1' side by side.

    For each split, ``_clf.predict`` is scored with ``classification_report``
    and only the '1' column of the report is kept. The result has one row per
    report entry, named in the 'index' column, and the columns 'TRAIN' and
    'TEST'.
    """
    def _class_one_scores(features, labels, column_name):
        # The '1' column of the report, renamed to the given split name.
        report = classification_report(labels, _clf.predict(features), output_dict=True)
        scores = pd.DataFrame(report).reset_index()[['index', '1']]
        return scores.rename(columns={'1': column_name})

    train_scores = _class_one_scores(_x_train, _y_train, 'TRAIN')
    test_scores = _class_one_scores(_x_test, _y_test, 'TEST')
    # The only shared column is 'index', so the merge joins on it.
    return train_scores.merge(test_scores)



In [None]:
#TUNED SVM
compare_train_test(search.best_estimator_, X_train, X_test, y_train, y_test)

In [None]:
## UNTUNED SVM
compare_train_test(pipe, X_train, X_test, y_train, y_test)



In [None]:
## TUNED LIGHTGBM
compare_train_test(model, X_train, X_test, y_train, y_test)
