In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import probplot

In [None]:
# Heart-failure clinical records dataset (Kaggle: andrewmvd/heart-failure-clinical-data).
#   download link: https://www.kaggle.com/andrewmvd/heart-failure-clinical-data/download
#   API command:   kaggle datasets download -d andrewmvd/heart-failure-clinical-data
csv_path = "../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()
# all columns are numeric features
# no null values are reported

In [None]:
# Let's check the summary statistics for suspicious values that could be disguised nulls
# (e.g. implausible minimum / maximum values).
df.describe()

In [None]:
# sns.pairplot(df)

In [None]:
# For every column: print how many distinct values it has and draw a bar chart
# of the value frequencies (x tick labels rotated so they stay readable).
for column in df.columns:
    counts = df[column].value_counts()
    print("graph for %s: total = %d" % (column, len(counts)))
    ax = sns.barplot(x=counts.index, y=counts)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()

In [None]:
# Pairwise correlation of all columns, annotated with the coefficient values.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(df.corr(), annot=True, ax=ax);

In [None]:
# Box plot of age to look for outliers.
df.boxplot(['age']);

In [None]:
# Box plot of creatinine_phosphokinase.
df.boxplot(['creatinine_phosphokinase']); # many outliers -> candidate for a log transform

In [None]:
# Box plot of ejection_fraction to look for outliers.
df.boxplot(['ejection_fraction'])

In [None]:
# Box plot of platelets.
df.boxplot(['platelets'])   # outliers present

In [None]:
# Box plot of serum_creatinine.
df.boxplot(['serum_creatinine'])   # outliers present

In [None]:
# Box plot of serum_sodium to look for outliers.
df.boxplot(['serum_sodium'])

In [None]:
# Box plot of time to look for outliers.
df.boxplot(['time'])

In [None]:
# Summary statistics again, for reference next to the box plots above.
df.describe()

In [None]:
# Probability (Q-Q) plot of age against scipy's default reference distribution.
probplot(df['age'],plot=plt);

In [None]:
# Probability (Q-Q) plot of platelets.
probplot(df['platelets'],plot=plt);

In [None]:
# Probability (Q-Q) plot of the raw creatinine_phosphokinase values.
probplot(df['creatinine_phosphokinase'],plot=plt);

In [None]:
# Distribution of raw creatinine_phosphokinase (density histogram + KDE).
# sns.distplot is deprecated; this is the histplot equivalent given in seaborn's migration notes.
sns.histplot(df['creatinine_phosphokinase'], kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Q-Q plot after a natural-log transform, to see whether the log makes the column closer to normal.
probplot(np.log(df['creatinine_phosphokinase']),plot=plt);

In [None]:
# Distribution of log(creatinine_phosphokinase) (density histogram + KDE).
# sns.distplot is deprecated; this is the histplot equivalent given in seaborn's migration notes.
sns.histplot(np.log(df['creatinine_phosphokinase']), kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Replace creatinine_phosphokinase with its natural log (the raw values in df are overwritten).
# NOTE: this cell is not idempotent -- running it again applies the log a second time.
df['creatinine_phosphokinase'] = np.log(df['creatinine_phosphokinase'])

In [None]:
# Probability (Q-Q) plot of ejection_fraction.
probplot(df['ejection_fraction'],plot=plt);

In [None]:
# Probability (Q-Q) plot of the raw serum_creatinine values.
probplot(df['serum_creatinine'],plot=plt);

In [None]:
# Distribution of raw serum_creatinine (density histogram + KDE).
# sns.distplot is deprecated; this is the histplot equivalent given in seaborn's migration notes.
sns.histplot(df['serum_creatinine'], kde=True, stat="density", kde_kws=dict(cut=3));

In [None]:
# Q-Q plot of log(serum_creatinine); the column in df itself is left unchanged here.
probplot(np.log(df['serum_creatinine']),plot=plt);

In [None]:
# Distribution of log(serum_creatinine) (density histogram + KDE).
# sns.distplot is deprecated; this is the histplot equivalent given in seaborn's migration notes.
sns.histplot(np.log(df['serum_creatinine']), kde=True, stat="density", kde_kws=dict(cut=3));

### creatinine_phosphokinase: In summary, renal injury with high serum CPK values becomes a true concern when levels of CPK reach 5,000 IU/L and the patient has serious co-morbid disease such as volume depletion, sepsis or acidosis. Otherwise, values of up to 20,000 IU/L may be tolerated without untoward event.



In [None]:
# Locate the row whose raw creatinine_phosphokinase was 7861 IU/L.
# The column was replaced by its natural log in an earlier cell, so the raw value
# must be compared on the log scale; np.isclose avoids exact float equality.
df[np.isclose(df['creatinine_phosphokinase'], np.log(7861.0))]

In [None]:
# Locate the row with the extreme platelets value (850000).
df.loc[df['platelets'] == 850000.0]

### I want to try logistic regression, so I wanted to drop these outlier rows... but we already have few samples, so the plan was to just scale the features and build a model (note: the next cell still drops rows 1 and 109)

In [None]:
# Remove the two rows (index labels 1 and 109) inspected in the cells above.
# Re-assigning instead of inplace=True, together with errors="ignore", keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(index=[1, 109], errors="ignore")

In [None]:
# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Standardise every feature to zero mean / unit variance.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# statistics of the test rows influence the training features.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    # Keep X's row labels: rows were dropped earlier, so a fresh 0..n-1 index
    # would no longer line up with y in any label-based operation.
    index=X.index,
)


In [None]:
# Q-Q plot of the standardised serum_creatinine column.
probplot(X_scaled['serum_creatinine'],plot=plt);

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# min_max = MinMaxScaler()
# s_c_scaled = min_max.fit_transform(X_scaled[['serum_creatinine']])
# s_c , _ = boxcox(s_c_scaled.T[0])
# probplot(s_c,plot=plt);

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.3,random_state=3)

In [None]:
# Baseline 1: logistic regression with default hyper-parameters.
log_clf = LogisticRegression().fit(X_train, y_train)
log_pred = log_clf.predict(X_test)  # predictions on the held-out rows
log_clf.score(X_test, y_test)       # mean accuracy on the held-out rows

In [None]:
# Baseline 2: decision tree with default hyper-parameters; the cell shows its test accuracy.
dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
dt_clf.score(X_test, y_test)

In [None]:
# Baseline 3: k-nearest neighbours with default hyper-parameters; the cell shows its test accuracy.
kn_clf = KNeighborsClassifier().fit(X_train, y_train)
kn_clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the logistic-regression predictions.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the actual
# classes and columns the predicted ones. Passing the arguments the other way
# round yields the transposed matrix (false positives and false negatives swapped).
confusion_matrix(y_test, log_pred)

### Let's work on hyperparameter tuning of logistic regression

In [None]:
# Ridge classifier with default hyper-parameters, as another linear model to compare.
# It gets its own variable names so the fitted LogisticRegression (log_clf) and its
# predictions (log_pred) are not silently replaced by a different kind of estimator.
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train, y_train)
ridge_pred = ridge_clf.predict(X_test)
ridge_clf.score(X_test, y_test)

In [None]:

# Random forest with default hyper-parameters; the cell shows its test accuracy.
rf_clf = RandomForestClassifier().fit(X_train, y_train)
rf_clf.score(X_test, y_test)

### random forest grid search

In [None]:
# n_estimators = [10, 100, 1000]
# max_features = ['sqrt', 'log2']
# # define grid search
# grid = dict(n_estimators=n_estimators,max_features=max_features)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=rf_clf, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# grid_result = grid_search.fit(X, y)

In [None]:
# grid_result.best_estimator_

In [None]:
# Random forest with explicit settings (both values appear in the commented-out
# grid above; presumably the grid-search winner -- TODO confirm).
rf_clf = RandomForestClassifier(max_features='log2', n_estimators=1000).fit(X_train, y_train)
rf_clf.score(X_test, y_test)

### Using KNN predictions to split the data into groups (a clustering-style approach)

In [None]:
# Fit KNN on every row (no hold-out) -- its predictions are used below to group the data.
kn_clf = KNeighborsClassifier().fit(X_scaled, y)
kn_clf

In [None]:
# Predicted label for every row (same rows the classifier was fitted on).
kn_clf.predict(X_scaled)

In [None]:
# Number of distinct class labels the classifier learned.
len(kn_clf.classes_)

In [None]:
# Store the KNN-predicted label as a new column of df.
# NOTE: from here on the last column of df is 'knn_clf', not the target, so
# positional slicing such as df.iloc[:, -1] no longer selects DEATH_EVENT.
df['knn_clf'] = kn_clf.predict(X_scaled)

In [None]:
# Rows the KNN model labels 1 (df1) and 0 (df2).
# .iloc[:, :-1] strips the trailing 'knn_clf' helper column again.
df1 = df[df['knn_clf'] == 1].iloc[:,:-1]
df2 = df[df['knn_clf'] == 0].iloc[:,:-1]

In [None]:
# Build one modelling dataset per KNN group.
# df1 / df2 hold the rows of each group with the 'knn_clf' helper column already
# removed, so their last column is the real target. Slicing the full `df` here
# instead would make both datasets identical, put the target among the features
# and use the KNN label as the thing being predicted.

# Group 1: features / target, then standardise.
X1 = df1.iloc[:, :-1]
y1 = df1.iloc[:, -1]

scaler = StandardScaler()
X_scaled1 = pd.DataFrame(scaler.fit_transform(X1), columns=X1.columns, index=X1.index)

# Group 2: same steps with a freshly fitted scaler.
X2 = df2.iloc[:, :-1]
y2 = df2.iloc[:, -1]

scaler = StandardScaler()
X_scaled2 = pd.DataFrame(scaler.fit_transform(X2), columns=X2.columns, index=X2.index)

# 70/30 split inside each group, with the same seed as the main split.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_scaled1, y1, test_size=0.3, random_state=3)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_scaled2, y2, test_size=0.3, random_state=3)

In [None]:
def try_models(Xtr,Xts,ytr,yts):
    """Fit a fixed set of baseline classifiers and print their test metrics.

    For LogisticRegression, DecisionTreeClassifier, RandomForestClassifier and
    KNeighborsClassifier(n_neighbors=2), in that order, this prints the
    estimator, its accuracy on the test split and its confusion matrix.

    Parameters
    ----------
    Xtr, Xts : array-like
        Training and test feature matrices.
    ytr, yts : array-like
        Training and test labels.

    Returns
    -------
    None
        Results are only printed.
    """
    candidates = [
        LogisticRegression(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        KNeighborsClassifier(n_neighbors=2),
    ]
    for model in candidates:
        print(model)
        model.fit(Xtr, ytr)
        print(model.score(Xts, yts))
        y_pred = model.predict(Xts)
        # confusion_matrix expects (y_true, y_pred); the reversed order would
        # print the transposed matrix.
        print(confusion_matrix(yts, y_pred))
        print()

In [None]:
# Run the baseline models on the first split (X_train1 / X_test1) ...
print("class 1: ***********************************")
try_models(X_train1,X_test1,y_train1,y_test1)
# ... and on the second split (X_train2 / X_test2).
print("class 2: ***********************************")
try_models(X_train2,X_test2,y_train2,y_test2)

In [None]:
# Fit one random forest per split and pool the test predictions, so a single
# accuracy / confusion matrix describes the combined two-model system.
y_tests = []
y_preds = []
splits = [
    (X_train1, X_test1, y_train1, y_test1),
    (X_train2, X_test2, y_train2, y_test2),
]
for X_tr, X_te, y_tr, y_te in splits:
    model = RandomForestClassifier(max_features='log2', n_estimators=1000)
    model.fit(X_tr, y_tr)
    y_preds.extend(model.predict(X_te))
    y_tests.extend(y_te)

# scikit-learn metrics take (y_true, y_pred); for confusion_matrix the order
# decides which axis holds the actual classes.
print(accuracy_score(y_tests, y_preds))
print(confusion_matrix(y_tests, y_preds))

### KNN seems to perform well, with an accuracy of 86%

### Although we've got 4 false negatives, we have gotten more false positives...

In [None]:
# Class balance of the target column.
df.DEATH_EVENT.value_counts()

### 1 means the person died and 0 means the person survived... we have fewer deaths, but our classification of deaths is still better than our classification of survivors...

### Let's try to balance our dataset and see what happens...

In [None]:
!pip install imblearn

### Treat imbalanced dataset

In [None]:
# Rebalance the classes with SMOTETomek (seeded for repeatability).
# X / y are the unscaled features and target defined in the earlier feature/target cell.
# NOTE(review): resampling happens before any train/test split, so resampled rows
# can end up on both sides of later splits.
smt = SMOTETomek(random_state=42)
X_res,y_res = smt.fit_resample(X,y)

In [None]:
# Class counts of the target after resampling.
pd.DataFrame(y_res)['DEATH_EVENT'].value_counts()

In [None]:
def clustering_approach(X,y, models,type = "none"):
    """Group rows by KNN-predicted label and evaluate every model inside each group.

    A KNeighborsClassifier is fitted on (X, y) and used to predict a label for
    every row of X. The rows are grouped by that predicted label; each group is
    standardised, split 70/30 into train/test (random_state=3), and a fresh copy
    of every estimator in `models` is fitted on it. The estimator, its test
    accuracy and its confusion matrix are printed for each (group, model) pair.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Target labels, row-aligned with X.
    models : iterable of estimators
        scikit-learn compatible classifiers. They are used as templates only:
        each group gets its own clone, so the objects passed in are not fitted.
    type : str, default "none"
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    list
        [X_train, X_test, y_train, y_test, knn_clf, models_out]. The first four
        are dicts keyed by group label (features are scaled numpy arrays),
        knn_clf is the fitted grouping classifier, and models_out maps
        str(model) + str(label) to the estimator fitted on that group.
    """
    # Local import: the name is only needed here and the package is already a
    # dependency of this notebook.
    from sklearn.base import clone

    X_train, X_test, y_train, y_test = {}, {}, {}, {}
    models_out = {}

    # Fit the grouping classifier and label every row with its prediction.
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(X, y)
    labelled = pd.concat([X, y], axis=1)  # keep y next to X so each group can be split again
    labelled['knn_clf'] = knn_clf.predict(X)

    for cls in knn_clf.classes_:
        print("--------------The {} cluster's results-------------------".format(cls),end="\n\n")
        # Rows of this group, without the helper 'knn_clf' column.
        group = labelled[labelled['knn_clf'] == cls].iloc[:, :-1]
        X_cls = group.iloc[:, :-1]
        y_cls = group.iloc[:, -1]

        # NOTE(review): the scaler is fitted on the whole group before the split
        # and is not returned, so new data cannot be transformed identically.
        X_scaled = StandardScaler().fit_transform(X_cls)

        X_train[cls], X_test[cls], y_train[cls], y_test[cls] = train_test_split(
            X_scaled, y_cls, test_size=0.3, random_state=3
        )

        for template in models:
            # Fit an independent copy per group. Re-fitting the same instance
            # would leave every models_out entry for that estimator pointing at
            # one object trained on the last group only.
            model = clone(template)
            model.fit(X_train[cls], y_train[cls])
            y_pred = model.predict(X_test[cls])
            print(model)
            print(model.score(X_test[cls], y_test[cls]))
            # confusion_matrix expects (y_true, y_pred).
            print(confusion_matrix(y_test[cls], y_pred), end="\n\n")
            models_out[str(model) + str(cls)] = model

    return [X_train, X_test, y_train, y_test, knn_clf, models_out]
        


In [None]:
!pip install lightgbm

In [None]:
# Mutual information between every resampled feature and the target, plotted
# from most to least informative.
# random_state fixes the small random noise the estimator adds internally, so
# the ranking does not change from run to run.
mutual_info_vals = mutual_info_classif(X_res, y_res, random_state=42)
mutual_val_df = pd.DataFrame({"vals": mutual_info_vals}, index=X_res.columns)
plt.figure(figsize=(10,5))
mutual_val_df.vals.sort_values(ascending=False).plot(kind='bar');


In [None]:
# Preview the seven features that are passed to the per-group models below.
X_res[['time','serum_creatinine','ejection_fraction','platelets','age','serum_sodium','creatinine_phosphokinase']].head()

In [None]:

# Candidate estimators to evaluate inside each KNN group.
candidate_models = [
    LogisticRegression(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
    XGBClassifier(verbosity = 0),
    LGBMClassifier(),
    SVC(),
]
# Feature subset used for the per-group models.
selected_features = ['time','serum_creatinine','ejection_fraction','platelets','age','serum_sodium','creatinine_phosphokinase']

# NOTE: this rebinds X_train / X_test / y_train / y_test to dicts keyed by group
# label, and `models` to a dict of fitted estimators.
X_train, X_test, y_train, y_test, clusterer, models = clustering_approach(
    X_res[selected_features], y_res, candidate_models
)


### It looks like random forest has worked well for both clusters

### Let's find the best params for our models...

In [None]:
# n_estimators = [10, 100, 1000,1500]
# max_features = np.arange(1,20)
# # define grid search
# grid = dict(n_estimators=n_estimators,max_features=max_features)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# grid_result = grid_search.fit(X_res, y_res)


In [None]:
# grid_result.best_estimator_

### Let's find the total accuracy and performance now!

In [None]:
# grid_result = grid_search.fit(X_train[1], y_train[1])
# grid_result.best_estimator_

In [None]:
# X_train is the dict returned by clustering_approach (one entry per group label),
# so this is the number of groups.
len(X_train)

In [None]:


# Final evaluation: one chosen estimator per group, with the test predictions of
# both groups pooled into a single accuracy / confusion matrix.
y_tests = []
y_preds = []
per_group_models = {
    0: XGBClassifier(verbosity = 0),
    1: RandomForestClassifier(max_features='log2', n_estimators=1000),
}
for i, model in per_group_models.items():
    model.fit(X_train[i], y_train[i])
    y_preds.extend(model.predict(X_test[i]))
    y_tests.extend(y_test[i])

# scikit-learn metrics take (y_true, y_pred); for confusion_matrix the order
# decides which axis holds the actual classes.
print(accuracy_score(y_tests, y_preds))
print(confusion_matrix(y_tests, y_preds))

In [None]:
# Random forest stored by clustering_approach under the key for group 0.
models['RandomForestClassifier()0']

In [None]:
# Random forest stored by clustering_approach under the key for group 1.
models['RandomForestClassifier()1']

In [None]:
# The fitted KNN classifier that clustering_approach used to assign rows to groups.
clusterer

In [None]:
# import joblib

In [None]:
# model = clusterer
# filename = "clusterer" 
# joblib.dump(model, filename)

# model = models['RandomForestClassifier()0']
# filename = "RF_model1"
# joblib.dump(model, filename)

# model = models['RandomForestClassifier()1']
# filename = "RF_model2"
# joblib.dump(model, filename)