In [33]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

%matplotlib inline 
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None



# read the data
df = pd.read_csv('dataset.csv')

In [34]:
#drop irrelevant columns
cols_to_drop=['RANDID','educ','TIMECVD','TIME','PERIOD','HDLC','LDLC','ANGINA','HOSPMI', 'MI_FCHD', 'ANYCHD','STROKE','HYPERTEN', 'DEATH','TIMEAP','TIMEMI', 'TIMEMIFC','TIMECHD', 'TIMESTRK', 'TIMEHYP','TIMEDTH']
df=df.drop(cols_to_drop, axis=1) #axis=1 because we are dropping columns, not rows

In [35]:
#not many so we can drop them all
df=df.dropna()#drop the observations that contain missing values
df.isnull().sum().sum() #print number of rows with missing values after dropping the rows

0

In [36]:
def outlier_treatment(datacolumn):
 sorted(datacolumn)
 Q1,Q3 = np.percentile(datacolumn , [25,75])
 IQR = Q3-Q1
 lower_range = Q1-(1.5 * IQR)
 upper_range = Q3 + (1.5 * IQR)
 return lower_range,upper_range

In [37]:
lowerbound,upperbound = outlier_treatment(df.TOTCHOL)
df.drop(df[ (df.TOTCHOL > upperbound) | (df.TOTCHOL < lowerbound) ].index , inplace=True)

In [38]:
lowerbound,upperbound = outlier_treatment(df.SYSBP)
df.drop(df[ (df.SYSBP > upperbound) | (df.SYSBP < lowerbound) ].index , inplace=True)

In [39]:
lowerbound,upperbound = outlier_treatment(df.DIABP)
df.drop(df[ (df.DIABP > upperbound) | (df.DIABP < lowerbound) ].index , inplace=True)

In [40]:
lowerbound,upperbound = outlier_treatment(df.CIGPDAY)
df.drop(df[ (df.CIGPDAY > upperbound) | (df.CIGPDAY < lowerbound) ].index , inplace=True)

In [41]:
lowerbound,upperbound = outlier_treatment(df.BMI)
df.drop(df[ (df.BMI > upperbound) | (df.BMI < lowerbound) ].index , inplace=True)

In [42]:
lowerbound,upperbound = outlier_treatment(df.HEARTRTE)
df.drop(df[ (df.HEARTRTE > upperbound) | (df.HEARTRTE < lowerbound) ].index , inplace=True)

In [43]:
lowerbound,upperbound = outlier_treatment(df.GLUCOSE)
df.drop(df[ (df.GLUCOSE > upperbound) | (df.GLUCOSE < lowerbound) ].index , inplace=True)

In [44]:
df.shape

(8259, 18)

In [45]:
#drop irrelevant columns
drop_columns=['CURSMOKE', 'BMI', 'HEARTRTE', 'GLUCOSE']
df=df.drop(drop_columns, axis=1) #axis=1 because we are dropping columns, not rows

In [93]:
from sklearn.model_selection import train_test_split

X = df.drop('CVD', axis=1)
y = df['CVD']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [47]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [48]:
X_train.iloc[:,1:5] = sc.fit_transform(X_train.iloc[:,1:5])

In [49]:
X_test.iloc[:, 1:5] = sc.transform(X_test.iloc[:, 1:5])

In [50]:
X_train.head()

Unnamed: 0,SEX,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,DIABETES,BPMEDS,PREVCHD,PREVAP,PREVMI,PREVSTRK,PREVHYP
10791,2,0.500643,-0.720942,-0.327366,-0.081681,30.0,0,0.0,0,0,0,0,0
10146,1,0.018578,-0.193838,-0.508073,-0.850501,20.0,0,0.0,0,0,0,0,0
11619,2,0.717573,-0.193838,0.034047,0.110524,0.0,0,0.0,0,0,0,0,0
2146,1,0.597056,1.071209,0.059862,0.014422,50.0,0,0.0,0,0,0,0,0
5838,2,-0.608108,-0.5101,-1.230898,-1.042706,20.0,0,0.0,0,0,0,0,0


In [51]:
X_test.head()

Unnamed: 0,SEX,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,DIABETES,BPMEDS,PREVCHD,PREVAP,PREVMI,PREVSTRK,PREVHYP
8306,1,-1.620446,-0.088418,-0.095029,-0.081681,0.0,0,0.0,1,1,0,0,0
7291,1,-1.644549,1.703733,-1.643941,-1.042706,0.0,1,0.0,1,1,0,0,0
417,2,1.296052,0.122423,0.292198,0.062473,0.0,0,0.0,0,0,0,0,1
4028,1,-1.451723,0.754947,-1.076007,-1.475167,0.0,0,0.0,0,0,0,0,0
3391,2,1.030915,0.122423,2.53812,0.110524,20.0,0,0.0,0,0,0,0,1


In [52]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)

In [53]:
x_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [54]:
print(f'''Shape of X train before SMOTE: {X_train.shape}
Shape of X train after SMOTE: {x_train_res.shape}''')

print('\nBalance of positive and negative classes (%):')
y_train_res.value_counts(normalize=True) * 100

Shape of X train before SMOTE: (6194, 13)
Shape of X train after SMOTE: (9500, 13)

Balance of positive and negative classes (%):


0    50.0
1    50.0
Name: CVD, dtype: float64

In [55]:
y_train.value_counts()

0    4750
1    1444
Name: CVD, dtype: int64

In [56]:
y_train_res.value_counts()

0    4750
1    4750
Name: CVD, dtype: int64

## Decision Tree

Before SMOTE

In [57]:
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree= DecisionTreeClassifier(criterion= "gini",random_state=10)

#fit the model on the data and predict the values

model_DecisionTree.fit(X_train,y_train)

y_pred= model_DecisionTree.predict(X_test)

print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1623
           1       0.39      0.43      0.40       442

    accuracy                           0.73      2065
   macro avg       0.61      0.62      0.62      2065
weighted avg       0.74      0.73      0.74      2065



After smote

In [58]:
from sklearn.tree import DecisionTreeClassifier

model_DecisionTree= DecisionTreeClassifier(criterion= "gini",random_state=10)

#fit the model on the data and predict the values

model_DecisionTree.fit(x_train_res,y_train_res)

y_pred= model_DecisionTree.predict(X_test)

print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.85      0.75      0.80      1623
           1       0.35      0.50      0.41       442

    accuracy                           0.70      2065
   macro avg       0.60      0.63      0.60      2065
weighted avg       0.74      0.70      0.71      2065



## KNN

In [59]:
k_range = list(range(1, 15))
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_res, y_train_res)
    y_pred = knn.predict(X_test)
    acc_pred = accuracy_score(y_test, y_pred)
    scores.append(acc_pred)
    
knn = KNeighborsClassifier(n_neighbors=np.argmax(scores, axis=0)+1)
knn.fit(x_train_res, y_train_res)
y_pred = knn.predict(X_test)

print("Classification Report:",)
print (classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.84      0.84      1623
           1       0.39      0.37      0.38       442

    accuracy                           0.74      2065
   macro avg       0.61      0.61      0.61      2065
weighted avg       0.74      0.74      0.74      2065



## Random Forest using GridSearchCV

In [60]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(random_state=42)

In [73]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

from sklearn.model_selection import GridSearchCV
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [90]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'auto',
 'n_estimators': 500}

In [66]:
rfc1=RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 500, max_depth=6, criterion='gini')
rfc1.fit(x_train_res, y_train_res)
pred=rfc1.predict(X_test)

In [68]:
print("Classification Report")
print(classification_report(y_test, pred))

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.71      0.79      1623
           1       0.38      0.64      0.48       442

    accuracy                           0.70      2065
   macro avg       0.63      0.68      0.63      2065
weighted avg       0.77      0.70      0.72      2065



## XGBoost

In [64]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(x_train_res, y_train_res)

y_pred = model.predict(X_test)

print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1623
           1       0.47      0.31      0.37       442

    accuracy                           0.78      2065
   macro avg       0.65      0.61      0.62      2065
weighted avg       0.75      0.78      0.76      2065



## ADA Boost (Adaptive Boost)

In [65]:
# Load libraries
from sklearn.ensemble import AdaBoostClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Create adaboost classifer object
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(x_train_res, y_train_res)

#Predict the response for test dataset
y_pred = model.predict(X_test)

print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.87      0.73      0.79      1623
           1       0.38      0.61      0.47       442

    accuracy                           0.70      2065
   macro avg       0.62      0.67      0.63      2065
weighted avg       0.77      0.70      0.72      2065



## SVM

In [70]:
# SVM
from sklearn import svm
sv = svm.SVC(kernel='linear')
sv.fit(x_train_res, y_train_res)

y_pred = sv.predict(X_test)

print("Classification Report")
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.66      0.76      1623
           1       0.35      0.68      0.47       442

    accuracy                           0.67      2065
   macro avg       0.62      0.67      0.61      2065
weighted avg       0.77      0.67      0.69      2065



# Let's try CV

In [85]:
from sklearn.model_selection import cross_val_score

In [105]:
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=3)
#train model with cv of 5 
cv_scores = cross_val_score(knn_cv, X, y, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.72639225 0.72397094 0.7566586  0.7251816  0.72881356 0.72397094
 0.71549637 0.72276029 0.70702179 0.73454545]
cv_scores mean:0.7264811798371121


In [98]:
from imblearn.pipeline import Pipeline, make_pipeline

In [104]:
imba_pipeline = make_pipeline(SMOTE(random_state=42), 
                              RandomForestClassifier(n_estimators=500, random_state=42))
#create a new KNN model
model = imba_pipeline
#train model with cv of 5 
cv_scores = cross_val_score(model, X, y, cv=10)
#print each cv score (accuracy) and average them
print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.73486683 0.7433414  0.75544794 0.72760291 0.71791768 0.7433414
 0.76271186 0.73123487 0.74213075 0.73939394]
cv_scores mean:0.7397989581040428
