# C._(rm_null) 3_choose_learning_machine_models

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the cleaned CSV into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='../data/cleaned_data/rm_null_cleaned_data.csv')
data.head(5)


Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,gender_F,gender_M,work_type_Govt_job,work_type_Other,work_type_Private,work_type_Self-employed,residence_type_Rural,residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.839182,0.0,1.0,1.0,0.95477,0.700525,0,1,0,0,1,0,0,1,0,1,0,0,1
1,0.978813,0.0,1.0,1.0,0.612224,0.630121,0,1,0,0,1,0,1,0,0,0,1,0,1
2,0.639247,0.0,0.0,1.0,0.857404,0.663675,1,0,0,0,1,0,0,1,0,0,0,1,1
3,0.968192,1.0,0.0,1.0,0.863892,0.454702,1,0,0,0,0,1,1,0,0,0,1,0,1
4,0.989416,0.0,0.0,1.0,0.888781,0.563472,0,1,0,0,1,0,0,1,0,1,0,0,1


### x/y split

In [4]:
# Features: every column except the 'stroke' target; labels: the 'stroke' column
X = data.drop(columns=['stroke'])
y = data['stroke']

In [5]:
# Peek at the first label to check its value and dtype
y.head(1)

0    1
Name: stroke, dtype: int64

In [6]:
# Make sure the labels are stored as 64-bit integers, then show the resulting dtype
y = y.astype(np.int64)
y.dtype

dtype('int64')

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# Class frequencies of the training labels, followed by those of the test labels
display(y_train.value_counts(), y_test.value_counts())

stroke
0    3747
1     169
Name: count, dtype: int64

stroke
0    939
1     40
Name: count, dtype: int64

##### As mentioned earlier, the dataset is clearly imbalanced, so it needs either random oversampling or SMOTE.
##### Random oversampling can lead to overfitting, whereas SMOTE can help reduce overfitting because it increases the diversity of the dataset. In this case, SMOTE was chosen.
reference: https://www.quora.com/Whats-the-difference-between-random-oversampling-and-oversampling-using-SMOTE-on-an-imbalanced-dataset-Is-the-later-always-better-then-the-first 

In [9]:
from imblearn.over_sampling import SMOTE

# Seed SMOTE so the synthetic minority samples (and everything derived from
# them, including the CSV files saved further down) are identical on every run.
smote = SMOTE(random_state=123)
# Fit and apply SMOTE to generate synthetic samples for the minority class.
# Only the training split is resampled; X_test / y_test are left untouched.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [10]:
# (rows, columns) of the SMOTE-resampled training features
X_resampled.shape

(7494, 18)

In [11]:
# Count the rows per class label in the resampled training labels
class_counts = y_resampled.value_counts()
print(class_counts)

stroke
0    3747
1    3747
Name: count, dtype: int64


In [12]:
# For K-NN, build a copy of the resampled labels in which every value is a string,
# then show the dtype of that copy
y_resampled_obj = y_resampled.map(str)
y_resampled_obj.dtype

dtype('O')

In [13]:
# save train and test sets

# Each (object, destination) pair is written as CSV without the index column
for _frame, _path in (
    (X_resampled, '../data/x_y_data/rm_null/X_resampled.csv'),
    (y_resampled, '../data/x_y_data/rm_null/y_resampled.csv'),
    (X_test, '../data/x_y_data/rm_null/X_test.csv'),
    (y_test, '../data/x_y_data/rm_null/y_test.csv'),
):
    _frame.to_csv(_path, index=False)

### Choosing models

In [None]:
# (rows, columns) of the resampled training features used for model selection
X_resampled.shape

In [None]:
# Shape of the resampled training labels
y_resampled.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

# Candidate classifiers as (short label, estimator) pairs. Every estimator that
# is given a random_state here uses the same fixed seed.
models = [
    ('LR', LogisticRegression(random_state=12345)),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier(random_state=12345)),
    ('GB', GradientBoostingClassifier(random_state=12345)),
    ('AB', AdaBoostClassifier(random_state=12345)),
]

In [None]:
# Evaluate each candidate model with 10-fold cross-validation, scored by accuracy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
# Suppresses all Python warnings from this point on
warnings.filterwarnings('ignore')

# Hand cross_val_score contiguous NumPy arrays instead of the pandas objects
X_resampled2 = np.ascontiguousarray(X_resampled)
y_resampled2 = np.ascontiguousarray(y_resampled)

# SMOTE's fit_resample returns the original rows followed by the synthetic
# minority rows, so unshuffled consecutive folds have very different class
# mixes (the trailing folds are almost entirely class 1). Shuffled, stratified
# folds keep both classes in every fold, and the fixed seed keeps the scores
# reproducible. The splitter does not depend on the model, so it is built once.
# NOTE(review): the data was oversampled before being split into folds, so
# synthetic points derived from validation rows can end up in the training
# folds; consider resampling inside each fold instead - TODO confirm.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

names = []
results = []

for name, model in models:
    # One accuracy value per fold for this model
    cv_results = cross_val_score(model, X_resampled2, y_resampled2, cv=kfold, scoring="accuracy")
    results.append(cv_results)
    names.append(name)
    # Report mean and standard deviation across the folds
    msg = f"{name}: {cv_results.mean()} ({cv_results.std()})"
    print(msg)


In [None]:
# Box plot of the per-fold accuracy scores, one box per model
fig, ax = plt.subplots(figsize=(15, 10))
# Name the metric in the title so this figure can be told apart from the
# comparison plots of the other metrics
fig.suptitle('Algorithm Comparison -Accuracy-')
ax.boxplot(results)
# Label each box with the short name of its model
ax.set_xticklabels(names)
plt.show()

#### KNN and Random Forest are chosen. In addition, a neural network is investigated.

In [None]:
# Short labels of the evaluated models, in the order their scores were appended
names

In [None]:
# Per-model arrays of cross-validation accuracy scores (same order as `names`)
results

In [None]:
# Pair every model label with its array of cross-validation accuracy scores
results_dict = dict(zip(names, results))

# Build a table with one column per model
score = pd.DataFrame(data=results_dict)

# Render the table
display(score)

In [None]:
# Write the accuracy score table to CSV, leaving out the row index
score.to_csv(path_or_buf='../data/score/scores_5_models.csv', index=False)

#### recall

In [None]:
# Evaluate each candidate model with 10-fold cross-validation, scored by recall
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
# Suppresses all Python warnings from this point on
warnings.filterwarnings('ignore')

# Hand cross_val_score contiguous NumPy arrays instead of the pandas objects
X_resampled2 = np.ascontiguousarray(X_resampled)
y_resampled2 = np.ascontiguousarray(y_resampled)

# SMOTE's fit_resample returns the original rows followed by the synthetic
# minority rows, so unshuffled consecutive folds have very different class
# mixes (the trailing folds are almost entirely class 1), which makes the
# per-fold recall values incomparable. Shuffled, stratified folds keep both
# classes in every fold, and the fixed seed keeps the scores reproducible.
# The splitter does not depend on the model, so it is built once.
# NOTE(review): the data was oversampled before being split into folds, so
# synthetic points derived from validation rows can end up in the training
# folds; consider resampling inside each fold instead - TODO confirm.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

names2 = []
results2 = []

for name, model in models:
    # One recall value per fold for this model
    cv_results2 = cross_val_score(model, X_resampled2, y_resampled2, cv=kfold, scoring="recall")
    results2.append(cv_results2)
    names2.append(name)
    # Report mean and standard deviation across the folds
    msg2 = f"{name}: {cv_results2.mean()} ({cv_results2.std()})"
    print(msg2)


In [None]:
# Box plot of the per-fold recall scores, one box per model
fig, ax = plt.subplots(figsize=(15, 10))
# The title names the metric, matching the precision and f1 comparison plots
fig.suptitle('Algorithm Comparison -Recall-')
ax.boxplot(results2)
# Label each box with the short name of its model
ax.set_xticklabels(names2)
plt.show()

In [None]:
# Pair every model label with its array of cross-validation recall scores
results_dict2 = dict(zip(names2, results2))

# Build a table with one column per model
score2 = pd.DataFrame(data=results_dict2)

# Render the table
display(score2)

In [None]:
# Write the recall score table to CSV, leaving out the row index
score2.to_csv(path_or_buf='../data/score/scores_5_models_recall.csv', index=False)

### Precision

In [None]:
# Evaluate each candidate model with 10-fold cross-validation, scored by precision
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
# Suppresses all Python warnings from this point on
warnings.filterwarnings('ignore')

# Hand cross_val_score contiguous NumPy arrays instead of the pandas objects
X_resampled3 = np.ascontiguousarray(X_resampled)
y_resampled3 = np.ascontiguousarray(y_resampled)

# SMOTE's fit_resample returns the original rows followed by the synthetic
# minority rows, so unshuffled consecutive folds have very different class
# mixes (the trailing folds are almost entirely class 1), which makes the
# per-fold precision values incomparable. Shuffled, stratified folds keep both
# classes in every fold, and the fixed seed keeps the scores reproducible.
# The splitter does not depend on the model, so it is built once.
# NOTE(review): the data was oversampled before being split into folds, so
# synthetic points derived from validation rows can end up in the training
# folds; consider resampling inside each fold instead - TODO confirm.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

names3 = []
results3 = []

for name, model in models:
    # One precision value per fold for this model
    cv_results3 = cross_val_score(model, X_resampled3, y_resampled3, cv=kfold, scoring="precision")
    results3.append(cv_results3)
    names3.append(name)
    # Report mean and standard deviation across the folds
    msg3 = f"{name}: {cv_results3.mean()} ({cv_results3.std()})"
    print(msg3)


In [None]:
# Box plot of the cross-validated precision scores, one box per model
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Algorithm Comparison -Precision-')
ax.boxplot(results3)
# Label each box with the short name of its model
ax.set_xticklabels(names3)
plt.show()

In [None]:
# Pair every model label with its array of cross-validation precision scores
results_dict3 = dict(zip(names3, results3))

# Build a table with one column per model
score3 = pd.DataFrame(data=results_dict3)

# Render the table
display(score3)

In [None]:
# Write the precision score table to CSV, leaving out the row index
score3.to_csv(path_or_buf='../data/score/scores_5_models_precision.csv', index=False)

### f1

In [None]:
# Evaluate each candidate model with 10-fold cross-validation, scored by f1
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import warnings
# Suppresses all Python warnings from this point on
warnings.filterwarnings('ignore')

# Hand cross_val_score contiguous NumPy arrays instead of the pandas objects
X_resampled4 = np.ascontiguousarray(X_resampled)
y_resampled4 = np.ascontiguousarray(y_resampled)

# SMOTE's fit_resample returns the original rows followed by the synthetic
# minority rows, so unshuffled consecutive folds have very different class
# mixes (the trailing folds are almost entirely class 1), which makes the
# per-fold f1 values incomparable. Shuffled, stratified folds keep both
# classes in every fold, and the fixed seed keeps the scores reproducible.
# The splitter does not depend on the model, so it is built once.
# NOTE(review): the data was oversampled before being split into folds, so
# synthetic points derived from validation rows can end up in the training
# folds; consider resampling inside each fold instead - TODO confirm.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

names4 = []
results4 = []

for name, model in models:
    # One f1 value per fold for this model
    cv_results4 = cross_val_score(model, X_resampled4, y_resampled4, cv=kfold, scoring="f1")
    results4.append(cv_results4)
    names4.append(name)
    # Report mean and standard deviation across the folds
    msg4 = f"{name}: {cv_results4.mean()} ({cv_results4.std()})"
    print(msg4)


In [None]:
# Box plot of the cross-validated f1 scores, one box per model
fig, ax = plt.subplots(figsize=(15, 10))
fig.suptitle('Algorithm Comparison -f1-')
ax.boxplot(results4)
# Label each box with the short name of its model
ax.set_xticklabels(names4)
plt.show()

In [None]:
# Pair every model label with its array of cross-validation f1 scores
results_dict4 = dict(zip(names4, results4))

# Build a table with one column per model
score4 = pd.DataFrame(data=results_dict4)

# Render the table
display(score4)

In [None]:
# Write the f1 score table to CSV, leaving out the row index
score4.to_csv(path_or_buf='../data/score/scores_5_models_f1.csv', index=False)