# Machine Learning

In [1]:
from sklearn.compose import make_column_transformer
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

In [2]:
stroketrain = pd.read_csv("Datasets/train_stroke_data_cleaned.csv")

In [3]:
stroketrain.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age_normalized,avg_glucose_level_normalized,bmi_normalized
0,Male,3.0,0,0,No,Never_worked,Rural,95.12,18,unknown,0,0.035645,0.169964,0.091954
1,Male,58.0,1,0,Yes,Private,Urban,87.96,39,never smoked,0,0.707031,0.139631,0.333333
2,Female,8.0,0,0,No,Private,Urban,110.89,17,unknown,0,0.09668,0.236772,0.08046
3,Female,70.0,0,0,Yes,Private,Rural,69.04,35,formerly smoked,0,0.853516,0.059479,0.287356
4,Male,14.0,0,0,No,Never_worked,Rural,161.28,19,unknown,0,0.169922,0.450244,0.103448


In [4]:
stroketrain['bmi'].isnull().values.any()

False

In [5]:
traindf = stroketrain[['gender', 'age_normalized', 'hypertension', 'heart_disease', 'ever_married', 'work_type',
              'Residence_type','avg_glucose_level_normalized', 'bmi_normalized', 'smoking_status', 'stroke']]


In [6]:
traindf.head()

Unnamed: 0,gender,age_normalized,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level_normalized,bmi_normalized,smoking_status,stroke
0,Male,0.035645,0,0,No,Never_worked,Rural,0.169964,0.091954,unknown,0
1,Male,0.707031,1,0,Yes,Private,Urban,0.139631,0.333333,never smoked,0
2,Female,0.09668,0,0,No,Private,Urban,0.236772,0.08046,unknown,0
3,Female,0.853516,0,0,Yes,Private,Rural,0.059479,0.287356,formerly smoked,0
4,Male,0.169922,0,0,No,Never_worked,Rural,0.450244,0.103448,unknown,0


In [7]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43389 entries, 0 to 43388
Data columns (total 11 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gender                        43389 non-null  object 
 1   age_normalized                43389 non-null  float64
 2   hypertension                  43389 non-null  int64  
 3   heart_disease                 43389 non-null  int64  
 4   ever_married                  43389 non-null  object 
 5   work_type                     43389 non-null  object 
 6   Residence_type                43389 non-null  object 
 7   avg_glucose_level_normalized  43389 non-null  float64
 8   bmi_normalized                43389 non-null  float64
 9   smoking_status                43389 non-null  object 
 10  stroke                        43389 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 3.6+ MB


### RFC

In [8]:

# Generate our categorical variable list
strokesu = traindf.dtypes[traindf.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
traindf[strokesu].nunique()

gender            2
ever_married      2
work_type         4
Residence_type    2
smoking_status    4
dtype: int64

In [9]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
strokeencode_df = pd.DataFrame(enc.fit_transform(traindf[strokesu]))

# Add the encoded variable names to the DataFrame
strokeencode_df.columns = enc.get_feature_names(strokesu)
strokeencode_df.head()



Unnamed: 0,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,smoking_status_unknown
0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Merge one-hot encoded features and drop the originals
trainerdf = traindf.merge(strokeencode_df,left_index=True, right_index=True)
trainerdf = trainerdf.drop(strokesu,1)
trainerdf.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,age_normalized,hypertension,heart_disease,avg_glucose_level_normalized,bmi_normalized,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,smoking_status_unknown
0,0.035645,0,0,0.169964,0.091954,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.707031,1,0,0.139631,0.333333,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.09668,0,0,0.236772,0.08046,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.853516,0,0,0.059479,0.287356,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.169922,0,0,0.450244,0.103448,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Remove stroke target from features data
y = trainerdf.stroke
X = trainerdf.drop(columns=["stroke"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.982


In [13]:
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
# Calculating the accuracy score.
acc2_score = accuracy_score(y_test, predictions)
acc2_score

0.9817477876106194

In [15]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10649,3
Actual 1,195,1


In [16]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc2_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,10649,3
Actual 1,195,1


Accuracy Score : 0.9817477876106194
Classification Report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10652
           1       0.25      0.01      0.01       196

    accuracy                           0.98     10848
   macro avg       0.62      0.50      0.50     10848
weighted avg       0.97      0.98      0.97     10848



In [17]:
importances = rf_model.feature_importances_
importances

array([0.22477852, 0.02190748, 0.02183687, 0.36761788, 0.17910432,
       0.01569854, 0.01559544, 0.00639861, 0.005696  , 0.0117593 ,
       0.00048912, 0.01599325, 0.01516691, 0.01784932, 0.01777911,
       0.01599116, 0.01742818, 0.01492829, 0.01398173])

In [18]:
sorted(zip(rf_model.feature_importances_, trainerdf.columns), reverse=True)

[(0.36761787943823054, 'avg_glucose_level_normalized'),
 (0.2247785200874255, 'age_normalized'),
 (0.17910431894503773, 'bmi_normalized'),
 (0.021907477769299466, 'hypertension'),
 (0.02183686849069257, 'heart_disease'),
 (0.01784931904883031, 'work_type_Self-employed'),
 (0.017779112389394777, 'Residence_type_Rural'),
 (0.017428178208919468, 'smoking_status_formerly smoked'),
 (0.015993245078482477, 'work_type_Never_worked'),
 (0.01599116059633068, 'Residence_type_Urban'),
 (0.015698538534765622, 'stroke'),
 (0.01559543517172358, 'gender_Female'),
 (0.015166905254564194, 'work_type_Private'),
 (0.014928285251477092, 'smoking_status_never smoked'),
 (0.01398172801254052, 'smoking_status_smokes'),
 (0.011759301064796472, 'ever_married_Yes'),
 (0.006398608378145819, 'gender_Male'),
 (0.0056959977161556066, 'ever_married_No'),
 (0.0004891205631875802, 'work_type_Govt_job')]

In [19]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
339/339 - 0s - loss: 0.0797 - accuracy: 0.9819 - 347ms/epoch - 1ms/step
Loss: 0.0797167494893074, Accuracy: 0.9819321632385254
