# Modelling
## 1. Prepare for Modelling
### 1.1. Import libraries

In [120]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# Data Split
from sklearn.model_selection import train_test_split

# Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score


In [121]:
# Disable Warnings
import warnings
warnings.filterwarnings("ignore")

### 1.2. Import data

In [122]:
# Substituir pelo df do feature selection 
all_data = pd.read_csv('data/processed/df_clustered.csv')

In [123]:
all_data.columns

Index(['Customer ID', 'Age', 'Under 30', 'Senior Citizen', 'Married',
       'Dependents', 'Number of Dependents', 'City', 'Referred a Friend',
       'Number of Referrals', 'Tenure in Months', 'Tenure Category',
       'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines',
       'Internet Service', 'Avg Monthly GB Download', 'Online Security',
       'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
       'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data',
       'Paperless Billing', 'Monthly Charge', 'Total Charges', 'Total Refunds',
       'Total Extra Data Charges', 'Total Long Distance Charges',
       'Total Revenue', 'Satisfaction Score', 'New Customer', 'Churn Label',
       'Churn Score', 'CLTV', 'Churn Reason', 'Population', 'Engagement Score',
       'Gender_Male', 'Offer_Offer A', 'Offer_Offer B', 'Offer_Offer C',
       'Offer_Offer D', 'Offer_Offer E', 'Internet Type_Cable',
       'Internet Type_DSL', 'Internet Typ

In [124]:
# Drop irrelevant variables for prediction (cluster) 
columns_to_remove=[col for col in all_data.columns if col.startswith('Cluster_')]
all_data.drop(columns=columns_to_remove, inplace=True, errors='ignore')

### 1.3. Feature Selection

In [125]:
# Import scenarios generated in feature selection
scenario1_td = pd.read_csv('data/processed/scenario1.csv')
scenario2_td = pd.read_csv('data/processed/scenario2.csv')

# Other scenarios created in this notebook

scenario3 = all_data[['Senior Citizen','Dependents','Referred a Friend','Internet Service','Internet Type_Fiber Optic','Online Security','Offer_Offer E',
                     'Offer_Offer A','Premium Tech Support','Unlimited Data','Contract_Month-to-Month','Paperless Billing','Payment Method_Credit Card', 'Churn Label']]

### 1.4. Data Split
> Train-Test split<p>
> Train data in predictors (X) and target variable (y)

In [126]:
# Define the scenario to test
df = scenario1_td.copy()
df.set_index("Customer ID", inplace=True)

# Split
X_total = df
y_total = all_data[["Customer ID", "Churn Label"]]
y_total.set_index("Customer ID", inplace=True)

In [127]:
# Splitting the Data into Train and test with 20% test Data. Stratify dut to unbalance Dataset. (Only 27% Churn)
X_train_total, X_test, y_train_total, y_test = train_test_split(X_total, y_total, test_size=0.20, random_state=1, stratify=y_total)

In [128]:
# List to append the metrics of all the models
models =[]

## 2. Baseline Models
### 2.1 Logistic Regression - Train and Validation Results

In [129]:
# Feature scaling - scenario 1 already scaled
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X_train)

# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_LR = LogisticRegression()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_LR.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_LR.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_LR.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.974, 0.972, 0.975, 0.973, 0.972]
Mean Accuracy: 0.973

Training F1 Score:
Cross-Validation F1 Scores: [0.951, 0.946, 0.953, 0.949, 0.948]
Mean F1: 0.949

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.97, 0.977, 0.968, 0.974, 0.978]
Mean Accuracy: 0.973

Validation F1 Score:
Cross-Validation F1 Scores: [0.943, 0.956, 0.939, 0.95, 0.958]
Mean F1: 0.949


### 2.1 Logistic Regression - Test Results

In [130]:
# Fit Model to total Training Data
model_LR.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_LR.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.98
Test F1 Scores: 0.962


In [131]:
lr = {"Model": "Logistic Regression", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(lr)

### 2.2 Decision Tree - Train and Validation Results


In [132]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_DT = DecisionTreeClassifier(max_depth=3)

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_DT.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_DT.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_DT.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.972, 0.972, 0.973, 0.972, 0.971]
Mean Accuracy: 0.972

Training F1 Score:
Cross-Validation F1 Scores: [0.945, 0.943, 0.946, 0.945, 0.942]
Mean F1: 0.944

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.97, 0.973, 0.969, 0.971, 0.977]
Mean Accuracy: 0.972

Validation F1 Score:
Cross-Validation F1 Scores: [0.94, 0.947, 0.938, 0.942, 0.955]
Mean F1: 0.944


### 2.2 Decision Tree - Test Results

In [133]:
# Fit Model to total Training Data
model_DT.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_DT.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.974
Test F1 Scores: 0.949


In [134]:
dt = {"Model": "Decision Tree", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(dt)

### 2.3. Random Forest - Train and Validation Results


In [135]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_RF = RandomForestClassifier(max_depth=3)

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_DT.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_DT.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_DT.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.972, 0.972, 0.973, 0.972, 0.971]
Mean Accuracy: 0.972

Training F1 Score:
Cross-Validation F1 Scores: [0.945, 0.943, 0.946, 0.945, 0.942]
Mean F1: 0.944

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.97, 0.973, 0.969, 0.971, 0.977]
Mean Accuracy: 0.972

Validation F1 Score:
Cross-Validation F1 Scores: [0.94, 0.947, 0.938, 0.942, 0.955]
Mean F1: 0.944


### 2.3 Random Forest - Test Results

In [136]:
# Fit Model to total Training Data
model_RF.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_RF.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.974
Test F1 Scores: 0.949


In [137]:
rf = {"Model": "Random Forest", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(rf)

### 2.4 SVC - Train and Validation Results

In [138]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_SVC = SVC()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_SVC.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_SVC.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_SVC.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.978, 0.975, 0.978, 0.976, 0.976]
Mean Accuracy: 0.977

Training F1 Score:
Cross-Validation F1 Scores: [0.958, 0.953, 0.958, 0.954, 0.956]
Mean F1: 0.956

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.973, 0.979, 0.972, 0.974, 0.98]
Mean Accuracy: 0.976

Validation F1 Score:
Cross-Validation F1 Scores: [0.95, 0.96, 0.947, 0.951, 0.962]
Mean F1: 0.954


### 2.4 SVC - Test Results

In [139]:
# Fit Model to total Training Data
model_SVC.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_SVC.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.982
Test F1 Scores: 0.967


In [140]:
svcc = {"Model": "SVC", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(svcc)

9

### 2.5 Gaussian - Train and Validation Results

In [141]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_GA = GaussianNB()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_GA.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_GA.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_GA.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.959, 0.958, 0.96, 0.959, 0.959]
Mean Accuracy: 0.959

Training F1 Score:
Cross-Validation F1 Scores: [0.923, 0.922, 0.924, 0.924, 0.924]
Mean F1: 0.923

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.96, 0.96, 0.958, 0.962, 0.959]
Mean Accuracy: 0.96

Validation F1 Score:
Cross-Validation F1 Scores: [0.926, 0.925, 0.922, 0.928, 0.925]
Mean F1: 0.925


### 2.5 Gaussian - Test Results

In [142]:
# Fit Model to total Training Data
model_GA.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_GA.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.963
Test F1 Scores: 0.931


In [143]:
gaussian = {"Model": "Gaussian", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(gaussian)

### 2.6 KNN - Train and Validation Results

In [144]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_KNN = KNeighborsClassifier()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_KNN.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_KNN.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_KNN.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.983, 0.981, 0.982, 0.982, 0.984]
Mean Accuracy: 0.982

Training F1 Score:
Cross-Validation F1 Scores: [0.968, 0.964, 0.966, 0.966, 0.969]
Mean F1: 0.967

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.97, 0.977, 0.974, 0.972, 0.977]
Mean Accuracy: 0.974

Validation F1 Score:
Cross-Validation F1 Scores: [0.943, 0.956, 0.951, 0.947, 0.956]
Mean F1: 0.951


### 2.6 KNN - Test Results

In [145]:
# Fit Model to total Training Data
model_KNN.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_KNN.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.982
Test F1 Scores: 0.965


In [146]:
knn = {"Model": "KNN", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(knn)

### 2.7 Gradient Boosting - Train and Validation Results

In [147]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_GB = GradientBoostingClassifier()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_GB.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_GB.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_GB.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.985, 0.985, 0.987, 0.984, 0.985]
Mean Accuracy: 0.985

Training F1 Score:
Cross-Validation F1 Scores: [0.973, 0.971, 0.976, 0.97, 0.973]
Mean F1: 0.973

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.982, 0.985, 0.973, 0.98, 0.98]
Mean Accuracy: 0.98

Validation F1 Score:
Cross-Validation F1 Scores: [0.967, 0.972, 0.95, 0.963, 0.964]
Mean F1: 0.963


### 2.7 Gradient Boosting - Test Results

In [148]:
# Fit Model to total Training Data
model_GB.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_GB.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.984
Test F1 Scores: 0.969


In [149]:
gb = {"Model": "Gradient Boosting", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(gb)

### 2.8 MLP - Train and Validation Results

In [150]:
# Create StratifiedKFold
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize Logistic Regression model
model_MLP = MLPClassifier()

# Initialize variables to store Validation results
train_accuracy_scores = []
train_f1_scores = []

# Initialize variables to store Validation results
val_accuracy_scores = []
val_f1_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in k_fold.split(X_train_total, y_train_total):
    X_train, X_val = X_train_total.iloc[train_idx], X_train_total.iloc[val_idx]
    y_train, y_val = y_train_total.iloc[train_idx], y_train_total.iloc[val_idx]

    # Train the model
    model_MLP.fit(X_train, y_train)
    
    # Make predictions on the training set  - To check potential overfitting
    y_train_pred = model_MLP.predict(X_train)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_train, y_train_pred)
    train_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_train, y_train_pred)
    train_f1_scores.append(round(f1,3))

    # Make predictions on the validation set
    y_val_pred = model_MLP.predict(X_val)

    # Evaluate the model on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)
    val_accuracy_scores.append(round(accuracy,3))
    
    #F1 Score 
    f1 = f1_score(y_val, y_val_pred)
    val_f1_scores.append(round(f1,3))

#Training
# Print accuracy results
print("Training Accuracy:")
print(f'Cross-Validation Accuracy Scores: {train_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(train_accuracy_scores) / len(train_accuracy_scores),3)}\n')

# Print F1 results
print("Training F1 Score:")
print(f'Cross-Validation F1 Scores: {train_f1_scores}')
print(f'Mean F1: {round(sum(train_f1_scores) / len(train_f1_scores),3)}\n')

#Validation
# Print accuracy results
print("Validation Accuracy:")
print(f'Cross-Validation Accuracy Scores: {val_accuracy_scores}')
print(f'Mean Accuracy: {round(sum(val_accuracy_scores) / len(val_accuracy_scores),3)}\n')

# Print F1 results
print("Validation F1 Score:")
print(f'Cross-Validation F1 Scores: {val_f1_scores}')
print(f'Mean F1: {round(sum(val_f1_scores) / len(val_f1_scores),3)}')

Training Accuracy:
Cross-Validation Accuracy Scores: [0.975, 0.974, 0.976, 0.976, 0.975]
Mean Accuracy: 0.975

Training F1 Score:
Cross-Validation F1 Scores: [0.952, 0.952, 0.956, 0.954, 0.952]
Mean F1: 0.953

Validation Accuracy:
Cross-Validation Accuracy Scores: [0.975, 0.98, 0.97, 0.975, 0.975]
Mean Accuracy: 0.975

Validation F1 Score:
Cross-Validation F1 Scores: [0.953, 0.962, 0.944, 0.953, 0.953]
Mean F1: 0.953


### 2.8 MLP - Test Results

In [151]:
# Fit Model to total Training Data
model_MLP.fit(X_train_total, y_train_total)

# Make predictions on the Test set
y_test_pred = model_MLP.predict(X_test)

# Evaluate the model on the Test set
accuracy = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

#Print Test Scores
print(f'Test Accuracy Scores: {round(accuracy,3)}')
print(f'Test F1 Scores: {round(f1,3)}')

Test Accuracy Scores: 0.979
Test F1 Scores: 0.96


In [152]:
mlp = {"Model": "MLP", 
      "Accuracy Training": round(sum(train_accuracy_scores) / len(train_accuracy_scores),3),
      "Accuracy Validation": round(sum(val_accuracy_scores) / len(val_accuracy_scores),3),
      "Accuracy Test": round(accuracy,3),
      "F1 Training": round(sum(train_f1_scores) / len(train_f1_scores),3),
      "F1 Validation": round(sum(val_f1_scores) / len(val_f1_scores),3),
      "F1 Test": round(f1,3)}

models.append(mlp)

In [157]:
# Coonvert List of Models Metrics to DF
models_df = pd.DataFrame(models)

In [158]:
models_df

Unnamed: 0,Model,Accuracy Training,Accuracy Validation,Accuracy Test,F1 Training,F1 Validation,F1 Test
0,Logistic Regression,0.973,0.973,0.98,0.949,0.949,0.962
1,Decision Tree,0.972,0.972,0.974,0.944,0.944,0.949
2,Random Forest,0.972,0.972,0.974,0.944,0.944,0.949
3,Gaussian,0.959,0.96,0.963,0.923,0.925,0.931
4,KNN,0.982,0.974,0.982,0.967,0.951,0.965
5,Gradient Boosting,0.985,0.98,0.984,0.973,0.963,0.969
6,MLP,0.975,0.975,0.979,0.953,0.953,0.96


> Decision Trees:
Intuitive and easy to understand.
Can capture complex relationships in the data.

> Random Forest:
Ensemble method built on decision trees.
Generally more robust and accurate than individual trees.

> Support Vector Machines (SVM):
Effective in high-dimensional spaces.
Works well when there is a clear margin of separation between classes.

> Naive Bayes:
Assumes independence between features.
Fast and can perform well on certain types of data.

> K-Nearest Neighbors (KNN):
Instance-based learning.
Simple and easy to understand.

> Gradient Boosting (e.g., XGBoost, LightGBM, AdaBoost):
Builds a strong predictive model by combining weak models.
Often produces very accurate results.