In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
%pip install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
processed_data_dir = '/home/rizanb/Documents/hob_pred/data/processed/'

X_train_scaled = joblib.load(f"{processed_data_dir}X_train_scaled.joblib")
X_test_scaled = joblib.load(f"{processed_data_dir}X_test_scaled.joblib")
y_train = joblib.load(f"{processed_data_dir}y_train.joblib")
y_test = joblib.load(f"{processed_data_dir}y_test.joblib")

print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_scaled shape: (181, 10)
X_test_scaled shape: (46, 10)
y_train shape: (181,)
y_test shape: (46,)


## train classical models: logr rf svc knn gb xgb

In [3]:
models_path = "/home/rizanb/Documents/hob_pred/models/"
reports_path = "/home/rizanb/Documents/hob_pred/reports/"

accuracy_report = ""

models = [
    ('logr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC()),
    ('knn', KNeighborsClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('xgb', XGBClassifier(eval_metric="mlogloss", enable_categorical=True))
]

for name, model in models:
    if name == "xgb":
        model.fit(X_train_scaled, y_train - 1)
        accuracy = model.score(X_test_scaled, y_test - 1) 
        print(f"accuracy of {name}: {accuracy:.3f}")
        
        # with open("/home/rizanb/Documents/hob_pred/reports/accuracy_report.txt", "a") as f:
            # f.write(f"{name}: {accuracy:.3f} \n")
        
        # joblib.dump(name, f"{models_path}{name}_{accuracy:.3f}.pkl")
        break
    model.fit(X_train_scaled, y_train)
    accuracy = model.score(X_test_scaled, y_test)
    print(f"accuracy of {name}: {accuracy:.3f}")

    # with open("/home/rizanb/Documents/hob_pred/reports/accuracy_report.txt", "a") as f:
            # f.write(f"{name}: {accuracy:.3f} \n")
        
    # joblib.dump(name, f"{models_path}{name}_{accuracy:.3f}.pkl")


accuracy of logr: 0.522
accuracy of rf: 0.609
accuracy of svc: 0.543
accuracy of knn: 0.587
accuracy of gb: 0.522
accuracy of xgb: 0.543


## voting ensemble models: hard and soft voting with knn, svc and xgb

In [4]:
from sklearn.ensemble import VotingClassifier

In [22]:
models_dir = "/home/rizanb/Documents/hob_pred/models/"

est = [('knn', KNeighborsClassifier()), ('svc', SVC(probability=True)), ('gb', GradientBoostingClassifier())]

voting_hard = VotingClassifier(
    estimators = est,
    voting='hard'
)

voting_soft = VotingClassifier(
    estimators = est,
    voting = 'soft'
)

voting_hard.fit(X_train_scaled, y_train)
voting_hard_acc = voting_hard.score(X_test_scaled, y_test)
print(f"Voting Classifier (Hard) Accuracy: {voting_hard_acc:.3f}")
# joblib.dump(voting_hard, f"{models_dir}voting_hard_{voting_hard_acc:.3f}.joblib")

voting_soft.fit(X_train_scaled, y_train)
voting_soft_acc = voting_soft.score(X_test_scaled, y_test)
print(f"Voting Classifier (Soft) Accuracy: {voting_soft_acc:.3f}")
# joblib.dump(voting_soft, f"{models_dir}voting_soft_{voting_soft_acc:.3f}.joblib")

Voting Classifier (Hard) Accuracy: 0.565
Voting Classifier (Soft) Accuracy: 0.587


## knn as meta learner

In [16]:
base_estimators = [
    ('logr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC()),
    ('gb', GradientBoostingClassifier())
]

stacking_knn = StackingClassifier(
    estimators=base_estimators,
    final_estimator=KNeighborsClassifier(),
    cv=5
)

stacking_knn.fit(X_train_scaled, y_train)
stacking_knn_acc = stacking_knn.score(X_test_scaled, y_test)
print(f"stacking classifier with knn meta learner acc: {stacking_knn_acc:.3f}")
joblib.dump(stacking_knn, f"{models_dir}stacking_knn_{stacking_knn_acc:.3f}.joblib")

stacking classifier with knn meta learner acc: 0.413


['/home/rizanb/Documents/hob_pred/models/stacking_knn_0.413.joblib']

## weighted voting with accuracies as weights

In [25]:
individual_accuracies = {
    'knn': 0.587,
    'svc': 0.543,
    'gb': 0.500,
    }

In [29]:
weights = [individual_accuracies['knn'], individual_accuracies['svc'],  
           individual_accuracies['gb']]
weights

[0.587, 0.543, 0.5]

In [30]:
nw = np.array(weights) / np.sum(weights)
print(f"normalized w: {nw}")

normalized w: [0.3601227  0.33312883 0.30674847]


In [31]:
weighted_voting = VotingClassifier(
    estimators=est,
    voting='soft',
    weights=nw
)

In [37]:
weighted_voting.fit(X_train_scaled, y_train)
a = weighted_voting.score(X_test_scaled, y_test)
print(f"weighted voting acc: {a:.3f}")
joblib.dump(weighted_voting, f"{models_dir}weighted_voting_{a}.joblib")

weighted voting acc: 0.565


['/home/rizanb/Documents/hob_pred/models/weighted_voting_0.5652173913043478.joblib']