In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
%pip install xgboost
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

Note: you may need to restart the kernel to use updated packages.


In [2]:
# NOTE(review): absolute, machine-specific path; consider a configurable
# base directory so the notebook runs on other machines.
processed_data_dir = '/home/rizanb/Documents/hob_pred/data/processed/'

# Load the four persisted train/test arrays in a fixed order and bind them to
# the names used by the rest of the notebook.
# (presumably written by an earlier preprocessing notebook; verify upstream)
X_train_scaled, X_test_scaled, y_train, y_test = (
    joblib.load(f"{processed_data_dir}{artifact}.joblib")
    for artifact in ("X_train_scaled", "X_test_scaled", "y_train", "y_test")
)

# Sanity check: report the shape of every loaded array, one per line.
print(
    f"X_train_scaled shape: {X_train_scaled.shape}",
    f"X_test_scaled shape: {X_test_scaled.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

X_train_scaled shape: (181, 10)
X_test_scaled shape: (46, 10)
y_train shape: (181,)
y_test shape: (46,)


## Train classical models: logistic regression, random forest, SVC, k-NN, gradient boosting, XGBoost

In [3]:
# Output locations for fitted models and the accuracy report.
# NOTE(review): absolute, machine-specific paths; consider a configurable base.
models_path = "/home/rizanb/Documents/hob_pred/models/"
reports_path = "/home/rizanb/Documents/hob_pred/reports/"

# Currently unused in this cell; kept in case other cells rely on it.
accuracy_report = ""

# (short name, unfitted estimator) pairs, all with default hyper-parameters.
# NOTE(review): no random_state is set, so rf / gb / xgb scores can vary
# between runs.
models = [
    ('logr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC()),
    ('knn', KNeighborsClassifier()),
    ('gb', GradientBoostingClassifier()),
    ('xgb', XGBClassifier(eval_metric="mlogloss", enable_categorical=True))
]

# Fit every model on the training split and report its test accuracy.
# A single code path handles all models, so the loop no longer relies on
# 'xgb' being the last entry (the previous `break` would have silently
# skipped any model listed after it).
for name, model in models:
    if name == "xgb":
        # xgb is trained and scored on labels shifted down by one
        # (presumably the labels are 1-based and XGBClassifier needs 0-based
        # class ids; confirm against the preprocessing step).
        y_fit, y_eval = y_train - 1, y_test - 1
    else:
        y_fit, y_eval = y_train, y_test

    model.fit(X_train_scaled, y_fit)
    accuracy = model.score(X_test_scaled, y_eval)
    print(f"accuracy of {name}: {accuracy:.3f}")

    # Persistence is currently disabled. If re-enabled, dump the fitted
    # `model` object (not its `name` string) and build the report path from
    # `reports_path`:
    # with open(f"{reports_path}accuracy_report.txt", "a") as f:
    #     f.write(f"{name}: {accuracy:.3f} \n")
    # joblib.dump(model, f"{models_path}{name}_{accuracy:.3f}.pkl")


accuracy of logr: 0.522
accuracy of rf: 0.522
accuracy of svc: 0.543
accuracy of knn: 0.587
accuracy of gb: 0.500
accuracy of xgb: 0.543


## Voting ensembles: hard and soft voting

In [5]:
from sklearn.ensemble import VotingClassifier

In [24]:
# Directory where the fitted ensembles are written.
models_dir = "/home/rizanb/Documents/hob_pred/models/"

# Base learners passed to both ensembles. SVC is created with
# probability=True so it can supply class probabilities for soft voting.
est = [
    ('logr', LogisticRegression()),
    ('rf', RandomForestClassifier()),
    ('svc', SVC(probability=True)),
    ('knn', KNeighborsClassifier()),
    ('gb', GradientBoostingClassifier()),
]

voting_hard = VotingClassifier(estimators=est, voting='hard')
voting_soft = VotingClassifier(estimators=est, voting='soft')

# Hard voting: fit, score on the test split, then persist the fitted ensemble
# under a file name that embeds its accuracy.
voting_hard_acc = voting_hard.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
print(f"Voting Classifier (Hard) Accuracy: {voting_hard_acc:.3f}")
joblib.dump(voting_hard, f"{models_dir}voting_hard_{voting_hard_acc:.3f}.joblib")

# Soft voting: same procedure. The dump call stays the last expression of the
# cell so the written path is still shown as the cell output.
voting_soft_acc = voting_soft.fit(X_train_scaled, y_train).score(X_test_scaled, y_test)
print(f"Voting Classifier (Soft) Accuracy: {voting_soft_acc:.3f}")
joblib.dump(voting_soft, f"{models_dir}voting_soft_{voting_soft_acc:.3f}.joblib")

Voting Classifier (Hard) Accuracy: 0.609
Voting Classifier (Soft) Accuracy: 0.565


['/home/rizanb/Documents/hob_pred/models/voting_soft_0.565.joblib']