In [None]:
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score,fbeta_score, precision_score, recall_score, roc_curve, auc

In [None]:
# Render matplotlib figures inline in the notebook output (IPython magic).
%matplotlib inline
# Apply seaborn's default theme to the plots produced below.
sns.set_theme()

In [None]:
# Feature matrix and labels stored as NumPy arrays
# (presumably TF-IDF features, judging by the file name).
X = np.load("./dataset/FYP_train_X_TFIDF.npy")
Y = np.load("./dataset/FYP_train_Y.npy")

# Quick sanity check of the loaded array dimensions.
print(f"X Shape: {X.shape}")
print(f"Y Shape: {Y.shape}")

In [None]:
# Candidate estimators for the grid search below. Each entry holds the
# estimator to tune ("instance") and the hyper-parameter grid ("params").
# An empty grid means the estimator is evaluated with its given settings only.
models = {
    "logistic_regression": dict(
        instance=LogisticRegression(max_iter=500),
        params=dict(C=[0.001, 0.01, 0.1, 1, 10, 100]),
    ),
    "multinominal_naive_bayes": dict(
        instance=MultinomialNB(),
        params=dict(),
    ),
    "decision_tree": dict(
        instance=DecisionTreeClassifier(),
        params=dict(criterion=["gini"]),
    ),
    "random_forest": dict(
        instance=RandomForestClassifier(),
        params=dict(n_estimators=[1, 10, 100, 1000]),
    ),
    "svm_linear": dict(
        instance=SVC(probability=True, gamma="auto", kernel="linear"),
        params=dict(C=[1, 10, 100]),
    ),
    "svm_rbf": dict(
        instance=SVC(probability=True, kernel="rbf"),
        params=dict(
            C=[1, 10, 100],
            gamma=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
        ),
    ),
}

In [None]:
# Per-model summary rows and the refitted best estimator of every search.
scores = []
best_estimators = {}

# Hold out 10% of the data; the grid search only ever sees the other 90%.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

for model_name, config in models.items():
    print("Training", model_name)

    # 5-fold cross-validated search over this model's grid, ranked by accuracy.
    classifier = GridSearchCV(config["instance"], config["params"], scoring="accuracy", cv=5)
    classifier.fit(X_train, Y_train)

    best_estimators[model_name] = classifier.best_estimator_

    scores.append(dict(
        model=model_name,
        # NOTE(review): best_score_ is, per the scikit-learn docs, the mean
        # cross-validated score of the best candidate -- not a score on the
        # training data, despite the "train_score" label.
        train_score=classifier.best_score_,
        test_score=classifier.best_estimator_.score(X_test, Y_test),
        params=classifier.best_params_,
    ))

In [None]:
# One row per tuned model: cross-validation score, hold-out score and best parameters.
result = pd.DataFrame(scores, columns=["model", "train_score", "test_score", "params"])
# Display the whole frame: head() stops at five rows, which would hide the
# last of the six models evaluated above.
result

In [None]:
# Fractions of the data held out as the test set in each evaluation run of train().
test_split_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

def train(X, Y, Classifier, name, params=None):
    """Fit a fresh ``Classifier`` on several train/test splits and score each.

    For every fraction in the module-level ``test_split_sizes`` the data is
    split with ``train_test_split(..., random_state=1)``, a new
    ``Classifier(**params)`` is fitted on the training part and evaluated on
    the held-out part.

    Parameters
    ----------
    X, Y
        Features and labels, passed unchanged to ``train_test_split``.
        The precision/recall/F-beta calls use their default averaging, so
        binary labels are presumably expected -- TODO confirm.
    Classifier : callable
        Estimator class (or factory) invoked as ``Classifier(**params)``.
    name : str
        Label used only in the progress output.
    params : dict, optional
        Keyword arguments for ``Classifier``. ``None`` (the default) means
        no arguments.

    Returns
    -------
    dict
        Maps ``"10%"``, ``"20%"``, ... to a dict with the keys
        ``"Accuracy"``, ``"F-0.5"``, ``"F-1"``, ``"Precision"``, ``"Recall"``
        and ``"ROC"`` (the raw ``roc_curve`` output computed from the hard
        class predictions).
    """
    # Avoid a shared mutable default argument.
    params = {} if params is None else params
    scores = {}

    print("Training", name, "\n")

    for size in test_split_sizes:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=size, random_state=1)

        model = Classifier(**params)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Key of this split in the result dict, e.g. 0.1 -> "10%".
        idx = "{:.0f}%".format(size * 100)

        metrics = {
            "Accuracy": accuracy_score(Y_test, Y_pred),
            "F-0.5": fbeta_score(Y_test, Y_pred, beta=0.5),
            "F-1": fbeta_score(Y_test, Y_pred, beta=1),
            "Precision": precision_score(Y_test, Y_pred),
            "Recall": recall_score(Y_test, Y_pred),
            "ROC": roc_curve(Y_test, Y_pred),
        }
        scores[idx] = metrics

        # Exactly one argument per placeholder. Previously "Accuracy" was
        # passed twice, so every later value was printed under the wrong label.
        print(
            "Test Size={:.2f} Accuracy={:.2f} Precision={:.2f} Recall={:.2f} F-1 Score={:.2f} F-0.5 Score={:.2f}".format(
                size,
                metrics["Accuracy"],
                metrics["Precision"],
                metrics["Recall"],
                metrics["F-1"],
                metrics["F-0.5"],
            )
        )

        # Release this split before the next, larger one is created.
        del X_train, X_test, Y_train, Y_test

    return scores

In [None]:
# Evaluate logistic regression (C=1) on every configured test split size.
logistic_regression_result = train(X, Y, Classifier=LogisticRegression, name="Logistics Regression", params=dict(C=1))

In [None]:
# Evaluate Multinomial Naive Bayes with default settings on every test split size.
naive_bayes_result = train(X, Y, Classifier=MultinomialNB, name="Multinomial Naive Bayes", params=dict())

In [None]:
# Evaluate a gini-criterion decision tree on every configured test split size.
decision_tree_result = train(X, Y, Classifier=DecisionTreeClassifier, name="Decision Tree", params=dict(criterion="gini"))

In [None]:
# Evaluate a linear-kernel SVM (C=0.5) on every configured test split size.
svm_result = train(X, Y, Classifier=SVC, name="Support Vector Machine", params=dict(C=0.5, kernel="linear"))

In [None]:
# Evaluate a 1000-tree random forest on every configured test split size.
random_forest_result = train(X, Y, Classifier=RandomForestClassifier, name="Random Forest", params=dict(n_estimators=1000))

#### Exporting Results

In [None]:
def export_result(result, name):
    """Tabulate the scores returned by ``train`` and save them as CSV.

    Parameters
    ----------
    result : dict
        Mapping of test-size label (e.g. ``"10%"``) to a metrics dict holding
        at least ``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F-1"`` and
        ``"F-0.5"``. Any other keys (such as ``"ROC"``) are ignored.
    name : str
        Base name of the output file, written to ``./logs/<name>.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per test size with every metric rounded to 3 decimals.
    """
    import os  # local import keeps this cell self-contained

    columns = ["Test Size", "Accuracy", "Precision", "Recall", "F-1 Score", "F-0.5 Score"]

    # Build all rows first and create the frame once, rather than growing it
    # row by row (which is slow and leaves every column with object dtype).
    rows = [
        [
            test_size,
            round(metrics["Accuracy"], 3),
            round(metrics["Precision"], 3),
            round(metrics["Recall"], 3),
            round(metrics["F-1"], 3),
            round(metrics["F-0.5"], 3),
        ]
        for test_size, metrics in result.items()
    ]
    table = pd.DataFrame(rows, columns=columns)

    # Create the output directory on a fresh checkout instead of failing.
    os.makedirs("./logs", exist_ok=True)
    # Leave out the positional index so that reading the file back yields the
    # same columns, without an extra "Unnamed: 0" column.
    table.to_csv(f"./logs/{name}.csv", index=False)

    return table

In [None]:
# Turn each raw score dict into a rounded table and write it to ./logs/<name>.csv.
# NOTE(review): every name is rebound from the dict returned by train() to the
# DataFrame returned by export_result(), so this cell cannot be re-run on its
# own without first re-running the train() cells.
logistic_regression_result = export_result(logistic_regression_result, "Logistic Regression")
naive_bayes_result = export_result(naive_bayes_result, "Naive Bayes")
decision_tree_result = export_result(decision_tree_result, "Decision Tree")
svm_result = export_result(svm_result, "SVM")
random_forest_result = export_result(random_forest_result, "Random Forest")

In [None]:
# Show the exported Logistic Regression metrics table.
logistic_regression_result

In [None]:
# Show the exported Naive Bayes metrics table.
naive_bayes_result

In [None]:
# Show the exported Decision Tree metrics table.
decision_tree_result

In [None]:
# Show the exported SVM metrics table.
svm_result

In [None]:
# Show the exported Random Forest metrics table.
random_forest_result

In [None]:
def load_result(name):
    """Read ``./logs/<name>.csv`` into a DataFrame and return it."""
    csv_path = f"./logs/{name}.csv"
    return pd.read_csv(csv_path)

In [None]:
# Reload the metric tables from ./logs, replacing the in-memory tables
# (presumably so the plots below can be produced without retraining).
logistic_regression_result = load_result("Logistic Regression")
naive_bayes_result = load_result("Naive Bayes")
decision_tree_result = load_result("Decision Tree")
svm_result = load_result("SVM")
random_forest_result = load_result("Random Forest")

In [None]:
# Show the Logistic Regression table as read back from CSV.
logistic_regression_result

#### Plotting Results

In [None]:
# Training fraction that complements each test fraction (used as the x-axis below).
train_split_sizes = 1 - np.asarray(test_split_sizes)

In [None]:
# Accuracy of every model against the share of data used for training.
plt.figure(figsize=(8, 6), dpi=80)

for _table, _label in (
    (logistic_regression_result, "Logistic Regression"),
    (naive_bayes_result, "Naive Bayes"),
    (decision_tree_result, "Decision Tree"),
    (svm_result, "Support Vector Machine"),
    (random_forest_result, "Random Forest"),
):
    plt.plot(train_split_sizes * 100, _table["Accuracy"], label=_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
# Precision of every model against the share of data used for training.
plt.figure(figsize=(8, 6), dpi=80)

for _table, _label in (
    (logistic_regression_result, "Logistic Regression"),
    (naive_bayes_result, "Naive Bayes"),
    (decision_tree_result, "Decision Tree"),
    (svm_result, "Support Vector Machine"),
    (random_forest_result, "Random Forest"),
):
    plt.plot(train_split_sizes * 100, _table["Precision"], label=_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Precision")
plt.legend()
plt.show()

In [None]:
# Recall of every model against the share of data used for training.
plt.figure(figsize=(8, 6), dpi=80)

for _table, _label in (
    (logistic_regression_result, "Logistic Regression"),
    (naive_bayes_result, "Naive Bayes"),
    (decision_tree_result, "Decision Tree"),
    (svm_result, "Support Vector Machine"),
    (random_forest_result, "Random Forest"),
):
    plt.plot(train_split_sizes * 100, _table["Recall"], label=_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("Recall")
plt.legend()
plt.show()

In [None]:
# F-1 score of every model against the share of data used for training.
plt.figure(figsize=(8, 6), dpi=80)

for _table, _label in (
    (logistic_regression_result, "Logistic Regression"),
    (naive_bayes_result, "Naive Bayes"),
    (decision_tree_result, "Decision Tree"),
    (svm_result, "Support Vector Machine"),
    (random_forest_result, "Random Forest"),
):
    plt.plot(train_split_sizes * 100, _table["F-1 Score"], label=_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("F-1 Score")
plt.legend()
plt.show()

In [None]:
# F-0.5 score of every model against the share of data used for training.
plt.figure(figsize=(8, 6), dpi=80)

for _table, _label in (
    (logistic_regression_result, "Logistic Regression"),
    (naive_bayes_result, "Naive Bayes"),
    (decision_tree_result, "Decision Tree"),
    (svm_result, "Support Vector Machine"),
    (random_forest_result, "Random Forest"),
):
    plt.plot(train_split_sizes * 100, _table["F-0.5 Score"], label=_label)

plt.xlabel("Training Set Size (%)")
plt.ylabel("F-0.5 Score")
plt.legend()
plt.show()

In [None]:
# Hold out 10% of the data (fixed random_state=1) for the ROC comparison below.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 1)

In [None]:
# Fresh estimators with default hyper-parameters for the ROC comparison.
# NOTE(review): these are not the tuned estimators kept in `best_estimators`,
# nor the settings passed to train() above -- confirm this is intended.
logistic_regression_model = LogisticRegression()
naive_bayes_model = MultinomialNB()
decision_tree_model = DecisionTreeClassifier()
svm_model = SVC()
random_forest_model = RandomForestClassifier()

In [None]:
def generate_roc(model, rnd=False):
    """Fit ``model`` on the global training split and compute its ROC curve.

    The curve is built from continuous scores so that it has one point per
    distinct threshold. Scoring hard ``predict`` labels, as before, collapses
    the curve to a single operating point and distorts the AUC.

    Parameters
    ----------
    model
        Unfitted estimator exposing ``fit`` and at least one of
        ``predict_proba``, ``decision_function`` or ``predict``.
    rnd : bool, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    dict
        ``{"FPR": ..., "TPR": ..., "AUC": ...}`` computed on the global
        ``X_test`` / ``Y_test``.

    Notes
    -----
    Assumes a binary problem whose positive class is the second entry of
    ``model.classes_`` -- TODO confirm against the label encoding of ``Y``.
    """
    model.fit(X_train, Y_train)

    if hasattr(model, "predict_proba"):
        # Probability of the positive class (second column).
        Y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        # e.g. SVC without probability=True: signed distance to the boundary.
        Y_score = model.decision_function(X_test)
    else:
        # Last resort: hard labels give only a single-threshold curve.
        Y_score = model.predict(X_test)

    FPR, TPR, _ = roc_curve(Y_test, Y_score)
    return {"FPR": FPR, "TPR": TPR, "AUC": auc(FPR, TPR)}

In [None]:
# Fit each model on the training split and collect its ROC points and AUC.
roc_logistic_regression = generate_roc(logistic_regression_model)
roc_naive_bayes = generate_roc(naive_bayes_model)
roc_svm = generate_roc(svm_model)
roc_decision_tree = generate_roc(decision_tree_model)
roc_random_forest = generate_roc(random_forest_model)

In [None]:
# ROC curve of every model, with its AUC (3 decimals) in the legend.
plt.figure(figsize=(8, 6), dpi=80)

for _name, _roc in (
    ("Logistic Regression", roc_logistic_regression),
    ("Naive Bayes", roc_naive_bayes),
    ("Decision Tree", roc_decision_tree),
    ("Support Vector Machine", roc_svm),
    ("Random Forest", roc_random_forest),
):
    plt.plot(
        _roc["FPR"],
        _roc["TPR"],
        label=f'{_name} (AUC = {round(_roc["AUC"], 3)})',
    )

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

#### Optimising Results With Ensemble Models

In [None]:
# 90/10 train/test split (fixed random_state=1) used by the ensemble experiments below.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 1)

In [None]:
# Bagging ensemble of 500 Multinomial Naive Bayes learners.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, whereas the first positional slot works on both.
BAG_model = BaggingClassifier(MultinomialNB(), n_estimators=500)
BAG_model.fit(X_train, Y_train)
Y_pred = BAG_model.predict(X_test)
# Hold-out accuracy (displayed as the cell output).
accuracy_score(Y_test, Y_pred)

In [None]:
# Gradient boosting with 500 stages, learning rate 0.05 and subsample=0.5.
# NOTE(review): no random_state is set, so results may vary between runs.
GB_model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, subsample=0.5)
GB_model.fit(X_train, Y_train)
Y_pred = GB_model.predict(X_test)
# Hold-out accuracy (displayed as the cell output).
accuracy_score(Y_test, Y_pred)

In [None]:
# AdaBoost over Multinomial Naive Bayes base learners (500 rounds, learning rate 0.05).
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed, whereas the first positional slot works on both.
AB_model = AdaBoostClassifier(MultinomialNB(), n_estimators=500, learning_rate=0.05)
AB_model.fit(X_train, Y_train)
Y_pred = AB_model.predict(X_test)
# Hold-out accuracy (displayed as the cell output).
accuracy_score(Y_test, Y_pred)

In [None]:
# XGBoost classifier: 2000 estimators, eta=0.05, with row subsampling and
# per-tree column subsampling both set to 0.5.
XGB_model = XGBClassifier(n_estimators=2000, eta=0.05, subsample=0.5, colsample_bytree=0.5)
XGB_model.fit(X_train, Y_train)
Y_pred = XGB_model.predict(X_test)
# Hold-out accuracy (displayed as the cell output).
accuracy_score(Y_test, Y_pred)

#### Saving The Best Model

In [None]:
# Persist the fitted XGBoost classifier with joblib.
# NOTE(review): XGBoost is saved unconditionally; the accuracies above are not
# compared in code, so confirm it really is the best model. Presumably the
# ./models directory must already exist -- verify.
joblib.dump(XGB_model, "./models/SDC_XGB_MODEL")