In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..')))

import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from src.preprocess import preprocessing_fn_dict
from src.helper import *

# Load the project configuration from a path relative to the notebook's working
# directory instead of a machine-specific absolute path, so the notebook runs on
# any checkout. '..' is the same project root that is appended to sys.path above
# and that the '../data' and '../models' paths below are resolved against.
# NOTE(review): assumes config.yaml sits at the project root — confirm.
config = parse_config(os.path.abspath(os.path.join('..', 'config.yaml')))

In [None]:
# Dataset selection: the folder name is '<dataset>_<keyword>_<keyword>_...'.
dataset_folder_name = 'CLINC150_oos1_down_carry_trans_sentenceCamembertBase'
dataset_version = 'plus'

# Everything after the first underscore, i.e. the preprocessing keywords that
# also appear in the pickle file names.
data_keywords = dataset_folder_name.partition('_')[2]

In [None]:
# Each split lives in '<data>/<folder>/<version>/<split>/<split>_<keywords>.pkl'.
dataset_dir = os.path.join('../data', dataset_folder_name, dataset_version)

train_path = os.path.join(dataset_dir, 'train', f'train_{data_keywords}.pkl')
val_path = os.path.join(dataset_dir, 'validation', f'validation_{data_keywords}.pkl')
test_path = os.path.join(dataset_dir, 'test', f'test_{data_keywords}.pkl')

# Read the three pickled dataframes (train, validation, test) in that order.
train_df = pd.read_pickle(train_path)
val_df = pd.read_pickle(val_path)
test_df = pd.read_pickle(test_path)

In [None]:
# Build the feature matrix: one row per training example, taken from the
# 'embedding' column and stacked into a numpy array.
X = np.array(train_df['embedding'].tolist())

# Fit the label encoder on the training labels and turn them into integer ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['label'])

# To inspect the label -> id mapping:
# dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print('X shape: ', X.shape)
print('y shape: ', y.shape)

In [None]:
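# Optional PCA dimensionality reduction (currently disabled). If you enable it,
# also uncomment the matching `pca.transform` line in `evaluate_model` so that
# the evaluation data goes through the same projection as the training data.
# # Step 1: Initialize PCA and fit it on the training feature matrix X
# pca = PCA(n_components=0.95)  # keep 95% of the variance
# pca.fit(X)  # X is the training feature matrix built above

# # Step 2: Apply the fitted dimensionality reduction to X
# X = pca.transform(X)
# X.shape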

In [None]:
model_type = 'logReg'

# Supported model types -> (estimator class, extra constructor arguments).
# Every estimator additionally receives random_state=config['random_state'].
model_registry = {
    'logReg': (LogisticRegression, {'max_iter': 1000}),
    'xgb': (XGBClassifier, {'max_depth': 10, 'n_estimators': 100, 'learning_rate': 0.1}),
    'decisionTree': (DecisionTreeClassifier, {}),
    'randomForest': (RandomForestClassifier, {}),
    'gradientBoost': (GradientBoostingClassifier, {}),
    'adaBoost': (AdaBoostClassifier, {}),
    'mlp': (MLPClassifier, {'max_iter': 1000}),
}

# Fail loudly on a typo in model_type. Without this check an unknown value would
# either raise a confusing NameError (fresh kernel) or silently refit and later
# save a stale `classifier` left over from a previous run of this cell.
if model_type not in model_registry:
    raise ValueError(
        f"Unknown model_type '{model_type}'. Expected one of: {sorted(model_registry)}"
    )

estimator_class, estimator_kwargs = model_registry[model_type]
classifier = estimator_class(random_state=config['random_state'], **estimator_kwargs)

# Fit the model on the full training set (no cross-validation is performed here)
classifier.fit(X, y)

In [None]:
def evaluate_model(classifier:object, df:pd.DataFrame, title:str, encoder:object=None) -> pd.DataFrame:
    """Evaluate a classifier on a dataframe.

    Prints the accuracy and the classification report, plots the confusion
    matrix, and returns a copy of the dataframe with the predicted labels.

    The labels are encoded with the encoder that was fitted on the training
    labels, so the integer ids compared against the classifier's predictions
    mean the same thing as during training. Fitting a new encoder on each
    evaluation dataframe would shift the ids whenever that dataframe does not
    contain exactly the training label set.

    Args:
        classifier (object): The fitted classifier to evaluate.
        df (pd.DataFrame): The dataframe to evaluate the classifier on. It must
            have a 'label' column; if it has no 'embedding' column, the
            preprocessing functions listed in the config are applied first.
        title (str): The title of the plot.
        encoder (object, optional): A fitted LabelEncoder. Defaults to the
            notebook-level `label_encoder` fitted on the training labels.

    Returns:
        pd.DataFrame: A copy of the dataframe with a 'predictions' column
        holding the predicted (decoded) labels.

    Raises:
        ValueError: If the dataframe contains a label the encoder has not seen
            (raised by `encoder.transform`).
    """
    # Work on a copy so the caller's dataframe is left untouched
    df = df.copy()

    # Reuse the training-time label encoding unless another encoder is given
    if encoder is None:
        encoder = label_encoder

    # If the column 'embedding' does not exist, we should run the preprocessing functions
    if 'embedding' not in df.columns:
        for preprocessing_fn_name in config['training_inference_data_prep']:
            df = preprocessing_fn_dict[preprocessing_fn_name][0](df, verbose=config['verbose'])

    # Get the embeddings
    X_eval = np.array(df['embedding'].tolist())

    # Apply PCA
    # X_eval = pca.transform(X_eval)

    # Encode the labels with the already-fitted encoder (transform, not fit_transform)
    y_eval = encoder.transform(df['label'])

    # Predict
    y_pred = classifier.predict(X_eval)

    # Compute and report the accuracy
    accuracy = accuracy_score(y_eval, y_pred)
    print(f'{title} accuracy: {accuracy:.4f}')

    # Pin the label set so the matrix and the report always cover every known
    # class, even when some classes are absent from this dataframe.
    class_ids = np.arange(len(encoder.classes_))

    # Compute the confusion matrix
    conf_mat = confusion_matrix(y_eval, y_pred, labels=class_ids)
    conf_mat_df = pd.DataFrame(conf_mat, index = encoder.classes_, columns = encoder.classes_)
    conf_mat_df.index.name = 'Actual'
    conf_mat_df.columns.name = 'Predicted'

    # Plot the confusion matrix
    # NOTE(review): `plt` is not imported explicitly in this notebook; it is
    # presumably provided by `from src.helper import *` — confirm.
    plt.figure(figsize = (20, 14))
    sns.set(font_scale=1.4)
    sns.heatmap(conf_mat_df, annot=True, annot_kws={"size": 16}, fmt='d', cmap='Blues')
    plt.title(title)
    plt.show()

    # Print the classification report
    print(classification_report(y_eval, y_pred, labels=class_ids, target_names=encoder.classes_))

    # Add the predicted classes to the dataframe
    df['predictions'] = encoder.inverse_transform(y_pred)

    return df

In [None]:
# Score the fitted classifier on each pickled split: training first, then
# validation, then test. Each call returns a copy of the split with predictions.
train_df_new = evaluate_model(classifier=classifier, df=train_df, title='Training')
val_df_new = evaluate_model(classifier=classifier, df=val_df, title='Validation')
test_df_new = evaluate_model(classifier=classifier, df=test_df, title='Test')

# The example set is a CSV whose location (relative to the project root) comes
# from the config; load it and score it the same way.
example_set_path = os.path.join('..', config['example_set_local_path'])
example_df = pd.read_csv(example_set_path)
example_df_new = evaluate_model(classifier=classifier, df=example_df, title='Example')

In [None]:
# Save the classifier and the label encoder
# The folder name is '<model>_on_<dataset><version>_<keywords>'. The keywords are
# re-joined into a string first: concatenating the list returned by split()
# directly onto a str raises a TypeError.
dataset_name = dataset_folder_name.split('_')[0]
dataset_keywords = '_'.join(dataset_folder_name.split('_')[1:])
model_folder_name = model_type + '_on_' + dataset_name + dataset_version + '_' + dataset_keywords
model_path = f'../models/{model_folder_name}'
os.makedirs(model_path, exist_ok=True)

# Context managers make sure the files are flushed and closed after writing
with open(f'{model_path}/model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# The encoder is needed to map the classifier's integer predictions back to labels
with open(f'{model_path}/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)