In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import pickle
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

RANDOM_STATE = 42  # Seed passed as `random_state` to every classifier built below, for reproducibility

In [None]:
# Folder of the preprocessed CLINC150 export. The loads below do not reference it.
base_path = '../data/CLINC150_oos1_down_carry_trans_stop_flaubert_average_norm/small'

# The 'embeddings_avg' column is parsed back from its string form while reading.
# NOTE(review): `eval` executes whatever text is in the CSV, so only load files you
# produced yourself; `ast.literal_eval` is the safe alternative if the column holds
# plain list literals.
embedding_converters = {'embeddings_avg': eval}

# NOTE(review): this statement was truncated in the original (`train_df = pd.`).
# The path is inferred from the validation/test file naming -- confirm it is correct.
train_df = pd.read_csv('../data/clinc150_train_down_tr_emb.csv', converters=embedding_converters)
val_df = pd.read_csv('../data/clinc150_validation_down_tr_emb.csv', converters=embedding_converters)
test_df = pd.read_csv('../data/clinc150_test_down_tr_emb.csv', converters=embedding_converters)

In [None]:
# Collect the per-row embeddings and stack them into one numpy array
embedding_rows = train_df['embeddings_avg'].tolist()
X = np.array(embedding_rows)
X.shape

In [None]:
# Get the labels as a 1-D Series. LabelEncoder expects a 1-D target; the previous
# `filter(regex='label')` returned a DataFrame holding every column whose name
# contains "label", which warns (one match) or fails (several matches).
labels_raw = train_df['label']
display(labels_raw)

# Encode the labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels_raw)
print(y)

# Show the mapping from each original label to its encoded value
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [None]:
# Which classifier to train; must be one of the keys of `classifier_factories`.
model_type = 'log_reg'

# Map every supported model_type to a zero-argument factory so that only the
# selected estimator is actually instantiated.
classifier_factories = {
    'log_reg': lambda: LogisticRegression(C=1.0, penalty='l2', random_state=RANDOM_STATE, max_iter=1000),
    'xgb': lambda: XGBClassifier(random_state=RANDOM_STATE, max_depth=10, n_estimators=100, learning_rate=0.1),
    'decision_tree': lambda: DecisionTreeClassifier(random_state=RANDOM_STATE),
    'random_forest': lambda: RandomForestClassifier(random_state=RANDOM_STATE),
    'gradient_boost': lambda: GradientBoostingClassifier(random_state=RANDOM_STATE),
    'ada_boost': lambda: AdaBoostClassifier(random_state=RANDOM_STATE),
    'mlp': lambda: MLPClassifier(random_state=RANDOM_STATE, max_iter=1000),
}

if model_type not in classifier_factories:
    # Without this guard an unknown model_type would leave `classifier` undefined,
    # or silently reuse the one left in the kernel by an earlier run of this cell.
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected one of {sorted(classifier_factories)}"
    )

classifier = classifier_factories[model_type]()

# Fit the model on the full training set (no cross-validation happens here)
classifier.fit(X, y)

In [None]:
# Evaluate the model on the training set
# NOTE(review): `evaluate_model` is not defined in the cells visible here; it must be
# defined or imported before this cell runs. Each call receives the fitted classifier,
# a DataFrame and a split name; the result is presumably a DataFrame (val_df_new is
# later indexed with a boolean mask) -- confirm against its definition.
train_df_new = evaluate_model(classifier, train_df, 'Training')

# Evaluate the model on the validation set
val_df_new = evaluate_model(classifier, val_df, 'Validation')

# Evaluate the model on the test set
test_df_new = evaluate_model(classifier, test_df, 'Test')

# Evaluate the model on the example set
# NOTE(review): unlike the validation/test CSVs, this file is read without the
# 'embeddings_avg' converter -- confirm evaluate_model does not need parsed embeddings here.
example_df = pd.read_csv('../data/examples.csv')
example_df_new = evaluate_model(classifier, example_df, 'Example')

In [None]:
# Save the classifier and the label encoder.
# The context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through.
with open(f'../models/{model_type}_classifier.pkl', 'wb') as classifier_file:
    pickle.dump(classifier, classifier_file)

with open('../models/label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [None]:
# Load the classifier back from disk, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code -- only load files written by this notebook.
with open(f'../models/{model_type}_classifier.pkl', 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

# Get the training embeddings and convert them to a numpy array
X_eval = np.array(train_df['embeddings_avg'].tolist())
print(X_eval.shape)

# Predict
y_pred = classifier.predict(X_eval)

y_pred

In [None]:
# Raise pandas' maximum displayed column width so long cell values are shown in full
pd.set_option('display.max_colwidth', 200)

# Rows of the evaluated validation frame whose validation label is 'travel_suggestion'
is_travel_suggestion = val_df.label == 'travel_suggestion'
val_df_new[is_travel_suggestion]