# Import libraries

In [None]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
from nltk import RegexpTokenizer
import preprocess  # Imported only for its side effect: importing the module applies the preprocessing and tokenization
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression


## Load data

In [None]:
# Load the train and test splits that were preprocessed with the code in
# preprocess.py. The two parquet files are read in order (train first, then
# test) and unpacked into their respective DataFrames.
train_set, test_set = map(
    pd.read_parquet,
    (
        './preprocessed_and_tokenized_data/preprocessed_train_dataset.parquet',
        './preprocessed_and_tokenized_data/preprocessed_test_dataset.parquet',
    ),
)


In [3]:
# Data vectorization
# Learn the token vocabulary from the training texts only, then encode both
# splits with that same vocabulary so they share identical feature columns.
vectorizer = CountVectorizer()

# Pass the column itself instead of `column[i] for i in range(len(df))`:
# iterating a Series yields its values in row order, whereas `column[i]` is a
# lookup by index *label* and fails (or mis-pairs rows) as soon as the
# DataFrame index is not exactly 0..n-1 (e.g. after filtering rows).
fit = vectorizer.fit(train_set['processed_text'])  # `fit` is the fitted vectorizer (fit() returns self)
X_train = vectorizer.transform(train_set['processed_text'])
y_train = train_set['label']
X_test = vectorizer.transform(test_set['processed_text'])
y_test = test_set['label']

## Create models

Linear SVC

In [None]:
# Grid search over the regularisation strength (C) and stopping tolerance
# (tol) of a linear SVM, keeping the combination with the highest
# micro-averaged F1 score.
# NOTE(review): candidates are scored on the test split, so the reported best
# score is optimistic -- consider selecting on a validation split instead.
best_f1 = 0
for C in [0.1,0.25,0.5,1,2,5]:
    for tol in [0.000001,0.0001,0.01,0.1,0.15,0.175]:
        model     = LinearSVC(tol=tol, C=C, multi_class='ovr', random_state=2024, max_iter=1000)
        trained   = model.fit(X_train, y_train)
        # Predict with the classifier trained in this iteration. The previous
        # code called `fit.predict`, but `fit` is whatever another cell last
        # bound to that name (the CountVectorizer, or a stale model), never
        # this LinearSVC.
        preds     = trained.predict(X_test)
        f1        = f1_score(y_test, preds, average = 'micro')
        if best_f1 < f1:
            # Strict comparison: on ties the earliest combination is kept.
            best_f1     = f1
            best_params = [C,tol]
            best_model  = trained
print(f'Best f1 obtained = {best_f1}\n')
print(f'Best value for C: {best_params[0]}, best value for tol: {best_params[1]}')

Logistic Regression

In [None]:
# Sweep every (C, tol) pair for a one-vs-rest logistic regression and remember
# the pair whose predictions reach the highest micro-averaged F1 score.
# The pairs are enumerated C-major (all tol values for the first C, then the
# next C, ...), i.e. in the same order as two nested loops would visit them.
best_f1 = 0
for C, tol in [
    (c, t)
    for c in [0.1,0.25,0.5,1,2,5]
    for t in [0.000001,0.0001,0.01,0.1,0.15,0.175]
]:
    model = LogisticRegression(
        C=C,
        tol=tol,
        max_iter=1000,
        multi_class='ovr',
        random_state=2024,
    )
    fit = model.fit(X_train, y_train)  # fit() returns the fitted estimator
    preds = fit.predict(X_test)
    f1 = f1_score(y_test, preds, average='micro')
    # Only a strictly better score replaces the incumbent, so ties keep the
    # earliest pair.
    if f1 > best_f1:
        best_f1, best_params, best_model = f1, [C, tol], fit
print(f'Best f1 obtained = {best_f1}\n')
print(f'Best value for C: {best_params[0]}, best value for tol: {best_params[1]}')

In [None]:


# Multi-layer perceptron: a single hidden layer of 100 tanh units trained with
# Adam, scored with micro-averaged F1 on the test split.
# random_state is fixed to the same seed the other models in this notebook use;
# without it the weight initialisation differs on every run and the printed
# score is not reproducible.
model = MLPClassifier(hidden_layer_sizes=(100,), activation = 'tanh', solver = 'adam', random_state=2024)
model_f = model.fit(X_train, y_train)
preds = model_f.predict(X_test)
f1 = f1_score(y_test, preds, average = 'micro')
print(f'f1 obtained = {f1}\n')

