# Import libraries

In [28]:
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import numpy as np
from nltk import RegexpTokenizer
import preprocess
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LogisticRegression


## Load data

In [30]:
# Both splits were produced ahead of time by the code in preprocess.py, so they
# are read back as-is from their parquet files.
TRAIN_PATH = './preprocessed_and_tokenized_data/preprocessed_train_dataset.parquet'
TEST_PATH = './preprocessed_and_tokenized_data/preprocessed_test_dataset.parquet'

# Read train first, then test, and bind each DataFrame to its own name.
train_set, test_set = (pd.read_parquet(path) for path in (TRAIN_PATH, TEST_PATH))


                                          processed_text label
0      kopp is a surname of german origin derived fro...     D
1      se compromeix la vocal quasioberta central és ...     B
2      a boulder that has attracted attention due to ...     C
3      o presidente dos estados unidos george w bush ...     C
4      nerbio h orrek sentimen funtzioak eta funtzio ...     D
...                                                  ...   ...
14684  juliaroberts i hope youre aware of the crisis ...     E
14685  el paraíso en la tierra magnífico el paraíso e...     C
14686  ao ouvir as palavras de césar bruto sentiuse d...     E
14687  eu probei esta receita de torta de queixo ao f...     D
14688  visbretinden also known as visbretinden is a a...     C

[14689 rows x 2 columns]


In [3]:
# Data vectorization: bag-of-words token counts, with the vocabulary learned
# from the training texts only.
vectorizer = CountVectorizer()

# Hand the text columns straight to scikit-learn instead of fetching rows one by
# one with `series[i]`. `series[i]` is a label lookup, so it fails when the
# DataFrame index is not exactly 0..n-1 (e.g. after filtering or shuffling);
# iterating the Series is positional and avoids one Python lookup per row.
train_texts = train_set['processed_text']
test_texts = test_set['processed_text']

# `fit` returns the fitted vectorizer itself; the name is kept because other
# cells in this notebook refer to it.
fit = vectorizer.fit(train_texts)
X_train = vectorizer.transform(train_texts)
y_train = train_set['label']

# The test texts are only transformed with the training vocabulary, so no
# information from the test split leaks into the features.
X_test = vectorizer.transform(test_texts)
y_test = test_set['label']

## Create models

### Linear SVC (grid search over `C` and `tol`)

In [68]:
# Grid search for LinearSVC over the regularisation strength `C` and the
# stopping tolerance `tol`. Every combination is fitted on the training matrix
# and scored with micro-averaged F1; the best score, its parameters and the
# fitted model are kept in `best_f1`, `best_params` and `best_model`.
#
# NOTE(review): the winning configuration is chosen by its score on the test
# split, so the reported F1 is an optimistic estimate. A held-out validation
# split or cross-validation would be needed for an unbiased figure.
best_f1 = 0
for C in [0.1,0.25,0.5,1,2,5]:
    for tol in [0.000001,0.0001,0.01,0.1,0.15,0.175]:
        model     = LinearSVC(tol=tol, C=C, multi_class='ovr', random_state=2024, max_iter=1000)
        trained   = model.fit(X_train, y_train)
        # Predict with the classifier fitted in this iteration. The notebook-level
        # name `fit` belongs to other cells (the vectorizer / another model) and
        # must not be used here.
        preds     = trained.predict(X_test)
        f1        = f1_score(y_test, preds, average = 'micro')
        # Strict comparison: on ties the first configuration encountered wins.
        if best_f1 < f1:
            best_f1     = f1
            best_params = [C,tol]
            best_model  = trained
print(f'Best f1 obtained = {best_f1}\n')
print(f'Best value for C: {best_params[0]}, best value for tol: {best_params[1]}')



Best f1 obtained = 0.6823473347402819

Best value for C: 0.1, best value for tol: 0.1




### Logistic Regression (grid search over `C` and `tol`)

In [70]:
# Grid search for LogisticRegression over the inverse regularisation strength
# `C` and the stopping tolerance `tol`. Every combination is fitted on the
# training matrix and scored with micro-averaged F1; the best score, its
# parameters and the fitted model are kept in `best_f1`, `best_params` and
# `best_model` (overwriting any values left there by a previous search).
#
# NOTE(review): the winning configuration is chosen by its score on the test
# split, so the reported F1 is an optimistic estimate. A held-out validation
# split or cross-validation would be needed for an unbiased figure.
best_f1 = 0
for C in [0.1,0.25,0.5,1,2,5]:
    for tol in [0.000001,0.0001,0.01,0.1,0.15,0.175]:
        model   = LogisticRegression(tol=tol, C=C, multi_class='ovr', random_state=2024, max_iter=1000)
        # Bind the fitted model to `trained` rather than `fit`: `fit` is the name
        # the vectorization cell uses for the fitted CountVectorizer, and
        # rebinding it here would make the notebook depend on cell execution order.
        trained = model.fit(X_train, y_train)
        preds   = trained.predict(X_test)
        f1      = f1_score(y_test, preds, average = 'micro')
        # Strict comparison: on ties the first configuration encountered wins.
        if best_f1 < f1:
            best_f1     = f1
            best_params = [C,tol]
            best_model  = trained
print(f'Best f1 obtained = {best_f1}\n')
print(f'Best value for C: {best_params[0]}, best value for tol: {best_params[1]}')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best f1 obtained = 0.7177479746749268

Best value for C: 0.1, best value for tol: 0.175


In [27]:


# Multi-layer perceptron baseline: one hidden layer of 100 tanh units trained
# with the adam solver, scored with micro-averaged F1 on the test split.
# `random_state` is fixed (same seed as the other model cells) because the
# weight initialisation and the batch shuffling are random; without it the
# reported score changes from run to run.
model = MLPClassifier(hidden_layer_sizes=(100,), activation = 'tanh', solver = 'adam', random_state=2024)
model_f = model.fit(X_train, y_train)
preds = model_f.predict(X_test)
f1 = f1_score(y_test, preds, average = 'micro')
print(f'f1 obtained = {f1}\n')



f1 obtained = 0.698686091633195

