# INF442: PI 3 : GDPR in practice

## Subproblem 1: Representation computing and supervised binary classification

### Libraries importation

In [1]:
import numpy as np
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

# Datascience imports
import sklearn as sk
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertModel, pipeline
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


### Dataset importation

In [2]:
# read datasets from csv
X_train = pd.read_csv('data/representation.eng.train.csv')
X_test = pd.read_csv('data/representation.eng.testa.csv')

y_train = pd.read_csv('data/true_labels.eng.train.csv')
y_test = pd.read_csv('data/true_labels.eng.testa.csv')


### Dataset Exploration

In [3]:
# Dataset exploration
#print(X_train.head(2))
#print(y_train.head(2))

# The shape of the dataset
print(f"The training data has the shape: {X_train.shape}, and the training labels has the shape: {y_train.shape}")
print(f"The testing data has the shape: {X_test.shape}, and the testing labels has the shape: {y_test.shape}")

# The proportion of the classes
print(f"The proportion of the classes in the training data: {y_train.value_counts(normalize=True)}")


The training data has the shape: (9999, 768), and the training labels has the shape: (9999, 1)
The testing data has the shape: (1999, 768), and the testing labels has the shape: (1999, 1)
The proportion of the classes in the training data: O     
O         0.835184
I-PER     0.054005
I-ORG     0.046505
I-LOC     0.040104
I-MISC    0.023802
B-MISC    0.000300
B-ORG     0.000100
Name: proportion, dtype: float64


### Dataset processing 

In [4]:
# Processing the data to do a binary classification
# The classes will move from 0-4 to 0-1: I-PER -> 1 , and all the other classes -> 0

def label_mapping(x):
    if x == 'I-PER' or x == 'B-PER':
        return 1
    else:
        return 0

y_train_binary = y_train.map(label_mapping)
y_test_binary = y_test.map(label_mapping)

print(y_train_binary.value_counts(normalize=True))

O
0    0.945995
1    0.054005
Name: proportion, dtype: float64


### Binary classification

In [8]:
# Binary classification

# Let's try many supervised learning models:

# 1. Logistic Regression
from sklearn.linear_model import LogisticRegression

# 2. SVM
from sklearn.svm import SVC

# 3. k-NN
from sklearn.neighbors import KNeighborsClassifier

# 4. Random Forest
from sklearn.ensemble import RandomForestClassifier


# Fit and train the model
def fit_and_train(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred

# Evaluate the model
def evaluate(y_test, y_pred):
    accuracy = sk.metrics.accuracy_score(y_test, y_pred)
    precision = sk.metrics.precision_score(y_test, y_pred)
    recall = sk.metrics.recall_score(y_test, y_pred)
    f1 = sk.metrics.f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1


print(f"Shape of training data: {X_train.shape}")
print(f"Shape of test data: {X_test.shape}")

Shape of training data: (9999, 768)
Shape of test data: (1999, 768)


In [6]:
# Dataframe to numpy array
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train_binary = y_train_binary.to_numpy()
y_test_binary = y_test_binary.to_numpy()

# Change the shape of y to 1D
y_train_binary = y_train_binary.reshape(-1)
y_test_binary = y_test_binary.reshape(-1)

In [9]:
# Save the metrics in a dataframe
metrics = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1'])

# Logistic Regression
model = LogisticRegression()
y_pred = fit_and_train(model, X_train, y_train_binary, X_test, y_test_binary)
accuracy, precision, recall, f1 = evaluate(y_test_binary, y_pred)
#print(f"Logistic Regression: \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
metrics.loc[0] = ['Logistic Regression', accuracy, precision, recall, f1]

# SVM
model = SVC()
y_pred = fit_and_train(model, X_train, y_train_binary, X_test, y_test_binary)
accuracy, precision, recall, f1 = evaluate(y_test_binary, y_pred)
#print(f"SVM: \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
metrics.loc[1] = ['SVM', accuracy, precision, recall, f1]

# k-NN
model = KNeighborsClassifier()
y_pred = fit_and_train(model, X_train, y_train_binary, X_test, y_test_binary)
accuracy, precision, recall, f1 = evaluate(y_test_binary, y_pred)
#print(f"k-NN: \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
metrics.loc[2] = ['k-NN', accuracy, precision, recall, f1]

# Random Forest
model = RandomForestClassifier()
y_pred = fit_and_train(model, X_train, y_train_binary, X_test, y_test_binary)
accuracy, precision, recall, f1 = evaluate(y_test_binary, y_pred)
#print(f"Random Forest: \nAccuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")
metrics.loc[3] = ['Random Forest', accuracy, precision, recall, f1]

print(metrics)





                 Model  Accuracy  Precision    Recall        F1
0  Logistic Regression  0.995498   0.963303  0.954545  0.958904
1                  SVM  0.994997   0.971698  0.936364  0.953704
2                 k-NN  0.988494   0.899083  0.890909  0.894977
3        Random Forest  0.947474   1.000000  0.045455  0.086957


In [10]:
#save the metrics into a csv file
metrics.to_csv('metrics.csv', index=False)