In [2]:
import pandas as pd

#tools for baseline
import re
import spacy
from spacy.matcher import Matcher
from tqdm import tqdm 

#visualize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


# modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score
import numpy as np

#
SEED=420


In [3]:
# Uploading the data 
train_data = pd.read_csv('Datasets/train.csv').sample(frac=1, random_state=SEED).reset_index(drop=True)
test_data = pd.read_csv('Datasets/test.csv').sample(frac=1, random_state=SEED).reset_index(drop=True)
unsupervised_data = pd.read_csv('Datasets/unsupervised.csv').sample(frac=1, random_state=SEED).reset_index(drop=True)

In [4]:
# Visualizing ans splitting train data
print(train_data.head())
X_train = train_data['text']
y_train = train_data['label']

In [5]:
# Visualizing ans splitting test data
print(test_data.head())
X_test = test_data['text']
y_test = test_data['label']

### 2. Baseline Models

### 2.1. Dictionary with TF-IDF

We perfomed text vectorization using TF-IDF (Term Frequency-Inverse Document Frequency) to convert the collection of reviews into a matrix of TF-IDF features. The resulting feature matrix will be then used to train the baseline models for text classification. 

The code separates the positive and negative reviews from the training set and applies the vectorization to each of them separately, resulting in two dataframes. We decided to include the m50 most frecuent words in each of the dictionaries.

In [7]:
# Defining base dictionaries as starters
dict_pos = ['incredible', 'good', 'i love', 
                   'i like', 'awesome',
                   'great', 'fantastic', 
                   'excellent', 'brillant',
                   'genius', 'applause', 
                   'well done']

dict_neg = ['awful', 'bad', 'i hate', 
                  "i don't like", 'worst', 
                  'horrible', 'dreadful', 
                  'terrible', 'poor', 'boring',
                  'weak script', 'not funny',
                  'rubbish', 'pointless', 'crap']

In [8]:
# Dividing poitive and negative data 
train_pos = train_data[train_data['label'] == 1].reset_index(drop=True)
train_neg = train_data[train_data['label'] == 0].reset_index(drop=True)

X_train_pos = train_pos['text']
y_train_pos = train_pos['label']
X_train_neg = train_neg['text']
y_train_neg = train_neg['label']

In [15]:
# TF-IDF
vectorizer = TfidfVectorizer(max_features=50,ngram_range=(1,2), min_df=5, stop_words='english')

#Positive values
X_pos = vectorizer.fit_transform(X_train_pos)
feature_names = vectorizer.get_feature_names_out()
dense_pos = X_pos.todense()
denselist_pos = dense_pos.tolist()
df_pos = pd.DataFrame(denselist_pos, columns=feature_names)

#Negative values
X_neg = vectorizer.fit_transform(X_train_neg)
feature_names = vectorizer.get_feature_names_out()
dense_neg = X_neg.todense()
denselist_neg = dense_neg.tolist()
df_neg = pd.DataFrame(denselist_neg, columns=feature_names)

In [16]:
#New words for the dictionary
new_words_pos = df_pos.columns.tolist()
new_words_neg = df_neg.columns.tolist()

#Adding new words to old dictionaries
dict_pos_new = dict_pos
dict_neg_new = dict_neg

dict_pos_new.extend(new_words_pos)
dict_neg_new.extend(new_words_neg)

### 2.2 Regex model

As a first baseline, we used the previously created dictioary and a model using regexes. We used regular expressions to count the number of times the words in the dictionaries appear in the texts, and then labeled the texts as either positive or negative based on the relative frequency of positive and negative words in the texts.

In [23]:
def get_metrics(y_trues, y_preds, verbose=True):

  recall = recall_score(y_trues, y_preds) * 100
  precision = precision_score(y_trues, y_preds) * 100
  f1 = f1_score(y_trues, y_preds) * 100

  if verbose:
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1: {f1:.2f}')

  return recall, precision, f1

def get_outputs(texts, dictionary):
  """
  Objective: from the texts and a dictionnary of inputs, outputs 0 or 1

  Inputs:
    - texts, list: the list of texts
    - dictionary, dict or list: the list of words that should output the number of words in the texts
  Outputs:
    - ouptuts, list: counts of the dicitonary's words in the texts
  """
  outputs = []
  for text in texts:
    founds = re.findall(r'\b(?:{})\b'.format('|'.join(dictionary)), text)
    n = len(founds)
    outputs.append(n)

  return outputs


def get_final_outputs(positive_outputs, negative_outputs):
  """
  Objective: decision function of the two labeling functions for positive and negative

  Inputs:
    - positive_outputs, list: the list of outputs for the positive dictionary
    - negative_outputs; list: the list of outputs for the negative dictionary
  Outputs:
    - outputs, list: the same shape of inputs, that gives 1, 0 or -1 if does not know
  """
  assert len(positive_outputs) == len(negative_outputs), 'ValueError: both lists should have the same size'
  outputs = []
  for pos, neg in zip(positive_outputs, negative_outputs):
    if pos > neg:
      outputs.append(1)
    elif pos < neg:
      outputs.append(0)
    else:
      outputs.append(-1)

  return outputs

def get_dictionary_metrics(my_texts, trues, good_dictionary, bad_dictionary,
                           verbose=True):
  """
  Objective: Automate the loop

  Inputs:
    - my_texts, list: the list of texts
    - trues, np.array: the true outputs to look for
    - positive_outputs, list: the list of outputs for the positive dictionary
    - negative_outputs; list: the list of outputs for the negative dictionary
    - verbose, boolean: display the metrics
  Outputs:
    - precision, float: precision score
    - recall, float: recall score
  """
  positive_outputs = get_outputs(my_texts, good_dictionary)
  negative_outputs = get_outputs(my_texts, bad_dictionary)

  outputs = np.array(get_final_outputs(positive_outputs, negative_outputs))
  _outputs = outputs.copy()
  outputs[outputs == -1] = 1 - trues[outputs == -1]
  recall, precision, f1 = get_metrics(trues, outputs, verbose=verbose) 

  return _outputs

final_outputs = get_dictionary_metrics(my_texts, trues, dict_pos_new, dict_neg_new)



Precision: 71.31
Recall: 73.22
F1: 72.25


### 2.3. Logistic regression

Apart from the previos model, we wanted to explore which would be the results with a simple model like the logistic regression. We trained a logistic regression model with balanced class weights also using the TF-IDF representation of the training data.

In [52]:
clf = LogisticRegression(class_weight='balanced', 
                         random_state=11, max_iter=1000)

X_train_tfidf = vectorizer.fit_transform(X_train)

clf.fit(X_train_tfidf, y_train)
y_preds = clf.predict(vectorizer.transform(X_test))
r_tfidf = get_metrics(y_test, y_preds, verbose=True)

Precision: 88.15
Recall: 88.26
F1: 88.21


We can see that it performed pretty well, obtaining much better results as the Spacy rule based matching model. 