# Comparison of several algorithm for sentiment analysis

This notebook is highly inspired and partially copid from https://towardsdatascience.com/boosting-showdown-scikit-learn-vs-xgboost-vs-lightgbm-vs-catboost-in-sentiment-classification-f7c7f46fd956 

In [2]:
# Importing packages
from time import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib
matplotlib.rcParams["figure.dpi"] = 125
import matplotlib.pyplot as plt

import os
import glob
import sys

random_state = 42

## Step 1: Loading and preprocessin data

In [3]:
# Load data from web and prepare it to create the csv
%mkdir -p ../data
!wget -nc -O ../data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar --skip-old-files -zxf ../data/aclImdb_v1.tar.gz -C ../data

data_dir = "../data/algoshowdown"
imdb_rawdata_dir = "../data/aclImdb"
%mkdir -p ../data/algoshowdown

def prepare_imdb_data(dataset_type):
    if dataset_type not in ["train", "test"]:
        raise ValueError("dataset_type can only be 'train' or 'test'")
    
    output_path = "{}/{}.csv".format(data_dir, dataset_type)
    
    if os.path.isfile(output_path):
        print("{} already exist, nothin to do!".format(output_path))
        return
    
    input_path = "{}/{}".format(imdb_rawdata_dir, dataset_type)
    
    data_rows = []
    for sentiment in ["pos", "neg"]:
        folder = "{}/{}".format(input_path, sentiment)
        
        if sentiment == "neg":
            is_negative = 1
        else:
            is_negative = 0
            
        print("importing data from '{}'... ".format(folder), end="")
        counter = 0
        
        for filename in glob.glob("{}/*.txt".format(folder)):
            counter += 1
            with open(filename, "r") as fp:
                text = fp.read()
                
                data_rows.append({"text": text, "negative": is_negative})
        
        print("{} reviews read".format(counter))
    
    df = pd.DataFrame(data_rows)
    df.to_csv(output_path)
    print("reviews from '{}' saved to '{}'".format(input_path, output_path))
            
prepare_imdb_data("test")
prepare_imdb_data("train")

File ‘../data/aclImdb_v1.tar.gz’ already there; not retrieving.
../data/algoshowdown/test.csv already exist, nothin to do!
../data/algoshowdown/train.csv already exist, nothin to do!


In [25]:
# Loading data from created csv files
df_train = pd.read_csv('{}/train.csv'.format(data_dir)).sample(frac=1.0)
df_test = pd.read_csv('{}/test.csv'.format(data_dir)).sample(frac=1.0)

# Preprocessing our data
tfidf = TfidfVectorizer(max_features=2000)
X_train = tfidf.fit_transform(df_train['text']).toarray()
X_test = tfidf.transform(df_test['text']).toarray()
y_train, y_test = df_train['negative'], df_test['negative']
X_train_sub, X_valid, y_train_sub, y_valid = train_test_split(
    X_train, y_train, test_size=0.1, random_state=random_state)

# Setting up our results dataframe
df_results = pd.DataFrame(columns=['accuracy', 'run_time'])

## Initializing the algorithms

In [5]:
# models should a set populated with tuples (classifier_instance, is_early_stopping_needed_bool).
# ex: models.append((myinstance, False))
models = set()

def add_model_to_showdown(model, need_early_stopping=False):
    for registered_model, es in models:
        if type(registered_model) == type(model):
            return
    
    models.add((model, need_early_stopping))

### Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(max_depth=12, random_state=random_state)

add_model_to_showdown(classifier, False) # No need for early stopping

### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

claasifier = RandomForestClassifier(n_estimators=500,
                                    max_features=0.06,
                                    n_jobs=6,
                                    random_state=random_state)

add_model_to_showdown(classifier, False) # No need for early stopping

### Adaptive Boosting Classifier

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base_estim = DecisionTreeClassifier(max_depth=1, max_features=0.06)

classifier = AdaBoostClassifier(base_estimator=base_estim,
                                n_estimators=500,
                                learning_rate=0.5,
                                random_state=random_state)

add_model_to_showdown(classifier, False) # No need for early stopping

### Gradient Boosting Classifier

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

classifier = GradientBoostingClassifier(n_estimators=2000,
                                 subsample=0.67,
                                 max_features=0.06,
                                 validation_fraction=0.1,
                                 n_iter_no_change=15,
                                 verbose=0,
                                 random_state=random_state)

add_model_to_showdown(classifier, False) # No need for early stopping

### HistGradient Boosting Classifier

In [9]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

classifier = HistGradientBoostingClassifier(max_iter=2000,
                                            validation_fraction=0.1,
                                            n_iter_no_change=15,
                                            verbose=0,
                                            random_state=random_state)

add_model_to_showdown(classifier, False) # No need for early stopping

### Support Vector Machine (Linear)

In [10]:
from sklearn.svm import LinearSVC


### XGBoost Classifier

In [11]:
from xgboost import XGBClassifier

classifier = XGBClassifier(n_estimators=2000,
                           tree_method='gpu_hist',
                           subsample=0.67,
                           colsample_level=0.06,
                           verbose=0,
                           n_jobs=6,
                           random_state=random_state)

add_model_to_showdown(classifier, True) # Need for early stopping

### Long Short-Term Memory Classifier (Pytorch)

In [99]:
import torch
import torch.nn as nn
from torch import optim
import sys


class LSTMClassifier(nn.Module):
    """
    This is the simple RNN model we will be using to perform Sentiment Analysis.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        """
        Initialize the model by setting up the various layers.
        """
        super(LSTMClassifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.dense = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sig = nn.Sigmoid()

        self.word_dict = None

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.criterion = torch.nn.BCELoss().to(self.device)
        self.optimizer = optim.Adam(self.parameters())
        #self.scheduler = torch.optim.lr_scheduler.StepLR(
        #    self.optimizer, 1, gamma=0.9)
        self.batch_size = 50

    def forward(self, x):
        """
        Perform a forward pass of our model on some input.
        """
        embeds = self.embedding(reviews)
        lstm_out, _ = self.lstm(embeds)
        out = self.dense(lstm_out)
        out = out[lengths - 1, range(len(lengths))]
        return self.sig(out.squeeze())

    def train(self, X, y):
        super(LSTMClassifier, self).train()
        
        # Convert X and y to tensors
        X_tensor = torch.from_numpy(X).long().to(self.device)
        y_tensor = torch.from_numpy(y.values).float().squeeze().to(self.device)

        # Build the dataloader
        loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X_tensor, y_tensor),
            batch_size=self.batch_size,
        )

        # Train the model
        train_loss = 0
        train_acc = 0

        for i, (text, label) in enumerate(loader):
            self.optimizer.zero_grad()
            output = self(text)
            loss = self.criterion(output, label)
            train_loss += loss.item()
            loss.backward()
            self.optimizer.step()
            train_acc += (output.round() == label).sum().item()

        # Adjust the learning rate
        #self.scheduler.step()

        return train_loss / len(y), train_acc / len(y)

    def test(self, X, y):
        # Convert X and y to tensors
        X_tensor = torch.from_numpy(X).long().to(self.device)
        y_tensor = torch.from_numpy(y.values).float().squeeze().to(self.device)

        # Build the dataloader
        loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(X_tensor, y_tensor),
            batch_size=self.batch_size,
        )

        loss = 0
        acc = 0

        for i, (text, label) in enumerate(loader):
            with torch.no_grad():
                output = self(text)
                loss = self.criterion(output, label)
                loss += loss.item()
                acc += (output.round() == label).sum().item()

        return loss / len(y), acc / len(y)

    def fit(self, X_train,
            y_train,
            epochs=20,
            eval_set=None,
            early_stopping_rounds=3,
            verbose=0,
            ):

        
        if verbose:
            print("Training model using {}".format(self.device), end="")
            if str(device) == "cuda":
                print(" on {}".format(torch.cuda.get_device_name(
                    torch.cuda.current_device())))
            else:
                print(" (no info)")
            sys.stdout.flush()

        last_valid_acc = 0
        n_bad_epochs = 0
        
        for epoch in range(1, epochs + 1):

            if verbose:
                print("Running epoch #{}... ".format(epoch), end="")
                sys.stdout.flush()

            start_time = time()
            train_loss, train_acc = self.train(X_train, y_train)

            if eval_set is not None:
                valid_loss, valid_acc = self.test(
                    eval_set[0][0], eval_set[0][1])

            secs = int(time() - start_time)

            if verbose:
                print(
                    f" Done in {secs}s, Loss: {train_loss:.4f}(train) | Acc: {train_acc * 100:.1f}%(train)", 
                    end="")
                if eval_set is None:
                    print("")
                else:
                    print(
                    f" , Loss: {valid_loss:.4f}(valid) | Acc: {valid_acc * 100:.1f}%(valid)")
                sys.stdout.flush()
            
            if eval_set is not None:
                if valid_acc - last_valid_acc < 0:
                    n_bad_epochs += 1
                if n_bad_epochs > early_stopping_rounds:
                    print("Stopping training to avoid overfitting...")
                    break

                last_valid_acc = valid_acc


classifier = LSTMClassifier(32, 64, len(tfidf.vocabulary_)).to(device)

classifier.fit(X_train_sub[:100],
               y_train_sub[:100],
               eval_set=[(X_valid[:100], y_valid[:100])],
               early_stopping_rounds=5,
               verbose=1,
               )

Training model using cuda on GeForce RTX 2080 Ti
Running epoch #1... torch.Size([50])
torch.Size([1999, 50])


ZeroDivisionError: division by zero

In [68]:
y_valid.values

array([1, 0, 1, ..., 0, 1, 1])

## Training and testing the algorithms

In [14]:

for model, es_needed in models:
    name = model.__class__.__name__
    
    print("Training {}... ".format(name), end="")
    sys.stdout.flush()
    
    start_time = time()
    if es_needed:
        model.fit(X_train_sub,
              y_train_sub,
              eval_set = [(X_valid, y_valid)],
              early_stopping_rounds=5,
              verbose=0)
    else:
        model.fit(X_train, y_train)
    
    run_time = time() - start_time
    accuracy = np.mean(model.predict(X_test) == y_test)
        
    df_results.loc[name] = [accuracy, run_time]
    
    print("Done in {:.3}s".format(run_time))
    
    del model

Training XGBClassifier... Done in 41.6s
Training HistGradientBoostingClassifier... Done in 61.3s
Training DecisionTreeClassifier... Done in 13.8s
Training AdaBoostClassifier... Done in 1.29e+02s
Training GradientBoostingClassifier... Done in 1.07e+02s


## Displaying results

In [1]:
plt.bar(range(len(df_results)), df_results["accuracy"])
plt.show()

NameError: name 'plt' is not defined