# Project 2 - Text classification

## Import libraries and files

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# import helpers
# import cooc
# import pickle_vocab
# import glove_template
# import glove_solution

# Generate embeddings using ALBERT

In [3]:
!pip install transformers



In [4]:
from transformers import AlbertTokenizer, AlbertModel
import torch

# Load pre-trained ALBERT tokenizer and model
model_name = "albert-base-v2"  # You can also use "albert-large-v2" or others
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertModel.from_pretrained(model_name)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

In [7]:
# Function to read tweets from a file
def read_tweets(file_path):
    with open(file_path, 'r') as file:
         tweets = [line.strip() for line in file.readlines()]
    return tweets

# Read positive and negative tweets from the respective files
pos_tweets = read_tweets("twitter-datasets/train_pos.txt")
neg_tweets = read_tweets("twitter-datasets/train_neg.txt")

tweets = pos_tweets + neg_tweets

In [9]:
from torch.utils.data import DataLoader, TensorDataset

# Tokenize the sentences
tokens = tokenizer(tweets, padding=True, truncation=True, return_tensors="pt")

# Move tokens to GPU
tokens = {key: val.to(device) for key, val in tokens.items()}


# Create a TensorDataset from your tokens
dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'])  # Add other token types if needed

# Create a DataLoader for batch processing
batch_size = 16  # Adjust this based on your GPU memory capacity
dataloader = DataLoader(dataset, batch_size=batch_size)

# Initialize an empty list to store embeddings
all_embeddings = []

# Process tweets in batches
with torch.no_grad():
    for batch in dataloader:
        input_ids, attention_mask = batch  # Unpack the batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Pass the batch to the model
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Extract embeddings for the batch
        batch_embeddings = outputs.last_hidden_state[:, 0, :]

        # Append batch embeddings to the list
        all_embeddings.append(batch_embeddings.cpu().numpy())

# Concatenate all embeddings
cls_embeddings_np = np.concatenate(all_embeddings, axis=0)

In [10]:
# Save embeddings to a .npy file
np.save("albert_embeddings.npy", cls_embeddings_np)
print("ALBERT embeddings saved as 'albert_embeddings.npy'")

ALBERT embeddings saved as 'albert_embeddings.npy'


## Load data

In [None]:
# file paths (NOT FULL DATASETS FOR NOW!!!)
DATASETS_FOLDER = 'twitter-datasets'
POS_FILE = os.path.join(DATASETS_FOLDER, 'train_pos.txt')
NEG_FILE = os.path.join(DATASETS_FOLDER, 'train_neg.txt')
TEST_FILE = os.path.join(DATASETS_FOLDER, 'test_data.txt')
VOCAB_FILE = 'vocab.pkl'
EMBEDDINGS_FILE = 'word2vec_embeddings.npy'

# download nltk ressources
nltk.download('punkt')

# Load vocabulary
with open(VOCAB_FILE, 'rb') as f:
    vocab = pickle.load(f)  # word -> index

# Load embeddings
embeddings = np.load(EMBEDDINGS_FILE)  # shape: (vocab_size, embedding_dim)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nathan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
def load_tweets(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        tweets = f.readlines()
    # Remove newline characters
    tweets = [tweet.strip() for tweet in tweets]
    return tweets

def load_test_tweets(file_path):
    tweet_ids = []
    tweets = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Each line is in the format: "<tweet_id>,<tweet_text>"
            # In this dataset, test tweets are numbered but may not include commas
            tweet = line.strip()
            if tweet:
                tweet_ids.append(len(tweet_ids) + 1)  # Assuming tweet IDs are 1-based indices
                tweets.append(tweet)
    return tweet_ids, tweets

In [None]:
# load all data (NOT FULL DATASETS FOR NOW!!!)
pos_tweets = load_tweets(POS_FILE)
neg_tweets = load_tweets(NEG_FILE)
test_ids, test_tweets = load_test_tweets(TEST_FILE)

# labels
pos_labels = [1] * len(pos_tweets)
neg_labels = [0] * len(neg_tweets)

# concatenation
all_tweets = pos_tweets + neg_tweets
all_labels = pos_labels + neg_labels

## Embeddings

In [None]:
# Represent each tweet as an average of its word embeddings
def tweet_to_embedding(tweet, vocab, embeddings):
    words = tweet.split()  # Tweets are already tokenized
    indices = [vocab.get(word) for word in words if word in vocab]
    if not indices:
        # If no words in vocab, return zero vector
        return np.zeros(embeddings.shape[1])
    word_vectors = embeddings[indices]
    tweet_embedding = np.mean(word_vectors, axis=0)
    return tweet_embedding

In [None]:
# Compute embeddings for all tweets
tweet_embeddings = np.array([tweet_to_embedding(tweet, vocab, embeddings) for tweet in all_tweets])
test_embeddings = np.array([tweet_to_embedding(tweet, vocab, embeddings) for tweet in test_tweets])

# Convert labels to numpy array
labels = np.array(all_labels)

## Split data

In [None]:
# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(tweet_embeddings, labels, test_size=0.1, random_state=42)

In [None]:
# For training data
print("X_train shape:", X_train.shape)
print("Number of samples:", X_train.shape[0])
print("Number of features:", X_train.shape[1])

X_train shape: (180000, 300)
Number of samples: 180000
Number of features: 300


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

## Check for class imbalance

In [None]:
from collections import Counter

# Assuming y_train contains your training labels
class_counts = Counter(y_train)
print("Class distribution in training set:", class_counts)

# For a binary classification problem
num_positive = class_counts[1]
num_negative = class_counts[0]
total_samples = num_positive + num_negative

print(f"Number of positive samples: {num_positive} ({(num_positive/total_samples)*100:.2f}%)")
print(f"Number of negative samples: {num_negative} ({(num_negative/total_samples)*100:.2f}%)")

unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))
print("Class distribution in training set:", class_distribution)

Class distribution in training set: Counter({1: 90039, 0: 89961})
Number of positive samples: 90039 (50.02%)
Number of negative samples: 89961 (49.98%)
Class distribution in training set: {0: 89961, 1: 90039}


No class imbalance (approx 50/50)

## Model selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Initialize classifiers
logistic_regression = LogisticRegression(max_iter=1000)
random_forest = RandomForestClassifier()
svm = LinearSVC(dual=False)

# Train classifiers
logistic_regression.fit(X_train_scaled, y_train)
random_forest.fit(X_train, y_train)
svm.fit(X_train_scaled, y_train)

LinearSVC(dual=False)

Note: if we want to test Naive Bayes as well, we must either replace MultinomialNB with GaussianNB (because multinomialNB doesn't handle negative values) or use feature representations w/non-negative counts (bag-of-words, TF-IDF vectors) with MultinomialNB (which is probably more suitable).

### Evaluate performance of each classifier and select the best one

In [None]:
def evaluate_classifier(classifier, X_test, y_test):
    y_preds = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_preds)
    report = classification_report(y_test, y_preds)
    return accuracy, report

In [None]:
# Evaluate classifiers
logistic_accuracy, logistic_report = evaluate_classifier(logistic_regression, X_val, y_val)
random_forest_accuracy, random_forest_report = evaluate_classifier(random_forest, X_val, y_val)
svm_accuracy, svm_report = evaluate_classifier(svm, X_val, y_val)

# Select the best-performing model
best_model = max([(logistic_accuracy, 'Logistic Regression'),
                  (random_forest_accuracy, 'Random Forest'),
                  (svm_accuracy, 'SVM')], key=lambda x: x[0])

print("Best-performing model:", best_model[1])
print("Accuracy:", best_model[0])
print("Classification report:\n", evaluate_classifier(globals()[best_model[1].lower().replace(' ', '_')], X_val, y_val)[1])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best-performing model: Random Forest
Accuracy: 0.72185
Classification report:
               precision    recall  f1-score   support

           0       0.74      0.68      0.71     10039
           1       0.70      0.76      0.73      9961

    accuracy                           0.72     20000
   macro avg       0.72      0.72      0.72     20000
weighted avg       0.72      0.72      0.72     20000



### Hyperparameter tuning for the best-performing model

The best-performing model among Random Forest, SVM and Logistic Regression is apparently Random Forest.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Create the parameters grid

param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 50],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

In [None]:
# Cross validation with randomized search

cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,
    cv=cv_strategy,
    verbose=2,
    random_state=42,
    n_jobs=-1,  # Use all available cores
    scoring='accuracy'
)

In [None]:
# CAN BE REALLY LONG!!!
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits




[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=100; total time= 4.0min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=100; total time= 4.0min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=100; total time= 4.0min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=300; total time=11.0min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=300; total time=10.4min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=300; total time=10.5min
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time= 3.3min
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=500; total time=17.0min
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time= 3.4min
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time= 3.4min
[CV] END criterion=gini, max_d

KeyboardInterrupt: 

In [None]:
# Display the best parameters
print("Best Parameters found:")
print(rf_random.best_params_)

# Evaluate the best estimator on the validation set
best_rf = rf_random.best_estimator_

# Predict on validation data
y_val_pred = best_rf.predict(X_val)

# Compute accuracy
best_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy after tuning: {best_accuracy:.4f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

After this: we can perform a more focused search with GridSearchCV