In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets

pd.options.display.max_colwidth = 200
%matplotlib inline

In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/Data/cleaned_train_df.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Data/cleaned_test_df.csv')

print(df_train.shape)
df_train.head()

(35000, 2)


Unnamed: 0,clean_review,sentiment
0,much love train stomach movie premise one could steal locomotive drive arkansas chicago without hit another train along way right impossible plot line hit board imagine two disgruntle nasa employe...,0
1,good ppv like wrestlemania xx year later wwe cram many match match useless go go every match card would take forever however major highlight include huge pop demolition win tag team belt haku andr...,1
2,find right word everybody problem vaudeville type urban comedy know say know say embark potentially humiliate enterprise pre arrange speed date unfortunately come across cardboard character rather...,0
3,really suprised movie get high rating imdb one movie could easily get someone romantic comedy moonstruck really class set ethnic charm thing people seem take granted cast alone make nearly perfect...,1
4,start confess tend really enjoy action movie military guy dreck awful saw free showtime hbo still feel paid much prolong episode general hospital give possible others would give rating boggles min...,0


In [5]:
print(df_test.shape)
df_test.head()

(15000, 2)


Unnamed: 0,clean_review,sentiment
0,really liked summerslam due look arena curtain look overall interesting reason anyways could one best summerslam ever wwf lex luger main event yokozuna time ok huge fat man v strong man glad time ...,1
1,many television show appeal quite many different kind fan like farscape know youngster year old fan male female many different country think adore v miniseries elements find almost every show v ch...,1
2,film quickly get major chase scene ever increase destruction first really bad thing guy hijack steven seagal would beat pulp seagal driving probably would end whole premise movie seem like decide ...,0
3,jane austen would definitely approve one gwyneth paltrow awesome job capture attitude emma funny without excessively silly yet elegant put convince british accent british maybe best judge fool als...,1
4,expectation somewhat high go see movie think steve carell could wrong come great movie like anchorman year old virgin little miss sunshine boy wrong start right movie certain point steve carell al...,0


In [6]:
X_train = df_train['clean_review']
y_train = df_train['sentiment']
X_test = df_test['clean_review']
y_test = df_test['sentiment']

#Word2Vec using Gensim

In [7]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=[review.split() for review in X_train], vector_size=100, window=5, min_count=1)

# Get the word embeddings for each review
X_train_w2v = np.array([np.mean([w2v_model.wv[word] for word in review.split() if word in w2v_model.wv]
                                or [np.zeros(100)], axis=0) for review in X_train])

X_test_w2v = np.array([np.mean([w2v_model.wv[word] for word in review.split() if word in w2v_model.wv]
                               or [np.zeros(100)], axis=0) for review in X_test])

#GloVe Embedding

In [8]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-12-08 22:16:16--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-12-08 22:16:16--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-12-08 22:16:17--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [9]:
# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the 100-dimensional GloVe embeddings
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')

# Create embedding for reviews
def get_glove_embedding(review, glove_embeddings, embed_dim=100):
    words = review.split()
    embedding = np.mean([glove_embeddings.get(word, np.zeros(embed_dim)) for word in words], axis=0)
    return embedding

# Apply GloVe embeddings to your dataset
X_train_glove = np.array([get_glove_embedding(review, glove_embeddings) for review in X_train])
X_test_glove = np.array([get_glove_embedding(review, glove_embeddings) for review in X_test])


In [10]:
!pip install transformers
!pip install torch



In [11]:
from transformers import BertTokenizer, BertModel
import torch


# Initialize BERT tokenizer and model (move model to GPU)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to('cuda')  # Move model to GPU

# Function to get BERT embeddings for a sentence
def get_bert_embedding(text, model, tokenizer, max_length=512):
    # Tokenize input text and move input tensors to GPU
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length).to('cuda')

    with torch.no_grad():  # Disable gradient calculation during inference
        outputs = model(**inputs)

    # Get the embeddings from the last layer
    embeddings = outputs.last_hidden_state
    # Use the [CLS] token embedding (index 0) as the representation of the sentence
    sentence_embedding = embeddings[0, 0, :].cpu().numpy()  # Move result back to CPU

    return sentence_embedding

# Apply BERT embeddings to your dataset
X_train_bert = np.array([get_bert_embedding(review, model, tokenizer) for review in X_train])
X_test_bert = np.array([get_bert_embedding(review, model, tokenizer) for review in X_test])



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#Comparison Using Simple FeedForward Neural Network as Baseline Model

In [12]:
!pip install tensorflow



In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd

# Initialize an empty list to store results
results = []

# Function to define a simple feedforward neural network
def create_ffnn(input_dim):
    model = Sequential([
        Dense(128, input_dim=input_dim, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')  # Change to softmax if multiclass
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Function to train, evaluate, and store results
def train_evaluate_ffnn(X_train, X_test, y_train, y_test, method_name):
    input_dim = X_train.shape[1]

    # Create the FFNN model
    model = create_ffnn(input_dim)

    # Train the model
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Predict on test data
    y_pred = (model.predict(X_test) > 0.5).astype("int32")

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append the results
    results.append({
        'Method': method_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    })

# Train and evaluate the FFNN with Word2Vec embeddings
train_evaluate_ffnn(X_train_w2v, X_test_w2v, y_train, y_test, 'Word2Vec')

# Train and evaluate the FFNN with GloVe embeddings
train_evaluate_ffnn(X_train_glove, X_test_glove, y_train, y_test, 'GloVe')

# Train and evaluate the FFNN with BERT embeddings
train_evaluate_ffnn(X_train_bert, X_test_bert, y_train, y_test, 'BERT')

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Display the comparison table
print(results_df)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
     Method  Accuracy  Precision    Recall  F1-score
0  Word2Vec  0.857267   0.848427  0.874028  0.861037
1     GloVe  0.801933   0.805181  0.802741  0.803959
2      BERT  0.812733   0.806567  0.828568  0.817420


In [None]:
import pandas as pd

# List of data pairs to save (training and testing sets)
datasets = [
    ('X_train_bow', 'X_test_bow'),
    ('X_train_tfidf', 'X_test_tfidf'),
    ('X_train_w2v', 'X_test_w2v'),
    ('X_train_glove', 'X_test_glove'),
    ('X_train_bert', 'X_test_bert')
]

# Google Drive folder path
drive_folder = '/content/drive/MyDrive/Data/'

# Save each pair to a CSV file in the specified folder
for train_name, test_name in datasets:
    # Get the training and testing arrays
    train_data = globals()[train_name]
    test_data = globals()[test_name]

    # Convert arrays to DataFrame
    df_train = pd.DataFrame(train_data)
    df_test = pd.DataFrame(test_data)

    # Save as CSV files in Google Drive folder
    df_train.to_csv(f'{drive_folder}{train_name}.csv', index=False)
    df_test.to_csv(f'{drive_folder}{test_name}.csv', index=False)

    print(f"Saved {train_name}.csv and {test_name}.csv to Google Drive")


Saved X_train_bow.csv and X_test_bow.csv to Google Drive
Saved X_train_tfidf.csv and X_test_tfidf.csv to Google Drive
Saved X_train_w2v.csv and X_test_w2v.csv to Google Drive
Saved X_train_glove.csv and X_test_glove.csv to Google Drive
Saved X_train_bert.csv and X_test_bert.csv to Google Drive


In [None]:
import pandas as pd

# Path to the folder where the CSV files are saved
folder_path = '/content/drive/MyDrive/Data/'

# Read the CSV files and assign them to variables
X_train_bow = pd.read_csv(f'{folder_path}X_train_bow.csv').values
X_test_bow = pd.read_csv(f'{folder_path}X_test_bow.csv').values

X_train_tfidf = pd.read_csv(f'{folder_path}X_train_tfidf.csv').values
X_test_tfidf = pd.read_csv(f'{folder_path}X_test_tfidf.csv').values

X_train_w2v = pd.read_csv(f'{folder_path}X_train_w2v.csv').values
X_test_w2v = pd.read_csv(f'{folder_path}X_test_w2v.csv').values

X_train_glove = pd.read_csv(f'{folder_path}X_train_glove.csv').values
X_test_glove = pd.read_csv(f'{folder_path}X_test_glove.csv').values

X_train_bert = pd.read_csv(f'{folder_path}X_train_bert.csv').values
X_test_bert = pd.read_csv(f'{folder_path}X_test_bert.csv').values

# Print the shapes of the variables to confirm they are loaded correctly
print(f'X_train_bow shape: {X_train_bow.shape}')
print(f'X_test_bow shape: {X_test_bow.shape}')
print(f'X_train_tfidf shape: {X_train_tfidf.shape}')
print(f'X_test_tfidf shape: {X_test_tfidf.shape}')
print(f'X_train_w2v shape: {X_train_w2v.shape}')
print(f'X_test_w2v shape: {X_test_w2v.shape}')
print(f'X_train_glove shape: {X_train_glove.shape}')
print(f'X_test_glove shape: {X_test_glove.shape}')
print(f'X_train_bert shape: {X_train_bert.shape}')
print(f'X_test_bert shape: {X_test_bert.shape}')


X_train_bow shape: (35000, 5000)
X_test_bow shape: (15000, 5000)
X_train_tfidf shape: (35000, 5000)
X_test_tfidf shape: (15000, 5000)
X_train_w2v shape: (35000, 100)
X_test_w2v shape: (15000, 100)
X_train_glove shape: (35000, 100)
X_test_glove shape: (15000, 100)
X_train_bert shape: (35000, 768)
X_test_bert shape: (15000, 768)


In [15]:
from sklearn.decomposition import PCA

# Number of principal components to keep (you can adjust this as needed)
n_components = 5000  # You can modify this value

# Initialize Logistic Regression model
model = LogisticRegression(max_iter=1000)

# Function to apply PCA to the feature set
def apply_pca(X_train, X_test, n_components=100):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

# Function to train and evaluate the model, and store results in a list
def train_evaluate_with_pca(X_train, X_test, y_train, y_test, method_name):
    X_train_pca, X_test_pca = apply_pca(X_train, X_test)
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    # Get classification report metrics
    report = classification_report(y_test, y_pred, output_dict=True)

    # Extract precision, recall, and f1-score for each class
    precision = report['accuracy']  # Overall accuracy
    recall = np.mean([report[str(i)]['recall'] for i in range(2)])  # Average recall for 2 classes
    f1 = np.mean([report[str(i)]['f1-score'] for i in range(2)])  # Average F1-score for 2 classes

    return {
        'Method': method_name,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }

# Initialize an empty list to store results
results = []

# Apply PCA and evaluate each feature extraction method
results.append(train_evaluate_with_pca(X_train_w2v, X_test_w2v, y_train, y_test, 'Word2Vec'))
results.append(train_evaluate_with_pca(X_train_glove, X_test_glove, y_train, y_test, 'GloVe'))
results.append(train_evaluate_with_pca(X_train_bert, X_test_bert, y_train, y_test, 'BERT'))

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Display the comparison table
results_df

Unnamed: 0,Method,Precision,Recall,F1-score
0,Word2Vec,0.8542,0.854043,0.85412
1,GloVe,0.793667,0.793648,0.793642
2,BERT,0.799667,0.799755,0.799666
