In [1]:
# Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import time

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
start_time = time.time()
df = pd.read_csv('preprocessed_data.csv')
end_time = time.time()
print("Time taken:", end_time - start_time, "seconds")
df.head()

Time taken: 8.854186058044434 seconds


Unnamed: 0,productId,thumbsup,thumbsdown,sentiment,preprocessed_text,preprocessed_summary
0,B005LAIHE0,1,0,1,rented expecting somewhat cheesy rom com under...,better expected
1,B002GTZSZU,2,1,1,always loved classical music movie made stick ...,great movie
2,B0004Z3558,1,0,1,good investment u husband really love karate h...,cute kid
3,B00004U3ZU,25,0,1,mario bava founding father italian horror see ...,landmark horror movie
4,B001B73PO4,1,2,1,entire series watched year old winter school b...,liberty kid


In [3]:
df['total_text'] = df['preprocessed_summary']+' '+df['preprocessed_text']
df.count()

productId               1999965
thumbsup                1999965
thumbsdown              1999965
sentiment               1999965
preprocessed_text       1999953
preprocessed_summary    1980254
total_text              1980243
dtype: int64

In [4]:
df = df.dropna(how='any')
df = df.reset_index(drop=True)
df.count()

productId               1980243
thumbsup                1980243
thumbsdown              1980243
sentiment               1980243
preprocessed_text       1980243
preprocessed_summary    1980243
total_text              1980243
dtype: int64

In [5]:
# Get a random sample of 100 rows with sentiment equal to Positive
random_sample_sentiment_pos = df[df['sentiment'] == 1].sample(n=50000, random_state=42)

# Get a random sample of 100 rows with sentiment equal to Negative
random_sample_sentiment_neg = df[df['sentiment'] == -1].sample(n=50000, random_state=42)

random_samples_combined = pd.concat([random_sample_sentiment_pos, random_sample_sentiment_neg])

X = random_samples_combined.drop(columns=['sentiment'], axis=1)
y = random_samples_combined['sentiment']

y = y.replace(-1, 0)

In [6]:
X

Unnamed: 0,productId,thumbsup,thumbsdown,preprocessed_text,preprocessed_summary,total_text
260745,B00004VXTF,0,1,wanted upgrade vhs dvd also got share movie so...,pulp,pulp wanted upgrade vhs dvd also got share mov...
397160,B00005J4A7,2,2,good movie every angle good winning evil age o...,great entertainer,great entertainer good movie every angle good ...
2725,B0051GOB26,0,1,got boyfriend christmas loved shipped really f...,fast shipping great movie,fast shipping great movie got boyfriend christ...
184318,B000059ZAT,2,2,one best classic film good acting good combina...,spartacus,spartacus one best classic film good acting go...
802638,B000WULC0K,5,1,vci entertainment present christmas carol aka ...,christmas carol alastair sim vci raised bar on...,christmas carol alastair sim vci raised bar on...
...,...,...,...,...,...,...
1413560,B000JCSPG0,0,5,well finished watching director cut last night...,quot director cut quot,quot director cut quot well finished watching ...
1185477,B001H1SW4M,18,10,creepy movie sure creepy enough film begin pro...,ridiculous,ridiculous creepy movie sure creepy enough fil...
1343585,B00144N8MI,6,2,generally good photography marred unfocused ed...,overvalued,overvalued generally good photography marred u...
1336372,B0009QZ48I,2,0,read review dvd went ahead bought anyway live ...,listened review,listened review read review dvd went ahead bou...


In [7]:
from sklearn.model_selection import train_test_split

# Splitting the data into training, validation and testing sets (60%, 20%, 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# Print the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (60000, 6) (60000,)
Validation set shape: (20000, 6) (20000,)
Testing set shape: (20000, 6) (20000,)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing

def getTfidfVectorizerFeatures(max_features):
  vectorizer = TfidfVectorizer(max_features=max_features)
  X_train_vectorized = vectorizer.fit_transform(X_train['total_text'])
  X_val_vectorized = vectorizer.transform(X_val['total_text'])
  X_test_vectorized = vectorizer.transform(X_test['total_text'])
  scaler = preprocessing.StandardScaler(with_mean=False)
  X_train_scaled = scaler.fit_transform(X_train_vectorized)
  X_val_scaled = scaler.transform(X_val_vectorized)
  X_test_scaled = scaler.transform(X_test_vectorized)
  return X_train_scaled, X_val_scaled, X_test_scaled

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

def getCountVectorizerFeatures(max_features):
  vectorizer = CountVectorizer(max_features=max_features)
  X_train_vectorized = vectorizer.fit_transform(X_train['total_text'])
  X_val_vectorized = vectorizer.transform(X_val['total_text'])
  X_test_vectorized = vectorizer.transform(X_test['total_text'])
  scaler = preprocessing.StandardScaler(with_mean=False)
  X_train_scaled = scaler.fit_transform(X_train_vectorized)
  X_val_scaled = scaler.transform(X_val_vectorized)
  X_test_scaled = scaler.transform(X_test_vectorized)
  return X_train_scaled, X_val_scaled, X_test_scaled

In [25]:
# Load pre-trained GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

def getGloveEmbeddings(max_features):
    glove_file = '../HW2/glove.6B/'+max_features
    glove_model = load_glove_embeddings(glove_file)

    X_train_vectorized = []
    for text in X_train['total_text']:
        words = text.split()[:5000]
        # mean embedding
        embedding = np.mean([glove_model[word] for word in words if word in glove_model], axis=0)
        X_train_vectorized.append(embedding)

    X_val_vectorized = []
    for text in X_val['total_text']:
        words = text.split()[:5000]
        # mean embedding
        embedding = np.mean([glove_model[word] for word in words if word in glove_model], axis=0)
        X_val_vectorized.append(embedding)

    X_test_vectorized = []
    for text in X_test['total_text']:
        words = text.split()[:5000]
        # mean embedding
        embedding = np.mean([glove_model[word] for word in words if word in glove_model], axis=0)
        X_test_vectorized.append(embedding)

    return X_train_vectorized, X_val_vectorized, X_test_vectorized

In [11]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression

def LogisticRegressionModel(X_train_scaled, X_val_scaled):
  start_time = time.time()
  lr_cv_model = LogisticRegression(penalty='l2',max_iter=1000,C=1,random_state=42)
  lr_cv_model.fit(X_train_scaled, y_train)

  predicted_train = lr_cv_model.predict(X_train_scaled)
  predicted_val = lr_cv_model.predict(X_val_scaled)

  accuracy_train = accuracy_score(y_train,predicted_train)

  accuracy_val = accuracy_score(y_val,predicted_val)

  end_time = time.time()
  print("Time taken:", end_time - start_time, "seconds")
  print("Training Accuracy: ", accuracy_train)
  print("Validation Accuracy: ", accuracy_val)
  lr_cv_report=classification_report(y_val,predicted_val,target_names=['Positive','Negative'])
  print("Classification Report")
  print(lr_cv_report)
  lr_cv_cm = confusion_matrix(y_val, predicted_val)
  print("Confusion Matrix")
  print(lr_cv_cm)


In [12]:
from sklearn.linear_model import SGDClassifier

def SVMClassifier(X_train_scaled, X_val_scaled):
  start_time = time.time()
  svm_cv_model = SGDClassifier(loss='hinge',max_iter=1000,random_state=42)
  svm_cv_model.fit(X_train_scaled, y_train)

  predicted_train = svm_cv_model.predict(X_train_scaled)
  predicted_val = svm_cv_model.predict(X_val_scaled)

  accuracy_train = accuracy_score(y_train,predicted_train)

  accuracy_val = accuracy_score(y_val,predicted_val)

  end_time = time.time()
  print("Time taken:", end_time - start_time, "seconds")
  print("Training Accuracy: ", accuracy_train)
  print("Validation Accuracy: ", accuracy_val)
  svm_cv_report=classification_report(y_val,predicted_val,target_names=['Positive','Negative'])
  print("Classification Report")
  print(svm_cv_report)
  svm_cv_cm = confusion_matrix(y_val, predicted_val)
  print("Confusion Matrix")
  print(svm_cv_cm)

In [13]:
from sklearn.naive_bayes import MultinomialNB

def MNBClassifier(X_train_scaled, X_val_scaled):
  start_time = time.time()
  mnb_cv_model = MultinomialNB()
  mnb_cv_model.fit(X_train_scaled, y_train)

  predicted_train = mnb_cv_model.predict(X_train_scaled)
  predicted_val = mnb_cv_model.predict(X_val_scaled)

  accuracy_train = accuracy_score(y_train,predicted_train)

  accuracy_val = accuracy_score(y_val,predicted_val)

  end_time = time.time()
  print("Time taken:", end_time - start_time, "seconds")
  print("Training Accuracy: ", accuracy_train)
  print("Validation Accuracy: ", accuracy_val)
  mnb_cv_report=classification_report(y_val,predicted_val,target_names=['Positive','Negative'])
  print("Classification Report")
  print(mnb_cv_report)
  mnb_cv_cm = confusion_matrix(y_val, predicted_val)
  print("Confusion Matrix")
  print(mnb_cv_cm)

In [14]:
from sklearn.naive_bayes import GaussianNB

def GNBClassifier(X_train_scaled, X_val_scaled):
  start_time = time.time()
  gnb_cv_model = GaussianNB()
  gnb_cv_model.fit(X_train_scaled, y_train)

  predicted_train = gnb_cv_model.predict(X_train_scaled)
  predicted_val = gnb_cv_model.predict(X_val_scaled)

  accuracy_train = accuracy_score(y_train,predicted_train)

  accuracy_val = accuracy_score(y_val,predicted_val)

  end_time = time.time()
  print("Time taken:", end_time - start_time, "seconds")
  print("Training Accuracy: ", accuracy_train)
  print("Validation Accuracy: ", accuracy_val)
  gnb_cv_report=classification_report(y_val,predicted_val,target_names=['Positive','Negative'])
  print("Classification Report")
  print(gnb_cv_report)
  gnb_cv_cm = confusion_matrix(y_val, predicted_val)
  print("Confusion Matrix")
  print(gnb_cv_cm)

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.optimizers import Adam
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

def MLPClassifier(X_train_scaled, X_val_scaled):
  start_time = time.time()

  if isinstance(X_train_scaled, list):
      shape = len(X_train_scaled[1])
      X_train_scaled = np.array(X_train_scaled)
      X_val_scaled = np.array(X_val_scaled)
  else:
      shape = X_train_scaled.shape[1]
      X_train_scaled = X_train_scaled.toarray()
      X_val_scaled = X_val_scaled.toarray()
      
  def getModel():
      mlp_classifier = Sequential([
      Dense(128, activation='relu', input_shape=(shape,)),
      Dense(128, activation='relu'),
      Dense(1, activation='sigmoid')
      ])
      optimizer = Adam()
      mlp_classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
      return mlp_classifier

  mlp_model = getModel()
  mlp_model.fit(X_train_scaled, y_train, epochs=20, batch_size=100, verbose=False)

  predicted_train = mlp_model.predict(X_train_scaled)
  predicted_val = mlp_model.predict(X_val_scaled)

  predicted_train = (predicted_train > 0.5).astype(int)
  predicted_val = (predicted_val > 0.5).astype(int)

  accuracy_train = accuracy_score(y_train,predicted_train)

  accuracy_val = accuracy_score(y_val,predicted_val)

  end_time = time.time()
  print("Time taken:", end_time - start_time, "seconds")
  print("Training Accuracy: ", accuracy_train)
  print("Validation Accuracy: ", accuracy_val)
  mlp_cv_report=classification_report(y_val,predicted_val,target_names=['Positive','Negative'])
  print("Classification Report")
  print(mlp_cv_report)
  mlp_cv_cm = confusion_matrix(y_val, predicted_val)
  print("Confusion Matrix")
  print(mlp_cv_cm)


In [26]:
max_feature_sizes = ['glove.6B.50d.txt', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt']
for size in max_feature_sizes:
  print("Glove Embeddings No. of Dimensions = ", size)
  print("")
  X_train_scaled, X_val_scaled, X_test_scaled = getGloveEmbeddings(size)
  print("Logistic Regression")
  print("")
  LogisticRegressionModel(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Support Vector Machine")
  print("")
  SVMClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Gaussian Naive Bayes")
  print("")
  GNBClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Multilayer Perceptron")
  print("")
  MLPClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("-----------------------------------------------------")

Glove Embeddings No. of Dimensions =  glove.6B.50d.txt

Logistic Regression

Time taken: 7.723412036895752 seconds
Training Accuracy:  0.7653433333333334
Validation Accuracy:  0.76504
Classification Report
              precision    recall  f1-score   support

    Positive       0.76      0.78      0.77    100000
    Negative       0.77      0.75      0.76    100000

    accuracy                           0.77    200000
   macro avg       0.77      0.77      0.77    200000
weighted avg       0.77      0.77      0.77    200000

Confusion Matrix
[[77552 22448]
 [24544 75456]]


Support Vector Machine

Time taken: 1.6705729961395264 seconds
Training Accuracy:  0.7621466666666666
Validation Accuracy:  0.76217
Classification Report
              precision    recall  f1-score   support

    Positive       0.74      0.80      0.77    100000
    Negative       0.78      0.72      0.75    100000

    accuracy                           0.76    200000
   macro avg       0.76      0.76      0.76  

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 236us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 298us/step
Time taken: 69.39710092544556 seconds
Training Accuracy:  0.8219116666666667
Validation Accuracy:  0.807765
Classification Report
              precision    recall  f1-score   support

    Positive       0.80      0.82      0.81    100000
    Negative       0.82      0.79      0.80    100000

    accuracy                           0.81    200000
   macro avg       0.81      0.81      0.81    200000
weighted avg       0.81      0.81      0.81    200000

Confusion Matrix
[[82312 17688]
 [20759 79241]]

-----------------------------------------------------
Glove Embeddings No. of Dimensions =  glove.6B.100d.txt

Logistic Regression

Time taken: 7.420844078063965 seconds
Training Accuracy:  0.8053933333333333
Validation Accuracy:  0.80515
Classification Report
              precision    recall  f1-score   support

    Positive      

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 243us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 240us/step
Time taken: 77.29339098930359 seconds
Training Accuracy:  0.865705
Validation Accuracy:  0.8486
Classification Report
              precision    recall  f1-score   support

    Positive       0.83      0.87      0.85    100000
    Negative       0.87      0.83      0.85    100000

    accuracy                           0.85    200000
   macro avg       0.85      0.85      0.85    200000
weighted avg       0.85      0.85      0.85    200000

Confusion Matrix
[[87175 12825]
 [17455 82545]]

-----------------------------------------------------
Glove Embeddings No. of Dimensions =  glove.6B.200d.txt

Logistic Regression

Time taken: 20.349468231201172 seconds
Training Accuracy:  0.832105
Validation Accuracy:  0.83128
Classification Report
              precision    recall  f1-score   support

    Positive       0.83      0.83      

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 258us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 278us/step
Time taken: 81.45496010780334 seconds
Training Accuracy:  0.897765
Validation Accuracy:  0.87596
Classification Report
              precision    recall  f1-score   support

    Positive       0.87      0.88      0.88    100000
    Negative       0.88      0.87      0.87    100000

    accuracy                           0.88    200000
   macro avg       0.88      0.88      0.88    200000
weighted avg       0.88      0.88      0.88    200000

Confusion Matrix
[[88424 11576]
 [13232 86768]]

-----------------------------------------------------
Glove Embeddings No. of Dimensions =  glove.6B.300d.txt

Logistic Regression

Time taken: 30.151658058166504 seconds
Training Accuracy:  0.841995
Validation Accuracy:  0.841565
Classification Report
              precision    recall  f1-score   support

    Positive       0.84      0.84    

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 270us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 275us/step
Time taken: 89.64800500869751 seconds
Training Accuracy:  0.9153083333333333
Validation Accuracy:  0.886465
Classification Report
              precision    recall  f1-score   support

    Positive       0.88      0.89      0.89    100000
    Negative       0.89      0.88      0.89    100000

    accuracy                           0.89    200000
   macro avg       0.89      0.89      0.89    200000
weighted avg       0.89      0.89      0.89    200000

Confusion Matrix
[[89292 10708]
 [11999 88001]]

-----------------------------------------------------


In [14]:
max_feature_sizes = [500, 1500, 2500]
for size in max_feature_sizes:
  print("Tfidf Vectorizer No. of Features = ", size)
  print("")
  X_train_scaled, X_val_scaled, X_test_scaled = getTfidfVectorizerFeatures(size)
  print("Logistic Regression")
  print("")
  LogisticRegressionModel(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Support Vector Machine")
  print("")
  SVMClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Multinomial Naive Bayes")
  print("")
  MNBClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Multilayer Perceptron")
  print("")
  MLPClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("-----------------------------------------------------")

Tfidf Vectorizer No. of Features =  500

Logistic Regression

Time taken: 1.2804358005523682 seconds
Training Accuracy:  0.849875
Validation Accuracy:  0.8492
Classification Report
              precision    recall  f1-score   support

    Positive       0.85      0.84      0.85    100000
    Negative       0.84      0.86      0.85    100000

    accuracy                           0.85    200000
   macro avg       0.85      0.85      0.85    200000
weighted avg       0.85      0.85      0.85    200000

Confusion Matrix
[[84289 15711]
 [14449 85551]]


Support Vector Machine

Time taken: 4.001117944717407 seconds
Training Accuracy:  0.8408816666666666
Validation Accuracy:  0.839735
Classification Report
              precision    recall  f1-score   support

    Positive       0.85      0.83      0.84    100000
    Negative       0.83      0.85      0.84    100000

    accuracy                           0.84    200000
   macro avg       0.84      0.84      0.84    200000
weighted avg    

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 315us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 298us/step
Time taken: 121.2576949596405 seconds
Training Accuracy:  0.975595
Validation Accuracy:  0.906255
Classification Report
              precision    recall  f1-score   support

    Positive       0.89      0.92      0.91    100000
    Negative       0.92      0.89      0.90    100000

    accuracy                           0.91    200000
   macro avg       0.91      0.91      0.91    200000
weighted avg       0.91      0.91      0.91    200000

Confusion Matrix
[[92455  7545]
 [11204 88796]]

-----------------------------------------------------
Tfidf Vectorizer No. of Features =  1500

Logistic Regression

Time taken: 1.884268045425415 seconds
Training Accuracy:  0.8869383333333334
Validation Accuracy:  0.884345
Classification Report
              precision    recall  f1-score   support

    Positive       0.89      0.88      0.8

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 361us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 357us/step
Time taken: 221.21512126922607 seconds
Training Accuracy:  0.9974716666666666
Validation Accuracy:  0.93577
Classification Report
              precision    recall  f1-score   support

    Positive       0.92      0.95      0.94    100000
    Negative       0.95      0.92      0.93    100000

    accuracy                           0.94    200000
   macro avg       0.94      0.94      0.94    200000
weighted avg       0.94      0.94      0.94    200000

Confusion Matrix
[[95110  4890]
 [ 7956 92044]]

-----------------------------------------------------
Tfidf Vectorizer No. of Features =  2500

Logistic Regression

Time taken: 2.0013020038604736 seconds
Training Accuracy:  0.89761
Validation Accuracy:  0.894305
Classification Report
              precision    recall  f1-score   support

    Positive       0.90      0.89      0.8

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18750/18750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 447us/step
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 447us/step
Time taken: 331.0425090789795 seconds
Training Accuracy:  0.9987483333333333
Validation Accuracy:  0.942755
Classification Report
              precision    recall  f1-score   support

    Positive       0.93      0.95      0.94    100000
    Negative       0.95      0.93      0.94    100000

    accuracy                           0.94    200000
   macro avg       0.94      0.94      0.94    200000
weighted avg       0.94      0.94      0.94    200000

Confusion Matrix
[[95234  4766]
 [ 6683 93317]]

-----------------------------------------------------


In [None]:
max_feature_sizes = [500, 1500, 2500]
for size in max_feature_sizes:
  print("Count Vectorizer No. of Features = ", size)
  print("")
  X_train_scaled, X_val_scaled, X_test_scaled = getCountVectorizerFeatures(size)
  print("Logistic Regression")
  print("")
  LogisticRegressionModel(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Support Vector Machine")
  print("")
  SVMClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Multinomial Naive Bayes")
  print("")
  MNBClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("")
  print("Multilayer Perceptron")
  print("")
  MLPClassifier(X_train_scaled, X_val_scaled)
  print("")
  print("-----------------------------------------------------")

Count Vectorizer No. of Features =  500

Logistic Regression

Time taken: 2.277576446533203 seconds
Training Accuracy:  0.8483383333333333
Validation Accuracy:  0.84742
Classification Report
              precision    recall  f1-score   support

    Positive       0.86      0.83      0.85    100000
    Negative       0.84      0.86      0.85    100000

    accuracy                           0.85    200000
   macro avg       0.85      0.85      0.85    200000
weighted avg       0.85      0.85      0.85    200000

Confusion Matrix
[[83396 16604]
 [13912 86088]]


Support Vector Machine

Time taken: 7.507729530334473 seconds
Training Accuracy:  0.842605
Validation Accuracy:  0.84116
Classification Report
              precision    recall  f1-score   support

    Positive       0.86      0.82      0.84    100000
    Negative       0.82      0.87      0.85    100000

    accuracy                           0.84    200000
   macro avg       0.84      0.84      0.84    200000
weighted avg     

In [None]:
# # Get a random sample of 100 rows with sentiment equal to Positive
# random_sample_sentiment_pos = df[df['sentiment'] == 1].sample(n=50000, random_state=42)

# # Get a random sample of 100 rows with sentiment equal to Negative
# random_sample_sentiment_neg = df[df['sentiment'] == -1].sample(n=50000, random_state=42)

# random_samples_combined = pd.concat([random_sample_sentiment_pos, random_sample_sentiment_neg])

# X = random_samples_combined.drop(columns=['sentiment'], axis=1)
# y = random_samples_combined['sentiment']

# y = y.replace(-1, 0)

In [None]:
# from sklearn.model_selection import train_test_split

# # Splitting the data into training, validation and testing sets (60%, 20%, 20%)
# X_train, X_test, y_train, y_test = train_test_split(X['preprocessed_text'], y, test_size=0.4, random_state=42, stratify=y)
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

# # Print the shapes of the resulting sets
# print("Training set shape:", X_train.shape, y_train.shape)
# print("Validation set shape:", X_val.shape, y_val.shape)
# print("Testing set shape:", X_test.shape, y_test.shape)


In [8]:
# Load pre-trained GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load pre-trained GloVe embeddings
glove_file = '../HW2/glove.6B/glove.6B.100d.txt'
glove_model = load_glove_embeddings(glove_file)

In [9]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train['preprocessed_summary'])
X_train_seq = word_tokenizer.texts_to_sequences(X_train['preprocessed_summary'])
X_val_seq = word_tokenizer.texts_to_sequences(X_val['preprocessed_summary'])
X_test_seq = word_tokenizer.texts_to_sequences(X_test['preprocessed_summary'])

# Padding all reviews to fixed length
maxlen = 500
X_train_padded = pad_sequences(X_train_seq, padding='post', maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, padding='post', maxlen=maxlen)
X_test_padded = pad_sequences(X_test_seq, padding='post', maxlen=maxlen)

In [10]:
from numpy import zeros
from numpy import asarray

vocab_length = len(word_tokenizer.word_index) + 1
embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = glove_model.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Print Embedding Matrix shape
embedding_matrix.shape

(17047, 100)

In [11]:
from tensorflow.keras.models import Sequential
import keras
from keras.layers import LSTM, Dense, Dropout, Embedding
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.optimizers import Adam

start_time = time.time()

embedding_layer = Embedding(input_dim=vocab_length, output_dim=100, embeddings_initializer=keras.initializers.Constant(embedding_matrix), trainable=False)

def getModel():
    lstm_model = Sequential()
    lstm_model.add(embedding_layer)
    lstm_model.add(LSTM(128))
    lstm_model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=0.001)
    lstm_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])
    return lstm_model

lstm_model = getModel()
lstm_model.fit(X_train_padded, y_train, epochs=20, batch_size=32, verbose=True)


Epoch 1/20
[1m 337/1875[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m7:07[0m 278ms/step - acc: 0.4951 - loss: 0.6934

KeyboardInterrupt: 

In [None]:
predicted_train = lstm_model.predict(X_train)
predicted_val = lstm_model.predict(X_val)

predicted_train = (predicted_train > 0.5).astype(int)
predicted_val = (predicted_val > 0.5).astype(int)

accuracy_train = accuracy_score(y_train,predicted_train)

accuracy_val = accuracy_score(y_val,predicted_val)

end_time = time.time()
print("Time taken:", end_time - start_time, "seconds")

print("Training Accuracy: ", accuracy_train)
print("Validation Accuracy: ", accuracy_val)