Mounting Google Drive


In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Root folder of the dataset on the mounted Drive; later cells list its sub-folders.
path = "/content/drive/MyDrive/NLP_CW/data"
# Sanity check: prints True only when the folder exists.
print(os.path.isdir(path))
# ----- end of path configuration ------

True


Writing the reviews stored on Drive into a single CSV file (`reviews.csv`)

In [None]:
import csv

# Build reviews.csv with one row per review file: its class label and its text.
# The label is derived from the folder NAME ('pos' -> 1, anything else -> 0)
# instead of from the position in os.listdir(), whose order is arbitrary.
with open('reviews.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Data Label', 'Text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # sorted() keeps the row order (and therefore the seeded shuffles that
    # follow) identical from one run to the next.
    for folder_path in sorted(os.listdir(path)):
      current_folder_path = os.path.join(path, folder_path)
      # Ignore stray files sitting next to the class folders.
      if not os.path.isdir(current_folder_path):
          continue
      data_label = 1 if folder_path == 'pos' else 0
      print(f"current folder_path is {current_folder_path}")
      for text_file in sorted(os.listdir(current_folder_path)):
          # Read with an explicit encoding so the result does not depend on the locale.
          with open(os.path.join(current_folder_path, text_file), encoding='utf-8') as f:
            # HTML line breaks become sentence boundaries for the later '.'-based split.
            s = f.read().replace("<br />",". ")
          writer.writerow({'Data Label': data_label, 'Text': s})

current folder_path is /content/drive/MyDrive/NLP_CW/data/pos
current folder_path is /content/drive/MyDrive/NLP_CW/data/neg


Splitting the data into train, dev, and test sets

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the labelled reviews written by the CSV-building cell.
df_reviews = pd.read_csv('reviews.csv')

# Shuffle every row once up front; the fixed seed keeps the order repeatable.
df_reviews = df_reviews.sample(frac=1, random_state=1).reset_index(drop=True)

# One group per class label so that each class is split on its own and the
# class ratio is the same in every split.
grouped = df_reviews.groupby('Data Label')

# label -> {'train': DataFrame, 'test': DataFrame, 'dev': DataFrame}
class_sets = {}
for label, group in grouped:
    # 60% of the class goes to train, 40% is held out ...
    train, test_dev = train_test_split(group, test_size=0.4, random_state=42)
    # ... and the held-out part is halved into test and dev.
    test, dev = train_test_split(test_dev, test_size=0.5, random_state=42)
    class_sets[label] = {'train': train, 'test': test, 'dev': dev}


def _merge_and_shuffle(split_name):
    """Stack one split across all classes and shuffle it with a fixed seed."""
    merged = pd.concat([per_class[split_name] for per_class in class_sets.values()])
    return merged.sample(frac=1, random_state=42).reset_index(drop=True)


train_set = _merge_and_shuffle('train')
dev_set = _merge_and_shuffle('dev')
test_set = _merge_and_shuffle('test')







Checking that the data frame was split evenly between the classes


In [None]:


# Report the size and per-label row counts of every split, so the class
# balance of train, dev and test can be checked at a glance.
for split_name, split_df in (('train', train_set), ('dev', dev_set), ('test', test_set)):
    label_counts = split_df['Data Label'].value_counts().sort_index().to_dict()
    print(f"{split_name}: {len(split_df)} rows, rows per 'Data Label': {label_counts}")

# Number of positive (label 1) rows in the test split.
count_specific_value = len(test_set[test_set['Data Label'] == 1])
print(f"Number of rows with 'Data Label' == 1 in the test set: {count_specific_value}")


Number of rows with 'specific_value' in 'column_name': 400


In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import opinion_lexicon
# Fetch the NLTK resources this notebook was run with. Leaving these calls
# commented out makes the notebook fail on a fresh runtime; nltk.download
# skips packages that are already up to date, so re-running is cheap.
for _nltk_resource in (
    'stopwords',
    'punkt',
    'wordnet',
    'averaged_perceptron_tagger',
    'words',
    'opinion_lexicon',
):
    nltk.download(_nltk_resource, quiet=True)

# Shared lemmatizer instance used by lemmatize_text().
lemmatizer = WordNetLemmatizer()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package opinion_lexicon to /root/nltk_data...
[nltk_data]   Unzipping corpora/opinion_lexicon.zip.


Helper functions including lemmatizer

In [None]:
def concat_feature_list_into_list(my_list):
  """Collapse every item of `my_list` into one string.

  Each item is an iterable of strings (e.g. an n-gram tuple); its parts are
  glued together with no separator, so ('not', 'good') becomes 'notgood'.
  Returns a new list with one string per input item, in the same order.
  """
  return [''.join(parts) for parts in my_list]

def get_ngrams(lemmatized_words, range_):
  """Return every n-gram of `lemmatized_words` for n = 1 .. range_ (inclusive).

  The result is a single list of tuples: all 1-grams first, then all
  2-grams, and so on. An empty list is returned when range_ < 1.
  """
  total_list = []
  for x in range(1, range_ + 1):
    total_list.extend(ngrams(lemmatized_words, x))
  return total_list

def lemmatize_text(text):
  """Return the unique lemmas of `text`, in order of first appearance.

  The text is split on '.', punctuation is stripped from each piece, and
  each piece is tokenized and POS-tagged. A token is kept only if it is not
  an English stopword and is either all lower-case or the first token of its
  piece (so capitalised words in the middle of a sentence are dropped).
  Kept tokens are lemmatized using the first letter of their POS tag to pick
  the WordNet part of speech.
  """
  stop_words = set(stopwords.words('english'))
  # First letter of the POS tag -> WordNet POS code. 'U' (interjection) is
  # deliberately lemmatized as an adverb, as in the original branches.
  wordnet_pos_by_prefix = {'V': 'v', 'N': 'n', 'J': 'a', 'R': 'r', 'U': 'r'}

  lemmatized_words = []
  # Mirrors lemmatized_words for O(1) duplicate checks; the list keeps the order.
  seen_lemmas = set()

  for raw_sentence in text.split('.'):
    sentence = ''.join(char for char in raw_sentence if char not in string.punctuation)
    tagged_words = pos_tag(nltk.word_tokenize(sentence))
    for position, (word, pos) in enumerate(tagged_words):
      lowered = word.lower()
      if lowered in stop_words:
        continue
      # Words containing upper-case letters are only accepted in first position.
      if word != lowered and position != 0:
        continue

      wordnet_pos = wordnet_pos_by_prefix.get(pos[:1])
      if wordnet_pos is None:
        lemma = lemmatizer.lemmatize(word)  # lemmatizer default (noun)
      else:
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)

      if lemma not in seen_lemmas:
        seen_lemmas.add(lemma)
        lemmatized_words.append(lemma)

  return lemmatized_words

Testing if looping through dataframe works


In [None]:
# Smoke test: push five training rows through the unigram + bigram pipeline
# and print each label next to the resulting feature string.
for index, row in train_set[20:25].iterrows():
  lemmas = lemmatize_text(row["Text"])
  joined_ngrams = concat_feature_list_into_list(get_ngrams(lemmas, 2))
  print(row["Data Label"], ' '.join(joined_ngrams))

0 Yeah get and his milieu but at the same time feel be largely overrated more than a little disturbing Overrated because supposed realism he introduce mill crowd crumble architecture etc moot by absurd downright goofy way that character behave In pursuit of utilize many nonactors their deerintheheadlights stare painfully awkward line delivery give whole terribly offkilter inconsistent And frankly toothless misshapenlyfeatured people painful to look disturb me least casual prevalent homosexual content Not prudish or homophobic neither emphasis place upon homoerotic image situation contrary neorealism otherwise espouse so it come off gratuitous force One can almost hear him say stick cute naked boy in this scene seem try play up angle thumb nose critic other enjoy aspect himself regardless what audience might prefer The disjointedness 9 10 different story strike failing storyteller rather an He bore with each wrap them unconvincingly conviction Even final dialog film which some find pith

Helper function - to get label and features

In [None]:
def get_features_and_labels(data_frame, max_ngram=3):
  """Turn a reviews DataFrame into parallel lists of feature strings and labels.

  Args:
    data_frame: DataFrame with a "Text" column (raw review) and a
      "Data Label" column (class label).
    max_ngram: largest n-gram size to generate; n-grams of every size from
      1 to max_ngram are included. Defaults to 3, the value that used to be
      hard-coded here.

  Returns:
    (features, labels): features[i] is a space-separated string of the
    concatenated n-grams of row i's lemmas, labels[i] is that row's label.
  """
  features, labels = [], []
  for _, row in data_frame.iterrows():
      lemmas = lemmatize_text(row["Text"])
      grams = get_ngrams(lemmas, max_ngram)
      tokens = concat_feature_list_into_list(grams)
      features.append(' '.join(tokens))
      labels.append(row["Data Label"])
  return features, labels



In [None]:
# Sample run of the feature extractor on five training rows.
text, labels = get_features_and_labels(train_set[20:25])

# Use a separate loop-variable name so the `text` list is not overwritten by
# its own last element.
for label, feature_text in zip(labels, text):
  print(label, feature_text)

0 Yeah get and his milieu but at the same time feel be largely overrated more than a little disturbing Overrated because supposed realism he introduce mill crowd crumble architecture etc moot by absurd downright goofy way that character behave In pursuit of utilize many nonactors their deerintheheadlights stare painfully awkward line delivery give whole terribly offkilter inconsistent And frankly toothless misshapenlyfeatured people painful to look disturb me least casual prevalent homosexual content Not prudish or homophobic neither emphasis place upon homoerotic image situation contrary neorealism otherwise espouse so it come off gratuitous force One can almost hear him say stick cute naked boy in this scene seem try play up angle thumb nose critic other enjoy aspect himself regardless what audience might prefer The disjointedness 9 10 different story strike failing storyteller rather an He bore with each wrap them unconvincingly conviction Even final dialog film which some find pith

In [None]:
training_text, training_labels = get_features_and_labels(train_set)


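Do NOT run the cells below as-is: each of the `_0_0`, `_0_1` and `_1_0` feature sets was generated after manually editing the helper functions (stopword filtering and interjection lemmatization), so re-running them with the current helpers would just reproduce the default feature set.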

In [None]:
training_text_0_0, training_labels_0_0 = get_features_and_labels(train_set)

In [None]:
training_text_0_1, training_labels_0_1 = get_features_and_labels(train_set)

In [None]:
training_text_1_0, training_labels_1_0 = get_features_and_labels(train_set)

In [None]:
print(len(training_text))


2400


Import Built-in NB

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

In [None]:
test_text, test_labels = get_features_and_labels(test_set)
dev_text, dev_labels = get_features_and_labels(dev_set)

In [None]:
dev_text_0_0, dev_labels_0_0 =  get_features_and_labels(dev_set)
test_text_0_0, test_labels_0_0= get_features_and_labels(test_set)

In [None]:
dev_text_0_1, dev_labels_0_1 =  get_features_and_labels(dev_set)
test_text_0_1, test_labels_0_1= get_features_and_labels(test_set)

In [None]:
dev_text_1_0, dev_labels_1_0 =  get_features_and_labels(dev_set)
test_text_1_0, test_labels_1_0 = get_features_and_labels(test_set)



```
# This is formatted as code
```

CUSTOM-TFIDF : really slow, better to use built-in TFIDF

In [None]:
from TFIDF import TFIDF

tfidf = TFIDF()


In [None]:
# Fit the custom TFIDF object on the training text and read back its values.
tfidf.fit(training_text)
tfidf_train = tfidf.return_tf_idf_values()
# NOTE(review): the same object is fitted again, this time on dev_text. If
# fit() rebuilds the vocabulary/IDF (not visible here), the dev features will
# not be in the same feature space as tfidf_train — confirm against TFIDF.py.
tfidf.fit(dev_text)
# Despite its name, tfidf_test holds the values computed from dev_text.
tfidf_test = tfidf.return_tf_idf_values()

Real TFIDF

In [None]:
# Fit the vectorizer on the training text only (min_df=20).
tfidf = TfidfVectorizer(min_df = 20).fit(training_text)
# Dense arrays, because the classifiers below are fed dense input (e.g. GaussianNB).
tfidf_train = tfidf.transform(training_text).toarray()
# Despite its name, tfidf_test holds the DEV-set features; the real test set
# is only vectorized in the final evaluation cell.
tfidf_test = tfidf.transform(dev_text).toarray()

In [None]:
tfidf_0_0 = TfidfVectorizer(min_df = 20).fit(training_text_0_0)
tfidf_train_0_0 = tfidf_0_0.transform(training_text_0_0).toarray()
tfidf_test_0_0 = tfidf_0_0.transform(dev_text_0_0).toarray()

In [None]:
tfidf_0_1 = TfidfVectorizer(min_df = 20).fit(training_text_0_1)
tfidf_train_0_1 = tfidf_0_1.transform(training_text_0_1).toarray()
tfidf_test_0_1 = tfidf_0_1.transform(dev_text_0_1).toarray()

In [None]:
tfidf_1_0 = TfidfVectorizer(min_df = 20).fit(training_text_1_0)
tfidf_train_1_0 = tfidf_1_0.transform(training_text_1_0).toarray()
tfidf_test_1_0 = tfidf_1_0.transform(dev_text_1_0).toarray()

Built in *Gaussian* Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
# Gaussian Naive Bayes baseline trained on the TF-IDF training features.
NB = GaussianNB()
NB.fit(tfidf_train,training_labels)
# tfidf_test was built from dev_text, so these are dev-set predictions ...
predicted_NB = NB.predict(tfidf_test)
# ... and must be scored against the dev labels, not the test labels.
print(classification_report(dev_labels, predicted_NB))

              precision    recall  f1-score   support

           0       0.81      0.78      0.79       400
           1       0.79      0.81      0.80       400

    accuracy                           0.80       800
   macro avg       0.80      0.80      0.80       800
weighted avg       0.80      0.80      0.80       800



Custom NaiveBayes

In [None]:
# Custom Naive Bayes implementation; requires NB.py to be importable from the
# working directory (the recorded run failed with ModuleNotFoundError).
from NB import Classifier
NBClassifier = Classifier()
NBClassifier.train(tfidf_train,training_labels)
# tfidf_test holds the dev-set features, so score against the dev labels.
predicted_custom_NB = NBClassifier.predict(tfidf_test)
print(classification_report(dev_labels, predicted_custom_NB))


ModuleNotFoundError: ignored

Custom TF-IDF Vectorizer BELOW

SVM for classification - importing the built-in SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [None]:
# best is rbf kernel
# {'sigmoid', 'rbf', 'linear', 'precomputed', 'poly'}
svm_classifier = SVC(kernel='rbf', gamma='scale', probability=True)
svm_classifier.fit(tfidf_train, training_labels)

# Predict on the dev set (tfidf_test was built from dev_text)
predictions = svm_classifier.predict(tfidf_test)

# Evaluate against the dev labels, so the accuracy and the report below
# describe the same set of examples.
accuracy = accuracy_score(dev_labels, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(dev_labels, predictions))


Accuracy: 0.84375
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       400
           1       0.83      0.86      0.85       400

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



In [None]:
# SVM with scikit-learn's default hyperparameters.
svm_classifier = SVC()
svm_classifier.fit(tfidf_train, training_labels)

# Predict on the dev set (tfidf_test was built from dev_text)
predictions = svm_classifier.predict(tfidf_test)

# Evaluate against the dev labels, so the accuracy and the report below
# describe the same set of examples.
accuracy = accuracy_score(dev_labels, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(dev_labels, predictions))

Accuracy: 0.84375
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       400
           1       0.83      0.86      0.85       400

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



In [None]:
# Default SVM on feature set (0,0).
svm_classifier = SVC()
svm_classifier.fit(tfidf_train_0_0, training_labels_0_0)

# Predict on the dev set (tfidf_test_0_0 was built from dev_text_0_0)
predictions = svm_classifier.predict(tfidf_test_0_0)

# Evaluate against the matching dev labels rather than the test labels.
accuracy = accuracy_score(dev_labels_0_0, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(dev_labels_0_0, predictions))

Accuracy: 0.82125
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       400
           1       0.81      0.83      0.82       400

    accuracy                           0.82       800
   macro avg       0.82      0.82      0.82       800
weighted avg       0.82      0.82      0.82       800



In [None]:
# Default SVM on feature set (0,1).
svm_classifier = SVC()
svm_classifier.fit(tfidf_train_0_1, training_labels_0_1)

# Predict on the dev set (tfidf_test_0_1 was built from dev_text_0_1)
predictions = svm_classifier.predict(tfidf_test_0_1)

# Evaluate against the matching dev labels rather than the test labels.
accuracy = accuracy_score(dev_labels_0_1, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(dev_labels_0_1, predictions))

Accuracy: 0.82125
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       400
           1       0.81      0.83      0.82       400

    accuracy                           0.82       800
   macro avg       0.82      0.82      0.82       800
weighted avg       0.82      0.82      0.82       800



In [None]:
# Default SVM on feature set (1,0).
svm_classifier = SVC()
svm_classifier.fit(tfidf_train_1_0, training_labels_1_0)

# Predict on the dev set (tfidf_test_1_0 was built from dev_text_1_0)
predictions = svm_classifier.predict(tfidf_test_1_0)

# Evaluate against the matching dev labels rather than the test labels.
accuracy = accuracy_score(dev_labels_1_0, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(dev_labels_1_0, predictions))

Accuracy: 0.84375
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       400
           1       0.83      0.86      0.85       400

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



Logistic Regression - importing built in

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Hyperparameter settings that were tried, one group at a time:
#  C = 1, penalty = 'l2', max_iter in [250, 500, 1000]
#  C in [1, 0.25, 0.0625], penalty = 'l2', max_iter = 1000
#  C = 1, penalty in ['l1', 'l2'], max_iter = 1000
logistic_classifier = LogisticRegression(C=1, penalty='l2', max_iter=1000)

logistic_classifier.fit(tfidf_train, training_labels)

# Predict on the dev set (tfidf_test was built from dev_text)
predictions = logistic_classifier.predict(tfidf_test)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels, predictions))

Accuracy: 0.82125
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       400
           1       0.82      0.82      0.82       400

    accuracy                           0.82       800
   macro avg       0.82      0.82      0.82       800
weighted avg       0.82      0.82      0.82       800



SGD Classifier - built in

In [None]:
from sklearn.linear_model import SGDClassifier

Best configuration: stopwords excluded and interjections lemmatized

In [None]:
# Hyperparameter settings that were tried:
#  loss in ['huber', 'hinge', 'log_loss'], alpha = 0.001, max_iter = 1000, random_state = 42
#  loss = 'hinge', alpha in [0.001, 0.0005, 0.00025], max_iter = 1000, random_state = 42
# best is log_loss
sgd_classifier = SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=42)
sgd_classifier.fit(tfidf_train, training_labels)

# Predict on the dev set (tfidf_test was built from dev_text)
predictions = sgd_classifier.predict(tfidf_test)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels, predictions))

Accuracy: 0.84125
              precision    recall  f1-score   support

           0       0.85      0.82      0.84       400
           1       0.83      0.86      0.84       400

    accuracy                           0.84       800
   macro avg       0.84      0.84      0.84       800
weighted avg       0.84      0.84      0.84       800



Same as above, except arguments are using default values

In [None]:

# Default hyperparameters; only the seed is fixed, so the score does not
# change from run to run because of SGD's random data shuffling.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(tfidf_train, training_labels)

# Predict on the dev set (tfidf_test was built from dev_text)
predictions = sgd_classifier.predict(tfidf_test)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels, predictions))

Accuracy: 0.81125
              precision    recall  f1-score   support

           0       0.81      0.82      0.81       400
           1       0.82      0.80      0.81       400

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



For feature set (0,0) - stopwords included, interjection not lemmatized


In [None]:
# Default hyperparameters; the seed is fixed so that differences between
# feature sets are not masked by SGD's random data shuffling.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(tfidf_train_0_0, training_labels_0_0)

# Predict on the dev set (tfidf_test_0_0 was built from dev_text_0_0)
predictions = sgd_classifier.predict(tfidf_test_0_0)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels_0_0, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels_0_0, predictions))

Accuracy: 0.79125
              precision    recall  f1-score   support

           0       0.77      0.84      0.80       400
           1       0.82      0.74      0.78       400

    accuracy                           0.79       800
   macro avg       0.79      0.79      0.79       800
weighted avg       0.79      0.79      0.79       800



For feature set (0,1) - stopwords included, interjection lemmatized

In [None]:
# Default hyperparameters; the seed is fixed so that differences between
# feature sets are not masked by SGD's random data shuffling.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(tfidf_train_0_1, training_labels_0_1)

# Predict on the dev set (tfidf_test_0_1 was built from dev_text_0_1)
predictions = sgd_classifier.predict(tfidf_test_0_1)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels_0_1, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels_0_1, predictions))

Accuracy: 0.8025
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       400
           1       0.80      0.81      0.80       400

    accuracy                           0.80       800
   macro avg       0.80      0.80      0.80       800
weighted avg       0.80      0.80      0.80       800



For feature set (1,0) - stopwords excluded, interjection not lemmatized

In [None]:
# Default hyperparameters; the seed is fixed so that differences between
# feature sets are not masked by SGD's random data shuffling.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(tfidf_train_1_0, training_labels_1_0)

# Predict on the dev set (tfidf_test_1_0 was built from dev_text_1_0)
predictions = sgd_classifier.predict(tfidf_test_1_0)

# Evaluate the model against the dev labels
accuracy = accuracy_score(dev_labels_1_0, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(dev_labels_1_0, predictions))

Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       400
           1       0.83      0.78      0.80       400

    accuracy                           0.81       800
   macro avg       0.81      0.81      0.81       800
weighted avg       0.81      0.81      0.81       800



Final SGD and SVM on test set

In [None]:
# Vectorize the held-out test reviews.
# NOTE(review): assumes `tfidf` is still the TfidfVectorizer fitted on
# training_text, not the custom TFIDF object that shares the name — confirm
# the cell execution order.
tfidf_test_final = tfidf.transform(test_text).toarray()

svm_classifier = SVC(kernel='rbf', gamma='scale', probability=True)
print("---SVM on test set with best feature set---")
# best set for SVM
svm_classifier.fit(tfidf_train, training_labels)

# Predict on the test set
predictions = svm_classifier.predict(tfidf_test_final)

# Evaluate the model against the test labels
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

print(classification_report(test_labels, predictions))




---SVM on test set with best feature set---
Accuracy: 0.86375
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       400
           1       0.83      0.91      0.87       400

    accuracy                           0.86       800
   macro avg       0.87      0.86      0.86       800
weighted avg       0.87      0.86      0.86       800

---SGD Classifier on test set with best feature set---
Accuracy: 0.86125
              precision    recall  f1-score   support

           0       0.90      0.81      0.85       400
           1       0.83      0.92      0.87       400

    accuracy                           0.86       800
   macro avg       0.87      0.86      0.86       800
weighted avg       0.87      0.86      0.86       800



In [None]:
# Final SGD model: same settings as the tuned dev-set run (log_loss, alpha=0.001, seed 42).
sgd_classifier = SGDClassifier(loss='log_loss', alpha=0.001, max_iter=1000, random_state=42)
print("---SGD Classifier on test set with best feature set---")
sgd_classifier.fit(tfidf_train, training_labels)

# Predict on the test set (tfidf_test_final comes from the final SVM cell, which must run first)
predictions = sgd_classifier.predict(tfidf_test_final)

# Evaluate the model against the test labels
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

# Additional evaluation metrics
print(classification_report(test_labels, predictions))

---SGD Classifier on test set with best feature set---
Accuracy: 0.86125
              precision    recall  f1-score   support

           0       0.90      0.81      0.85       400
           1       0.83      0.92      0.87       400

    accuracy                           0.86       800
   macro avg       0.87      0.86      0.86       800
weighted avg       0.87      0.86      0.86       800

