In [1]:

import numpy as np
import pandas as pd
import nltk
from nltk import ngrams

import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shikh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shikh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import os

# Directories that are listed by the following cells to collect the review files.
# The trailing "/" is required: full paths are built by plain string concatenation.
# (The spelling "postive" is kept because later cells reference this exact name.)
postive_data_path = "../CodeA/data/data/pos/"   # files that receive label 1
negative_data_path ="../CodeA/data/data/neg/"   # files that receive label 0

In [3]:
# Full paths of every file in the positive-review directory.
# os.listdir() returns names in arbitrary, platform-dependent order; sorting makes the
# row order of the dataset (and therefore the seeded train/dev/test split) reproducible.
pos_files = [postive_data_path + name for name in sorted(os.listdir(postive_data_path))]

In [4]:
# Full paths of every file in the negative-review directory.
# os.listdir() returns names in arbitrary, platform-dependent order; sorting makes the
# row order of the dataset (and therefore the seeded train/dev/test split) reproducible.
neg_files = [negative_data_path + name for name in sorted(os.listdir(negative_data_path))]

In [6]:
# Class targets, one entry per file: 1 for a positive review, 0 for a negative one.
pos_labels = [1 for _ in pos_files]
neg_labels = [0 for _ in neg_files]

In [7]:
# One row per review file: its path and its class label (positives first, then negatives).
df = pd.DataFrame(
    {
        "file_names": pos_files + neg_files,
        "labels": pos_labels + neg_labels,
    }
)

In [8]:
#Code to get the contents in those .txt files.

def get_data(df):
    """Read every file listed in ``df["file_names"]`` into a new ``"reviews"`` column.

    Each file is opened in text mode and decoded as UTF-8. The frame passed in
    is modified in place and is also returned, so both ``get_data(df)`` and
    ``df = get_data(df)`` leave ``df`` with the extra column.
    """
    contents = []
    for path in df["file_names"]:
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(path, mode='r', encoding='utf-8') as handle:
            contents.append(handle.read())
    df["reviews"] = contents
    return df

In [9]:
# Attach the raw text of every review as df["reviews"] (get_data adds the column to df
# itself and returns the same frame).
df = get_data(df)

In [10]:
#Tokenization - Data 
from nltk.tokenize import word_tokenize

# Lower-case and tokenize every review.
# NOTE(review): word_tokenize relies on NLTK's "punkt" tokenizer data, while the setup
# cell only downloads "stopwords" and "wordnet" — confirm it is installed on a fresh machine.
tokenized = [word_tokenize(review.lower()) for review in df['reviews']]

# Plain column assignment keeps this cell safe to re-run: it overwrites the column if it
# already exists, whereas df.insert(..., allow_duplicates=True) would append a second
# 'tokenized_review' column and make df['tokenized_review'] return a DataFrame.
df['tokenized_review'] = tokenized
df.head()

Unnamed: 0,file_names,labels,reviews,tokenized_review
0,../CodeA/data/data/pos/10000_8.txt,1,Homelessness (or Houselessness as George Carli...,"[homelessness, (, or, houselessness, as, georg..."
1,../CodeA/data/data/pos/10008_7.txt,1,"You know, Robin Williams, God bless him, is co...","[you, know, ,, robin, williams, ,, god, bless,..."
2,../CodeA/data/data/pos/10013_7.txt,1,"Like one of the previous commenters said, this...","[like, one, of, the, previous, commenters, sai..."
3,../CodeA/data/data/pos/10019_8.txt,1,"When it comes to movies I can be pretty picky,...","[when, it, comes, to, movies, i, can, be, pret..."
4,../CodeA/data/data/pos/10020_8.txt,1,The legendary Boris Karloff ended his illustri...,"[the, legendary, boris, karloff, ended, his, i..."


**Stemming, bigrams and TF-IDF**

In [10]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.stem import LancasterStemmer
import nltk
import math



# Function to calculate TF-IDF with n-grams and filter based on frequency threshold
def calculate_tfidf_ngrams_with_threshold(documents, n, threshold):
    """Compute TF-IDF features over stemmed n-grams that are frequent enough.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.
    n : int
        Size of the n-grams (2 gives bigrams).
    threshold : int
        Minimum number of occurrences across all documents for an n-gram to
        be kept in the vocabulary.

    Returns
    -------
    tfidf_matrix : pandas.DataFrame
        One row per document (labelled 0..len-1 in input order) and one column
        per vocabulary n-gram. Each cell is the raw count of the n-gram in the
        document multiplied by log10(N / (1 + number of documents containing it)).
    vocabulary : list of tuple of str
        The retained n-grams, sorted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    st = LancasterStemmer()
    # Stem each token, then join/split on whitespace so empty stems are dropped
    # and the token stream is normalised before n-grams are formed.
    doc_terms = [
        list(ngrams(' '.join(st.stem(word) for word in words).split(), n))
        for words in documents
    ]

    # Corpus-wide n-gram counts decide which n-grams enter the vocabulary.
    ngram_counts = nltk.FreqDist(term for terms in doc_terms for term in terms)
    vocabulary = sorted(term for term, freq in ngram_counts.items() if freq >= threshold)
    # Dict lookup instead of scanning the vocabulary list for every n-gram.
    column_of = {term: col for col, term in enumerate(vocabulary)}

    # A single pass collects raw term counts (TF) and document frequencies (DF).
    counts = np.zeros((len(doc_terms), len(vocabulary)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                counts[row, col] += 1
        doc_freq.update(term for term in set(terms) if term in column_of)

    # Rows are labelled by position in `documents`; no global DataFrame is consulted.
    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary],
        index=vocabulary,
        dtype=float,
    )

    # Column-wise scaling of the counts by each n-gram's IDF.
    tfidf_matrix = tf_matrix * idf_vector

    return tfidf_matrix, vocabulary

# Feature set 1: TF-IDF over stemmed bigrams that occur at least 100 times in the corpus.
tfidf_matrix, vocabulary = calculate_tfidf_ngrams_with_threshold(
    df["tokenized_review"],
    n=2,
    threshold=100,
)


In [11]:
# Feature set 1 (stemmed bigrams, TF-IDF): hold out 20% of the rows as the test set,
# then take 20% of the remainder as the development set (64% / 16% / 20% overall).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    tfidf_matrix, df["labels"], test_size=0.2, random_state=42
)
X_train1, X_dev1, y_train1, y_dev1 = train_test_split(
    X_train1, y_train1, train_size=0.8, test_size=0.2, random_state=42
)

In [12]:
# Fit a multinomial Naive Bayes classifier on feature set 1 (stemmed bigrams, TF-IDF)
# and report its accuracy on the development split.
model_1 = MultinomialNB()
model_1.fit(X_train1, y_train1)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy1 = model_1.score(X_dev1, y_dev1)
print(accuracy1)

0.765625


**Lemmatization, stopword and punctuation removal, bigrams and TF-IDF**

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import nltk
import string

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF with n-grams and filter based on frequency threshold
def calculate_tfidf_ngrams_with_threshold(documents, n, threshold):
    """Compute TF-IDF features over lemmatized, stopword-filtered n-grams.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded, the rest are lemmatized with WordNet, and n-grams are built
    from what remains.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.
    n : int
        Size of the n-grams (2 gives bigrams).
    threshold : int
        Minimum number of occurrences across all documents for an n-gram to
        be kept in the vocabulary.

    Returns
    -------
    tfidf_matrix_2 : pandas.DataFrame
        One row per document (labelled 0..len-1) and one column per vocabulary
        n-gram; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_2 : list of tuple of str
        The retained n-grams, sorted.
    """
    import math  # this cell does not import math itself
    from collections import Counter

    lemmatizer = WordNetLemmatizer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        list(ngrams(
            ' '.join(
                lemmatizer.lemmatize(word)
                for word in words
                if word not in stoplist and word not in string.punctuation
            ).split(),
            n,
        ))
        for words in documents
    ]

    # Corpus-wide n-gram counts decide which n-grams enter the vocabulary.
    ngram_counts = nltk.FreqDist(term for terms in doc_terms for term in terms)
    vocabulary_2 = sorted(term for term, freq in ngram_counts.items() if freq >= threshold)
    # Dict lookup instead of scanning the vocabulary list for every n-gram.
    column_of = {term: col for col, term in enumerate(vocabulary_2)}

    # A single pass collects raw term counts (TF) and document frequencies (DF),
    # instead of regenerating every document's n-grams once per vocabulary term.
    counts = np.zeros((len(doc_terms), len(vocabulary_2)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                counts[row, col] += 1
        doc_freq.update(term for term in set(terms) if term in column_of)

    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_2)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_2],
        index=vocabulary_2,
        dtype=float,
    )

    tfidf_matrix_2 = tf_matrix * idf_vector

    return tfidf_matrix_2, vocabulary_2

# Feature set 2: TF-IDF over lemmatized bigrams (stopwords and punctuation filtered out)
# that occur at least 100 times in the corpus.
tfidf_matrix_2, vocabulary_2 = calculate_tfidf_ngrams_with_threshold(
    df["tokenized_review"],
    n=2,
    threshold=100,
)




In [21]:
# Feature set 2 (lemmatized bigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    tfidf_matrix_2, df["labels"], test_size=0.2, random_state=42
)
X_train2, X_dev2, y_train2, y_dev2 = train_test_split(
    X_train2, y_train2, train_size=0.8, test_size=0.2, random_state=42
)

In [22]:
# Fit a multinomial Naive Bayes classifier on feature set 2 (lemmatized bigrams, TF-IDF)
# and report its accuracy on the development split.
model_2 = MultinomialNB()
model_2.fit(X_train2, y_train2)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy2 = model_2.score(X_dev2, y_dev2)
print(accuracy2)

0.534375


**Stemming, stopword and punctuation removal, bigrams and TF-IDF**

In [13]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.stem import LancasterStemmer
import nltk
import math

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF with n-grams and filter based on frequency threshold
def calculate_tfidf_ngrams_with_threshold(documents, n, threshold):
    """Compute TF-IDF features over stemmed, stopword-filtered n-grams.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded, the rest are stemmed with the Lancaster stemmer, and n-grams
    are built from what remains.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.
    n : int
        Size of the n-grams (2 gives bigrams).
    threshold : int
        Minimum number of occurrences across all documents for an n-gram to
        be kept in the vocabulary.

    Returns
    -------
    tfidf_matrix_3 : pandas.DataFrame
        One row per document (labelled 0..len-1 in input order) and one column
        per vocabulary n-gram; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_3 : list of tuple of str
        The retained n-grams, sorted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    st = LancasterStemmer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        list(ngrams(
            ' '.join(
                st.stem(word)
                for word in words
                if word not in stoplist and word not in string.punctuation
            ).split(),
            n,
        ))
        for words in documents
    ]

    # Corpus-wide n-gram counts decide which n-grams enter the vocabulary.
    ngram_counts = nltk.FreqDist(term for terms in doc_terms for term in terms)
    vocabulary_3 = sorted(term for term, freq in ngram_counts.items() if freq >= threshold)
    # Dict lookup instead of scanning the vocabulary list for every n-gram.
    column_of = {term: col for col, term in enumerate(vocabulary_3)}

    # A single pass collects raw term counts (TF) and document frequencies (DF).
    counts = np.zeros((len(doc_terms), len(vocabulary_3)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                counts[row, col] += 1
        doc_freq.update(term for term in set(terms) if term in column_of)

    # Rows are labelled by position in `documents`; no global DataFrame is consulted.
    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_3)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_3],
        index=vocabulary_3,
        dtype=float,
    )

    tfidf_matrix_3 = tf_matrix * idf_vector

    return tfidf_matrix_3, vocabulary_3

# Feature set 3: TF-IDF over stemmed bigrams (stopwords and punctuation filtered out)
# that occur at least 100 times in the corpus.
tfidf_matrix_3, vocabulary_3 = calculate_tfidf_ngrams_with_threshold(
    df["tokenized_review"],
    n=2,
    threshold=100,
)


In [14]:
# Feature set 3 (stemmed bigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    tfidf_matrix_3, df["labels"], test_size=0.2, random_state=42
)
X_train3, X_dev3, y_train3, y_dev3 = train_test_split(
    X_train3, y_train3, train_size=0.8, test_size=0.2, random_state=42
)

In [15]:
# Fit a multinomial Naive Bayes classifier on feature set 3 (stemmed bigrams without
# stopwords/punctuation, TF-IDF) and report its accuracy on the development split.
model_3 = MultinomialNB()
model_3.fit(X_train3, y_train3)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy3 = model_3.score(X_dev3, y_dev3)
print(accuracy3)

0.5421875


**Stemming, stopword and punctuation removal, unigrams and TF-IDF**

In [13]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.stem import LancasterStemmer
import math

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF
def calculate_tfidf(documents):
    """Compute unigram TF-IDF features over stemmed, stopword-filtered tokens.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded and the rest are stemmed with the Lancaster stemmer. Every
    distinct stem becomes a vocabulary entry (no frequency threshold).

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.

    Returns
    -------
    tfidf_matrix_4 : pandas.DataFrame
        One row per document (labelled 0..len-1 in input order) and one column
        per stem; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_4 : list of str
        All distinct stems, sorted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    st = LancasterStemmer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        ' '.join(
            st.stem(word)
            for word in words
            if word not in stoplist and word not in string.punctuation
        ).split()
        for words in documents
    ]

    # Create a vocabulary
    vocabulary_4 = sorted({term for terms in doc_terms for term in terms})
    column_of = {term: col for col, term in enumerate(vocabulary_4)}

    # A single pass collects raw term counts (TF) and document frequencies (DF),
    # instead of re-splitting every document once per vocabulary term.
    counts = np.zeros((len(doc_terms), len(vocabulary_4)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        per_doc = Counter(terms)
        for term, count in per_doc.items():
            counts[row, column_of[term]] = count
        doc_freq.update(per_doc.keys())

    # Rows are labelled by position in `documents`; no global DataFrame is consulted.
    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_4)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_4],
        index=vocabulary_4,
        dtype=float,
    )

    tfidf_matrix_4 = tf_matrix * idf_vector

    return tfidf_matrix_4, vocabulary_4

# Feature set 4: TF-IDF over stemmed unigrams (stopwords and punctuation filtered out).
tfidf_matrix_4, vocabulary_4 = calculate_tfidf(documents=df["tokenized_review"])


In [14]:
# Feature set 4 (stemmed unigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    tfidf_matrix_4, df["labels"], test_size=0.2, random_state=42
)
X_train4, X_dev4, y_train4, y_dev4 = train_test_split(
    X_train4, y_train4, train_size=0.8, test_size=0.2, random_state=42
)

In [15]:
# Fit a multinomial Naive Bayes classifier on feature set 4 (stemmed unigrams without
# stopwords/punctuation, TF-IDF) and report its accuracy on the development split.
model_4 = MultinomialNB()
model_4.fit(X_train4, y_train4)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy4 = model_4.score(X_dev4, y_dev4)
print(accuracy4)

0.784375


**Lemmatization, stopword and punctuation removal, unigrams and TF-IDF**

In [16]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.stem import LancasterStemmer
import math

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF
def calculate_tfidf(documents):
    """Compute unigram TF-IDF features over lemmatized, stopword-filtered tokens.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded and the rest are lemmatized with WordNet. Every distinct
    lemma becomes a vocabulary entry (no frequency threshold).

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.

    Returns
    -------
    tfidf_matrix_5 : pandas.DataFrame
        One row per document (labelled 0..len-1 in input order) and one column
        per lemma; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_5 : list of str
        All distinct lemmas, sorted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        ' '.join(
            lemmatizer.lemmatize(word)
            for word in words
            if word not in stoplist and word not in string.punctuation
        ).split()
        for words in documents
    ]

    # Create a vocabulary
    vocabulary_5 = sorted({term for terms in doc_terms for term in terms})
    column_of = {term: col for col, term in enumerate(vocabulary_5)}

    # A single pass collects raw term counts (TF) and document frequencies (DF),
    # instead of re-splitting every document once per vocabulary term.
    counts = np.zeros((len(doc_terms), len(vocabulary_5)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        per_doc = Counter(terms)
        for term, count in per_doc.items():
            counts[row, column_of[term]] = count
        doc_freq.update(per_doc.keys())

    # Rows are labelled by position in `documents`; no global DataFrame is consulted.
    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_5)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_5],
        index=vocabulary_5,
        dtype=float,
    )

    tfidf_matrix_5 = tf_matrix * idf_vector

    return tfidf_matrix_5, vocabulary_5

# Feature set 5: TF-IDF over lemmatized unigrams (stopwords and punctuation filtered out).
tfidf_matrix_5, vocabulary_5 = calculate_tfidf(documents=df["tokenized_review"])


In [17]:
# Feature set 5 (lemmatized unigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train5, X_test5, y_train5, y_test5 = train_test_split(
    tfidf_matrix_5, df["labels"], test_size=0.2, random_state=42
)
X_train5, X_dev5, y_train5, y_dev5 = train_test_split(
    X_train5, y_train5, train_size=0.8, test_size=0.2, random_state=42
)

In [18]:
# Fit a multinomial Naive Bayes classifier on feature set 5 (lemmatized unigrams without
# stopwords/punctuation, TF-IDF) and report its accuracy on the development split.
model_5 = MultinomialNB()
model_5.fit(X_train5, y_train5)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy5 = model_5.score(X_dev5, y_dev5)
print(accuracy5)

0.7859375


**Stemming, stopword and punctuation removal, trigrams and TF-IDF**

In [11]:
import pandas as pd
import numpy as np
from nltk.util import ngrams
from nltk.stem import LancasterStemmer
import nltk
import math

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF with n-grams and filter based on frequency threshold
def calculate_tfidf_ngrams_with_threshold(documents, n, threshold):
    """Compute TF-IDF features over stemmed, stopword-filtered n-grams.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded, the rest are stemmed with the Lancaster stemmer, and n-grams
    are built from what remains.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.
    n : int
        Size of the n-grams (3 gives trigrams).
    threshold : int
        Minimum number of occurrences across all documents for an n-gram to
        be kept in the vocabulary.

    Returns
    -------
    tfidf_matrix_6 : pandas.DataFrame
        One row per document (labelled 0..len-1 in input order) and one column
        per vocabulary n-gram; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_6 : list of tuple of str
        The retained n-grams, sorted.
    """
    from collections import Counter  # local import keeps the cell self-contained

    st = LancasterStemmer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        list(ngrams(
            ' '.join(
                st.stem(word)
                for word in words
                if word not in stoplist and word not in string.punctuation
            ).split(),
            n,
        ))
        for words in documents
    ]

    # Corpus-wide n-gram counts decide which n-grams enter the vocabulary.
    ngram_counts = nltk.FreqDist(term for terms in doc_terms for term in terms)
    vocabulary_6 = sorted(term for term, freq in ngram_counts.items() if freq >= threshold)
    # Dict lookup instead of scanning the vocabulary list for every n-gram.
    column_of = {term: col for col, term in enumerate(vocabulary_6)}

    # A single pass collects raw term counts (TF) and document frequencies (DF).
    counts = np.zeros((len(doc_terms), len(vocabulary_6)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                counts[row, col] += 1
        doc_freq.update(term for term in set(terms) if term in column_of)

    # Rows are labelled by position in `documents`; no global DataFrame is consulted.
    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_6)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_6],
        index=vocabulary_6,
        dtype=float,
    )

    tfidf_matrix_6 = tf_matrix * idf_vector

    return tfidf_matrix_6, vocabulary_6

# Feature set 6: TF-IDF over stemmed trigrams (stopwords and punctuation filtered out)
# that occur at least 100 times in the corpus.
tfidf_matrix_6, vocabulary_6 = calculate_tfidf_ngrams_with_threshold(
    df["tokenized_review"],
    n=3,
    threshold=100,
)


In [12]:
# Feature set 6 (stemmed trigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train6, X_test6, y_train6, y_test6 = train_test_split(
    tfidf_matrix_6, df["labels"], test_size=0.2, random_state=42
)
X_train6, X_dev6, y_train6, y_dev6 = train_test_split(
    X_train6, y_train6, train_size=0.8, test_size=0.2, random_state=42
)

In [13]:
# Fit a multinomial Naive Bayes classifier on feature set 6 (stemmed trigrams without
# stopwords/punctuation, TF-IDF) and report its accuracy on the development split.
model_6 = MultinomialNB()
model_6.fit(X_train6, y_train6)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy6 = model_6.score(X_dev6, y_dev6)
print(accuracy6)

0.509375


**Lemmatization, stopword and punctuation removal, trigrams and TF-IDF**

In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import nltk
import string

# English stopwords as a set; read by the token filter of the function defined in this cell.
stoplist = set(nltk.corpus.stopwords.words('english'))

# Function to calculate TF-IDF with n-grams and filter based on frequency threshold
def calculate_tfidf_ngrams_with_threshold(documents, n, threshold):
    """Compute TF-IDF features over lemmatized, stopword-filtered n-grams.

    Tokens found in the module-level ``stoplist`` or in ``string.punctuation``
    are discarded, the rest are lemmatized with WordNet, and n-grams are built
    from what remains.

    Parameters
    ----------
    documents : iterable of list of str
        One list of tokens per document.
    n : int
        Size of the n-grams (3 gives trigrams).
    threshold : int
        Minimum number of occurrences across all documents for an n-gram to
        be kept in the vocabulary.

    Returns
    -------
    tfidf_matrix_7 : pandas.DataFrame
        One row per document (labelled 0..len-1) and one column per vocabulary
        n-gram; each cell is count * log10(N / (1 + document frequency)).
    vocabulary_7 : list of tuple of str
        The retained n-grams, sorted.
    """
    import math  # this cell does not import math itself
    from collections import Counter

    lemmatizer = WordNetLemmatizer()
    # NOTE(review): `word not in string.punctuation` is a substring test on the
    # punctuation string, so multi-character tokens such as '...' or '--' are kept —
    # confirm that is intended.
    doc_terms = [
        list(ngrams(
            ' '.join(
                lemmatizer.lemmatize(word)
                for word in words
                if word not in stoplist and word not in string.punctuation
            ).split(),
            n,
        ))
        for words in documents
    ]

    # Corpus-wide n-gram counts decide which n-grams enter the vocabulary.
    ngram_counts = nltk.FreqDist(term for terms in doc_terms for term in terms)
    vocabulary_7 = sorted(term for term, freq in ngram_counts.items() if freq >= threshold)
    # Dict lookup instead of scanning the vocabulary list for every n-gram.
    column_of = {term: col for col, term in enumerate(vocabulary_7)}

    # A single pass collects raw term counts (TF) and document frequencies (DF),
    # instead of regenerating every document's n-grams once per vocabulary term.
    counts = np.zeros((len(doc_terms), len(vocabulary_7)), dtype=float)
    doc_freq = Counter()
    for row, terms in enumerate(doc_terms):
        for term in terms:
            col = column_of.get(term)
            if col is not None:
                counts[row, col] += 1
        doc_freq.update(term for term in set(terms) if term in column_of)

    tf_matrix = pd.DataFrame(counts, index=range(len(doc_terms)), columns=vocabulary_7)

    # IDF with the +1 in the denominator, base-10 logarithm.
    N = len(doc_terms)
    idf_vector = pd.Series(
        [math.log(N / (1 + doc_freq[term]), 10) for term in vocabulary_7],
        index=vocabulary_7,
        dtype=float,
    )

    tfidf_matrix_7 = tf_matrix * idf_vector

    return tfidf_matrix_7, vocabulary_7

# Feature set 7: TF-IDF over lemmatized trigrams (stopwords and punctuation filtered out)
# that occur at least 100 times in the corpus.
tfidf_matrix_7, vocabulary_7 = calculate_tfidf_ngrams_with_threshold(
    df["tokenized_review"],
    n=3,
    threshold=100,
)




In [15]:
# Feature set 7 (lemmatized trigrams without stopwords/punctuation, TF-IDF): hold out 20%
# of the rows as the test set, then take 20% of the remainder as the development set.
X_train7, X_test7, y_train7, y_test7 = train_test_split(
    tfidf_matrix_7, df["labels"], test_size=0.2, random_state=42
)
X_train7, X_dev7, y_train7, y_dev7 = train_test_split(
    X_train7, y_train7, train_size=0.8, test_size=0.2, random_state=42
)

In [16]:
# Fit a multinomial Naive Bayes classifier on feature set 7 (lemmatized trigrams without
# stopwords/punctuation, TF-IDF) and report its accuracy on the development split.
model_7 = MultinomialNB()
model_7.fit(X_train7, y_train7)
# score() runs the prediction itself, so no separate predict() call is needed.
accuracy7 = model_7.score(X_dev7, y_dev7)
print(accuracy7)

0.50625
