<a href="https://colab.research.google.com/github/spmani99/nlp-projects/blob/main/ngram_probability_exploration_with_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string
import nltk
import re
import numpy as np
import pandas as pd

from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter

# Resources needed further down: the Punkt sentence/word tokenizer models
# (used by word_tokenize) and the English stop-word list.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested; on an older NLTK the extra
# request just reports that the package is unknown and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dncna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dncna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Step 1: Import Movie Reviews

In [None]:
# Read the corpus into a list with one entry per line of the file; each entry
# keeps its trailing newline, which the section-splitting step relies on.
with open("Movie_Reviews.txt", "r") as reviews_file:
    movie_reviews = list(reviews_file)

print("Movie reviews imported successfully.")

Movie reviews imported successfully.


In [None]:
# Inspect the raw lines exactly as read: section headers, blank separator
# lines and the numbered, quoted reviews.
movie_reviews

['Positive Reviews\n',
 '\n',
 '1. "Forrest Gump is an absolute masterpiece! Tom Hanks delivers an unforgettable performance, and the storytelling is heartwarming. This movie is a journey through life that will make you laugh, cry, and appreciate the simple beauties of existence."\n',
 '\n',
 '2. "The Shawshank Redemption is a timeless classic. The powerful themes of hope, friendship, and redemption make it a must-watch. Morgan Freeman and Tim Robbins give exceptional performances in this brilliantly crafted film."\n',
 '\n',
 '3. "The epic conclusion to The Lord of the Rings trilogy, The Return of the King, is a cinematic triumph. The breathtaking visuals, epic battles, and emotionally resonant story make it a monumental achievement in filmmaking."\n',
 '\n',
 '4. "La La Land is a love letter to the magic of Hollywood and dreams. The chemistry between Ryan Gosling and Emma Stone is enchanting, and the music and dance sequences are a pure delight. A modern musical masterpiece."\n',
 '\

In [None]:
# Hold out the last line of the file as the single review to classify later.
test_array = [movie_reviews[-1]]

def _find_section_header(lines, header):
    """Return the index of the line holding the section title ``header``.

    An exact match is tried first. If there is none, lines are compared with
    surrounding whitespace ignored, so a header with trailing spaces or one
    that ends the file without a newline is still found.

    Raises:
        ValueError: if no line matches the header.
    """
    try:
        return lines.index(header)
    except ValueError:
        pass

    wanted = header.strip()
    for position, line in enumerate(lines):
        if isinstance(line, str) and line.strip() == wanted:
            return position
    raise ValueError("section header {!r} not found in reviews".format(wanted))


def split_reviews(reviews):
    """Split the raw file lines into positive and negative review lines.

    Args:
        reviews: list of lines containing a "Positive Reviews" header line
            and a "Negative Reviews" header line.

    Returns:
        Tuple ``(pos_array, neg_array)`` of new lists. Each holds the lines
        that follow its header, up to the other header if that one comes
        later, otherwise up to the end of ``reviews``. Blank separator lines
        are kept as they are; ``reviews`` itself is not modified.

    Raises:
        ValueError: if either header line is missing.
    """
    pos_index = _find_section_header(reviews, "Positive Reviews\n")
    neg_index = _find_section_header(reviews, "Negative Reviews\n")

    # A section stops at the other header only when that header follows it;
    # this keeps the split correct whichever section appears first.
    pos_end = neg_index if neg_index > pos_index else len(reviews)
    neg_end = pos_index if pos_index > neg_index else len(reviews)

    pos_array = list(reviews[pos_index + 1:pos_end])
    neg_array = list(reviews[neg_index + 1:neg_end])

    return pos_array, neg_array

# Split everything except the held-out last line. A slice is used instead of
# movie_reviews.pop() so movie_reviews is left untouched: re-running this cell
# no longer drops one more line (and changes the held-out review) each time.
pos_array, neg_array = split_reviews(movie_reviews[:-1])

print(pos_array)
print(neg_array)
print(test_array)

["It's clear that the movie has both its enthusiasts and critics. While it may not be to everyone's taste, it's worth watching with an open mind to form your own opinion. \n"]


In [None]:
# Wrap each list of raw lines in a one-column DataFrame ("review") so the
# preprocessing pipeline can be applied column-wise.
pos_df = pd.DataFrame({"review": pos_array})
neg_df = pd.DataFrame({"review": neg_array})
test_df = pd.DataFrame({"review": test_array})

In [None]:
# Show the three raw frames together; at this point they still contain the
# blank separator lines and the numbering/quotes of each review.
pos_df, neg_df, test_df

(                                               review
 1                                                  \n
 2   1. "Forrest Gump is an absolute masterpiece! T...
 3                                                  \n
 4   2. "The Shawshank Redemption is a timeless cla...
 5                                                  \n
 6   3. "The epic conclusion to The Lord of the Rin...
 7                                                  \n
 8   4. "La La Land is a love letter to the magic o...
 9                                                  \n
 10  5. "Wes Anderson's whimsical style shines in T...
 11                                                 \n
 12  6."Inception is mind-bending brilliance! Chris...
 13                                                 \n
 14  7. "The Social Network is a captivating explor...
 15                                                 \n
 16  8. "Will Smith's portrayal of Chris Gardner in...
 17                                                 \n
 18  9. "E

### Step 2: Pre-process the Text Data

In [None]:
class TextPreprocessing:
    """Cleaning and tokenisation steps for a DataFrame column of raw text.

    The individual steps are exposed as methods and chained by ``pipeline``:
    punctuation removal, digit removal, whitespace clean-up, dropping of rows
    left empty, lower-casing, tokenisation and stop-word removal.

    Args:
        stopwords_list_english: collection of words to drop after
            tokenisation (compared against lower-cased tokens by ``pipeline``).
    """

    # Deletion tables for str.translate, built once instead of on every call.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    _DIGIT_TABLE = str.maketrans('', '', string.digits)

    def __init__(self, stopwords_list_english):
        self.stopwords_list_english = stopwords_list_english

    def removePunc(self, text):
        """Return ``text`` with every ``string.punctuation`` character deleted."""
        return text.translate(self._PUNCTUATION_TABLE)

    def removeUnwanted(self, text):
        """Remove newlines, leading/repeated whitespace and zero-width joiners.

        Newlines (and a single space directly following one) are deleted,
        leading whitespace is stripped, every remaining whitespace run is
        collapsed to one space, and U+200D / U+200C are removed. Trailing
        whitespace other than newlines is collapsed but not stripped.
        """
        text = text.replace('\n ', '').replace('\n', '')
        text = re.sub(r"^\s+", "", text)
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"[\u200d\u200c]", "", text)

        return text

    def removeNum(self, text):
        """Return ``text`` with every ASCII digit deleted."""
        return text.translate(self._DIGIT_TABLE)

    def lowerCase(self, text):
        """Return ``text`` lower-cased."""
        return text.lower()

    def tokenize(self, text):
        """Split ``text`` into a list of tokens with NLTK's ``word_tokenize``."""
        return word_tokenize(text)

    def removeStopwordsEnglish(self, text):
        """Return the tokens of ``text`` that are not stop words.

        ``text`` is a list of tokens (despite the name). The stop words are
        put in a set first so each token costs one hash lookup instead of a
        scan of the whole list; the set is rebuilt per call so later changes
        to ``stopwords_list_english`` are still honoured.
        """
        stop_words = frozenset(self.stopwords_list_english)
        return [token for token in text if token not in stop_words]

    def pipeline(self, df, column_name):
        """Run all preprocessing steps on ``df[column_name]``.

        Args:
            df: DataFrame whose ``column_name`` column holds raw strings.
            column_name: name of the text column to process.

        Returns:
            A new DataFrame (``df`` is not modified) in which rows whose text
            became empty are dropped, the index is reset, and ``column_name``
            holds lists of lower-cased, stop-word-free tokens.
        """
        df_temp = df.copy()

        # String-level clean-up; order matters because digit and punctuation
        # removal can leave stray whitespace for removeUnwanted to tidy.
        for step in (self.removePunc, self.removeNum, self.removeUnwanted):
            df_temp[column_name] = df_temp[column_name].apply(step)

        # Drop rows that are now empty strings (e.g. blank separator lines).
        df_temp = df_temp[df_temp[column_name].astype(bool)].reset_index(drop=True)

        # Token-level steps.
        for step in (self.lowerCase, self.tokenize, self.removeStopwordsEnglish):
            df_temp[column_name] = df_temp[column_name].apply(step)

        return df_temp

# Single shared preprocessor configured with NLTK's English stop-word list.
preprocessing = TextPreprocessing(
    stopwords_list_english=stopwords.words('english')
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dncna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dncna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Replace each raw frame with its cleaned, tokenised version.
# NOTE(review): the names are rebound to the processed frames, so this cell is
# meant to run once per kernel session; running it again would pass token
# lists (not strings) back into the pipeline.
pos_df = preprocessing.pipeline(pos_df, "review")
neg_df = preprocessing.pipeline(neg_df, "review")
test_df = preprocessing.pipeline(test_df, "review")

In [None]:
# Check the result: one row per positive review, each a list of tokens.
pos_df

Unnamed: 0,review
0,"[forrest, gump, absolute, masterpiece, tom, ha..."
1,"[shawshank, redemption, timeless, classic, pow..."
2,"[epic, conclusion, lord, rings, trilogy, retur..."
3,"[la, la, land, love, letter, magic, hollywood,..."
4,"[wes, andersons, whimsical, style, shines, gra..."
5,"[inception, mindbending, brilliance, christoph..."
6,"[social, network, captivating, exploration, cr..."
7,"[smiths, portrayal, chris, gardner, pursuit, h..."
8,"[eternal, sunshine, spotless, mind, beautifull..."
9,"[princess, bride, timeless, fairy, tale, perfe..."


### Step 3: Choose N and Implement the N-Gram Model (e.g., Unigram or Bigram)

In [None]:
# N-gram order used throughout the notebook (1 = unigrams, 2 = bigrams, ...)
n = 1

def generate_ngrams(text, order=None):
    """Return the list of n-grams (as tuples) of a token sequence.

    Args:
        text: sequence of tokens (supports slicing), e.g. a list of words.
        order: size of each n-gram. When omitted, the notebook-wide setting
            ``n`` is read at call time, so existing one-argument calls
            (including ``Series.apply(generate_ngrams)``) behave as before.

    Returns:
        List of ``order``-length tuples in order of appearance; empty when
        ``text`` has fewer than ``order`` tokens.
    """
    if order is None:
        order = n

    # Zipping the sequence against itself shifted by 0..order-1 positions
    # yields every window of `order` consecutive tokens.
    shifted_views = [text[offset:] for offset in range(order)]
    return list(zip(*shifted_views))

def process_datasets(df):
    """Count the n-grams of every tokenised review in ``df``.

    Args:
        df: DataFrame with a 'review' column holding lists of tokens.
            It is not modified.

    Returns:
        Tuple ``(ngram_df, total_ngrams)``: a DataFrame with columns 'ngram'
        and 'frequency' sorted by descending frequency, and the total number
        of n-gram occurrences counted.
    """
    # N-grams of each review, produced with the notebook-wide n-gram order.
    ngrams_per_review = df['review'].apply(generate_ngrams)

    # Tally occurrences review by review (same first-seen order as counting
    # one flattened list).
    ngram_counts = Counter()
    for review_ngrams in ngrams_per_review:
        ngram_counts.update(review_ngrams)
    total_ngrams = sum(ngram_counts.values())

    # One row per distinct n-gram, most frequent first.
    frequency_rows = list(ngram_counts.items())
    ngram_df = pd.DataFrame(frequency_rows, columns=['ngram', 'frequency'])
    ngram_df = ngram_df.sort_values(by='frequency', ascending=False)

    return ngram_df, total_ngrams

# Frequency table and total n-gram count for each sentiment class.
ngram_pos, total_pos = process_datasets(pos_df)
ngram_neg, total_neg = process_datasets(neg_df)

In [None]:
# N-gram frequencies of the positive reviews, most frequent first.
ngram_pos

Unnamed: 0,ngram,frequency
39,"(film,)",7
14,"(make,)",5
60,"(love,)",3
3,"(masterpiece,)",3
175,"(romance,)",2
...,...,...
92,"(charming,)",1
95,"(inception,)",1
96,"(mindbending,)",1
97,"(brilliance,)",1


In [None]:
# N-gram frequencies of the negative reviews, most frequent first.
ngram_neg

Unnamed: 0,ngram,frequency
22,"(movie,)",4
3,"(film,)",4
64,"(cinematic,)",3
167,"(may,)",3
57,"(dialogue,)",3
...,...,...
81,"(lazy,)",1
82,"(filmmaking,)",1
83,"(superman,)",1
84,"(iv,)",1


### Step 4: Calculate the Probability of Each N-Gram

In [None]:
def calculate_ngram_probabilities(df, total):
    """Print each n-gram with its frequency and relative frequency.

    Args:
        df: DataFrame with 'ngram' and 'frequency' columns.
        total: total number of n-gram occurrences; each probability is
            ``frequency / total``.

    Returns:
        None. One line is printed per row of ``df``, followed by a separator.
    """
    for record in df.to_dict('records'):
        ngram = record['ngram']
        frequency = record['frequency']
        probability = frequency / total
        print(f"N-gram: {ngram} ----- Frequency: {frequency} ----- Probability: {probability}")

    print("==========================================")

# Print the per-n-gram probabilities, positive reviews first, then negative.
calculate_ngram_probabilities(ngram_pos, total_pos)
calculate_ngram_probabilities(ngram_neg, total_neg)

N-gram: ('film',) ----- Frequency: 7 ----- Probability: 0.025925925925925925
N-gram: ('make',) ----- Frequency: 5 ----- Probability: 0.018518518518518517
N-gram: ('love',) ----- Frequency: 3 ----- Probability: 0.011111111111111112
N-gram: ('masterpiece',) ----- Frequency: 3 ----- Probability: 0.011111111111111112
N-gram: ('romance',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('world',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('timeless',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('classic',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('managed',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('story',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('theres',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('characters',) ----- Frequency: 2 ----- Probability: 0.007407407407407408
N-gram: ('experience',) ----- Frequency: 2 ---

### Step 5: Calculate the N-Gram Probability of the Test Review

In [None]:
# Tokens of the held-out review (first and only row of the processed frame).
tokens = test_df['review'][0]
tokens

['clear',
 'movie',
 'enthusiasts',
 'critics',
 'may',
 'everyones',
 'taste',
 'worth',
 'watching',
 'open',
 'mind',
 'form',
 'opinion']

In [None]:
def calculate_sentence_probability(tokens, ngram_df, total, smoothing=1.0):
    """Score a tokenised sentence under an n-gram frequency table.

    The score is the product, over every n-gram of ``tokens``, of the add-k
    smoothed relative frequency::

        (count + smoothing) / (total + smoothing * V)

    where ``V`` is the number of distinct n-grams in ``ngram_df``.

    Every n-gram of the sentence contributes a factor. Skipping the unseen
    ones would let a table that shares no n-gram with the sentence score 1.0,
    the highest possible value, and so win any comparison between classes.

    Args:
        tokens: list of tokens of the sentence to score.
        ngram_df: DataFrame with 'ngram' (tuple) and 'frequency' columns.
        total: total number of n-gram occurrences behind ``ngram_df``.
        smoothing: pseudo-count added to every n-gram (1.0 = Laplace).
            With 0 the plain relative frequency is used and any unseen
            n-gram makes the result 0.0.

    Returns:
        The sentence probability as a float; 1.0 for a sentence that yields
        no n-grams, 0.0 if there is nothing to normalise by.
    """
    counts = dict(zip(ngram_df['ngram'], ngram_df['frequency']))
    test_ngrams = generate_ngrams(tokens)

    denominator = total + smoothing * len(counts)
    if denominator <= 0:
        # Empty table (or no counts and no smoothing): only the empty
        # product is defined; any real n-gram cannot be scored.
        return 1.0 if not test_ngrams else 0.0

    sentence_probability = 1.0  # empty product
    for ngram in test_ngrams:
        sentence_probability *= (counts.get(ngram, 0) + smoothing) / denominator

    return sentence_probability

# Score the held-out review against the positive and the negative n-gram
# tables, then report both values.
pos_prob = calculate_sentence_probability(tokens, ngram_pos, total_pos)
neg_prob = calculate_sentence_probability(tokens, ngram_neg, total_neg)

print("Probability of the sentence with respect to the positive dataset: ", pos_prob)
print("Probability of the sentence with respect to the negative dataset:", neg_prob)

Probability of the sentence with respect to the positive dataset:  7.081412322944303e-20
Probability of the sentence with respect to the negative dataset: 6.279337062757202e-14


In [None]:
# Label the review with the class whose model gave it the higher score;
# identical scores are reported as neutral.
if pos_prob == neg_prob:
    print("Neutral Sentiment")
elif pos_prob > neg_prob:
    print("Positive Sentiment")
elif pos_prob < neg_prob:
    print("Negative Sentiment")

Negative Sentiment
