In [50]:
import pandas as pd

# Load the dataset
df = pd.read_csv("movies_with_summary.csv")

In [51]:
import pandas as pd

true_sentiment_df = pd.read_csv('moviesWithSentiments.csv', usecols=['true_sentiment'])

df['true_sentiment'] = true_sentiment_df['true_sentiment']

In [52]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)

# Drop rows with missing values in specific columns
df.dropna(subset=['Summary'], inplace=True)
df.dropna(subset=['Short Summary'], inplace=True)

# Drop duplicates (if needed)
df.drop_duplicates(inplace=True)

df = df[df['Runtime'] > 0]

Missing Values:
Title                0
Year                 0
Summary              5
Short Summary        1
Runtime              0
Rating               0
Movie Poster         0
true_sentiment    3838
dtype: int64


In [53]:
# Display the cleaned dataset
print("Cleaned Dataset:")
print(df.head())

Cleaned Dataset:
                                               Title  Year  \
0                        Patton Oswalt: Annihilation  2017   
1                                      New York Doll  2005   
2  Mickey's Magical Christmas: Snowed in at the H...  2001   
4                                      And Then I Go  2017   
5                           An Extremely Goofy Movie  2000   

                                             Summary  \
0  Patton Oswald, despite a personal tragedy, pro...   
1  A recovering alcoholic and recently converted ...   
2  After everyone is snowed in at the House of Mo...   
4  In the cruel world of junior high, Edwin suffe...   
5  It's a big time in Max's life. He's college bo...   

                                       Short Summary  Runtime  Rating  \
0  Patton Oswalt, despite a personal tragedy, pro...       66     7.4   
1  A recovering alcoholic and recently converted ...       75     7.9   
2  Mickey and all his friends hold their own Chri...  

**TextBlob model:**

In [61]:
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords and POS tagging data
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Define a list of neutral words and proper noun tags
NEUTRAL_WORDS = set(stopwords.words('english'))
PROPER_NOUN_TAGS = {'NNP', 'NNPS'}

# Function to preprocess text and remove neutral words and proper nouns
def preprocess_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Tag the tokens with parts-of-speech
    tagged_tokens = nltk.pos_tag(tokens)

    # Remove neutral words and proper nouns
    filtered_tokens = [word for word, tag in tagged_tokens if word.lower() not in NEUTRAL_WORDS and tag not in PROPER_NOUN_TAGS]

    # Join the filtered tokens back into a single string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

df['filtered_tokens'] = df['Summary'].apply(preprocess_text)

# Function to calculate sentiment polarity after preprocessing text
def calculate_sentiment_polarity(text):
    # Preprocess the text to remove neutral words and proper nouns
    preprocessed_text = preprocess_text(text)

    # Use TextBlob to calculate sentiment polarity
    blob = TextBlob(preprocessed_text)
    polarity = blob.sentiment.polarity

    return polarity

def classify_sentiment(polarity):
    if polarity == 0:
        return 'neutral'
    elif polarity > 0:
        return 'happy'
    else:
        return 'sad'

df['sentiment_polarity_textblob'] = df['filtered_tokens'].apply(calculate_sentiment_polarity)

# Apply sentiment classification to the evaluation movie summaries
df['Predicted_Sentiment_TextBlob'] = df['sentiment_polarity_textblob'].apply(classify_sentiment)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [62]:
h = 0
s = 0
n = 0

for sentiment in df['Predicted_Sentiment_TextBlob']:
    if sentiment == 'happy':
        h += 1
    elif sentiment == 'sad':
        s += 1
    else:
        n += 1

print(h, s, n)



2067 1371 255


**Vader pretrained model:**

In [59]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the NLTK stopwords data
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')

# Load the VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define a list of neutral words
NEUTRAL_WORDS = set(stopwords.words('english'))

# Function to preprocess text and remove neutral words
def preprocess_text(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove neutral words
    filtered_tokens = [word for word in tokens if word.lower() not in NEUTRAL_WORDS]

    # Join the filtered tokens back into a single string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

# Function to calculate sentiment polarity after preprocessing text
def calculate_sentiment_polarity(text):
    # Preprocess the text to remove neutral words
    preprocessed_text = preprocess_text(text)

    # Use VADER to calculate sentiment polarity
    polarity_scores = sia.polarity_scores(preprocessed_text)

    return polarity_scores['compound']

def classify_sentiment(polarity):
    if polarity == 0:
        return 'neutral'
    elif polarity > 0:
        return 'happy'
    else:
        return 'sad'

df['sentiment_polarity_vader'] = df['Summary'].apply(calculate_sentiment_polarity)

# Apply sentiment classification to the evaluation movie summaries
df['Predicted_Sentiment_Vader'] = df['sentiment_polarity_vader'].apply(classify_sentiment)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
h = 0
s = 0
n = 0

for sentiment in df['Predicted_Sentiment_Vader']:
    if sentiment == 'happy':
        h += 1
    elif sentiment == 'sad':
        s += 1
    else:
        n += 1

print(h, s, n)

1690 1886 117


**BERT fined tuned model for sentiment analysis:**

In [64]:
import pandas as pd
import torch
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Download NLTK stopwords
nltk.download('stopwords')

# Load BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("sbcBI/sentiment_analysis")
model = AutoModelForSequenceClassification.from_pretrained("sbcBI/sentiment_analysis")

# Define a list of neutral words
NEUTRAL_WORDS = set(stopwords.words('english'))

# Function to preprocess text and remove neutral words
def preprocess_text(text):
    # Remove neutral words
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in NEUTRAL_WORDS]
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

# Iterate through each movie summary
for index, row in df.iterrows():
    summary = row['Summary']

    # Preprocess the text to remove neutral words
    preprocessed_text = preprocess_text(summary)

    # Tokenize the preprocessed text
    inputs = tokenizer(preprocessed_text, return_tensors="pt", padding=True, truncation=True)

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get predicted sentiment
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    # Map predicted class to sentiment label
    sentiment_label = {0: 'happy', 1: 'sad', 2: 'neutral'}[predicted_class]

    # Update dataframe with sentiment label
    df.at[index, 'Predicted_Sentiment_BERT'] = sentiment_label


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
df.head()

Unnamed: 0,Title,Year,Summary,Short Summary,Runtime,Rating,Movie Poster,true_sentiment,filtered_tokens,sentiment_polarity_textblob,Predicted_Sentiment_TextBlob,sentiment_polarity_vader,Predicted_Sentiment_Vader,Predicted_Sentiment_BERT
0,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...","Patton Oswalt, despite a personal tragedy, pro...",66,7.4,https://hydramovies.com/wp-content/uploads/201...,happy,", despite personal tragedy , produces best sta...",0.625,happy,0.9339,happy,neutral
1,New York Doll,2005,A recovering alcoholic and recently converted ...,A recovering alcoholic and recently converted ...,75,7.9,https://hydramovies.com/wp-content/uploads/201...,sad,"recovering alcoholic recently converted , `` '...",-0.125,sad,-0.5106,sad,sad
2,Mickey's Magical Christmas: Snowed in at the H...,2001,After everyone is snowed in at the House of Mo...,Mickey and all his friends hold their own Chri...,65,6.8,https://hydramovies.com/wp-content/uploads/201...,happy,"everyone snowed , suggests throw party . Every...",0.65,happy,0.8352,happy,neutral
4,And Then I Go,2017,"In the cruel world of junior high, Edwin suffe...","In the cruel world of junior high, Edwin suffe...",99,7.6,https://hydramovies.com/wp-content/uploads/201...,sad,"cruel world junior high , suffers state anxiet...",-0.255,sad,-0.9403,sad,neutral
5,An Extremely Goofy Movie,2000,It's a big time in Max's life. He's college bo...,"Max goes to college, but to his embarassment h...",79,6.4,https://hydramovies.com/wp-content/uploads/201...,happy,'s big time 's life . 's college bound friends...,0.1375,happy,0.8316,happy,happy


In [68]:
# Save the DataFrame to a CSV file
df.to_csv('movies_with_predictions.csv', index=False)

**Results evaluation:**

In [66]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming your DataFrame containing the predicted sentiments is called df_predicted
# Extract the first 50 rows with no NaN values in the 'true_sentiment' column
df_first_50 = df.dropna(subset=['true_sentiment']).head(50)

# Extract ground truth labels and predicted labels
ground_truth_labels = df_first_50['true_sentiment']
predicted_labels_bert = df_first_50['Predicted_Sentiment_BERT']
predicted_labels_vader = df_first_50['Predicted_Sentiment_Vader']
predicted_labels_textblob = df_first_50['Predicted_Sentiment_TextBlob']

# Convert sentiment labels to numerical format
sentiment_label_mapping = {'happy': 0, 'sad': 1, 'neutral': 2}
ground_truth_numerical = [sentiment_label_mapping[label] for label in ground_truth_labels]

# Convert predicted labels to numerical format
predicted_labels_bert_numerical = [sentiment_label_mapping[label] for label in predicted_labels_bert]
predicted_labels_vader_numerical = [sentiment_label_mapping[label] for label in predicted_labels_vader]
predicted_labels_textblob_numerical = [sentiment_label_mapping[label] for label in predicted_labels_textblob]

# Calculate accuracy
accuracy_bert = accuracy_score(ground_truth_numerical, predicted_labels_bert_numerical)
accuracy_vader = accuracy_score(ground_truth_numerical, predicted_labels_vader_numerical)
accuracy_textblob = accuracy_score(ground_truth_numerical, predicted_labels_textblob_numerical)

# Generate confusion matrices
conf_matrix_bert = confusion_matrix(ground_truth_numerical, predicted_labels_bert_numerical)
conf_matrix_vader = confusion_matrix(ground_truth_numerical, predicted_labels_vader_numerical)
conf_matrix_textblob = confusion_matrix(ground_truth_numerical, predicted_labels_textblob_numerical)

# Generate classification reports
classification_report_bert = classification_report(ground_truth_numerical, predicted_labels_bert_numerical, target_names=['happy', 'sad', 'neutral'])
classification_report_vader = classification_report(ground_truth_numerical, predicted_labels_vader_numerical, target_names=['happy', 'sad', 'neutral'])
classification_report_textblob = classification_report(ground_truth_numerical, predicted_labels_textblob_numerical, target_names=['happy', 'sad', 'neutral'])

# Print or visualize the evaluation results
print("BERT Accuracy:", accuracy_bert)
print("BERT Confusion Matrix:\n", conf_matrix_bert)
print("BERT Classification Report:\n", classification_report_bert)

print("VADER Accuracy:", accuracy_vader)
print("VADER Confusion Matrix:\n", conf_matrix_vader)
print("VADER Classification Report:\n", classification_report_vader)

print("TextBlob Accuracy:", accuracy_textblob)
print("TextBlob Confusion Matrix:\n", conf_matrix_textblob)
print("TextBlob Classification Report:\n", classification_report_textblob)


BERT Accuracy: 0.3409090909090909
BERT Confusion Matrix:
 [[ 8  1  3]
 [ 7  1  6]
 [10  2  6]]
BERT Classification Report:
               precision    recall  f1-score   support

       happy       0.32      0.67      0.43        12
         sad       0.25      0.07      0.11        14
     neutral       0.40      0.33      0.36        18

    accuracy                           0.34        44
   macro avg       0.32      0.36      0.30        44
weighted avg       0.33      0.34      0.30        44

VADER Accuracy: 0.38636363636363635
VADER Confusion Matrix:
 [[9 3 0]
 [6 7 1]
 [9 8 1]]
VADER Classification Report:
               precision    recall  f1-score   support

       happy       0.38      0.75      0.50        12
         sad       0.39      0.50      0.44        14
     neutral       0.50      0.06      0.10        18

    accuracy                           0.39        44
   macro avg       0.42      0.44      0.35        44
weighted avg       0.43      0.39      0.32       