In [2]:
!pip3 install tmdbv3api tqdm nltk



In [None]:
import re
import os
import nltk
import pandas as pd
from tqdm import tqdm
from tmdbv3api import TMDb, Movie
from nltk.corpus import wordnet as wn


nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarveshmhadgut/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
tmdb = TMDb()
tmdb.api_key = os.environ.get("MY_API_KEY")
tmdb_movie = Movie()

In [None]:
def remove_duplicates_from_file(
    input_file="../datasets/reviews.txt", output_file="../datasets/cleaned_reviews.txt"
):
    unique_lines = set()

    with open(input_file, "r") as infile:
        lines = infile.readlines()

    with open(output_file, "w") as outfile:
        for line in lines:
            if line not in unique_lines:
                outfile.write(line)
                unique_lines.add(line)


remove_duplicates_from_file()

In [None]:
review_df = pd.read_csv(
    "../datasets/cleaned_reviews.txt", sep="\t", names=["Reviews", "Comments"]
)

In [6]:
movie_df = pd.read_csv("../datasets/final_data.csv")

In [7]:
movie_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
6133,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Family Action Fantasy Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...
6134,Paul Greengrass,Tom Hanks,Helena Zengel,unknown,Drama Western Adventure,news of the world,Tom Hanks Helena Zengel unknown Paul Greengras...
6135,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...
6136,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...


In [8]:
review_df

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,I liked the Da Vinci Code but it ultimatly did...
4,1,that's not even an exaggeration ) and at midni...
...,...,...
1246,0,Brokeback Mountain was boring.
1247,0,So Brokeback Mountain was really depressing.
1248,0,"As I sit here, watching the MTV Movie Awards, ..."
1249,0,Ok brokeback mountain is such a horrible movie.


In [25]:
def cast_sentiment_analysis(movie_name, casts, reviews=review_df):
    sentiment_dict = {}
    movie_tokens = re.sub(r"[^\w\s]", "", movie_name).strip().lower().split()
    cast_tokens = {
        cast: re.sub(r"[^\w\s]", "", cast).strip().lower().split() for cast in casts
    }

    cast_sentiments = {cast: {"pos": 0, "neg": 0} for cast in casts}

    for _, review in reviews.iterrows():
        review_content = review["Comments"].lower()

        movie_presence = all(
            re.search(rf"\b{re.escape(word)}\b", review_content)
            for word in movie_tokens
        )

        if not movie_presence:
            continue

        for cast, tokens in cast_tokens.items():
            cast_presence = any(
                re.search(rf"\b{re.escape(word)}\b", review_content) for word in tokens
            )

            if cast_presence:
                if review["Reviews"] == 1:
                    cast_sentiments[cast]["pos"] += 1
                elif review["Reviews"] == 0:
                    cast_sentiments[cast]["neg"] += 1

    for cast, counts in cast_sentiments.items():
        total = counts["pos"] + counts["neg"]
        sentiment_score = counts["pos"] / total if total > 0 else 0.5
        sentiment_dict[cast] = sentiment_score

    return sentiment_dict

In [11]:
for index, row in tqdm(
    movie_df.iterrows(), total=movie_df.shape[0], desc="Processing Movies"
):
    movie_name = row["movie_title"]
    casts = row[
        ["director_name", "actor_1_name", "actor_2_name", "actor_3_name"]
    ].tolist()

    casts = [re.sub(r"[^\w\s]", "", str(cast)).strip() for cast in casts]

    cast_sentiment = cast_sentiment_analysis(movie_name, casts)

    movie_df.at[index, "cast_sentiment"] = cast_sentiment

Processing Movies: 100%|██████████| 6138/6138 [01:41<00:00, 60.75it/s]


In [13]:
movie_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb,cast_sentiment
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...,"{'James Cameron': 0.5, 'CCH Pounder': 0.5, 'Jo..."
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...,"{'Gore Verbinski': 0.5, 'Johnny Depp': 0.5, 'O..."
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...,"{'Sam Mendes': 0.5, 'Christoph Waltz': 0.5, 'R..."
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...,"{'Christopher Nolan': 0.5, 'Tom Hardy': 0.5, '..."
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...,"{'Doug Walker': 0.5, 'Rob Walker': 0.5, 'unkno..."
...,...,...,...,...,...,...,...,...
6133,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Family Action Fantasy Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...,"{'Robert Rodriguez': 0.5, 'Priyanka Chopra Jon..."
6134,Paul Greengrass,Tom Hanks,Helena Zengel,unknown,Drama Western Adventure,news of the world,Tom Hanks Helena Zengel unknown Paul Greengras...,"{'Paul Greengrass': 0.5, 'Tom Hanks': 0.5, 'He..."
6135,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...,"{'Regina King': 0.5, 'Kingsley BenAdir': 0.5, ..."
6136,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...,"{'Emerald Fennell': 0.5, 'Carey Mulligan': 0.5..."


In [None]:
def get_synonym_set(word):
    synonym_set = set()
    synonym_set.add(word.lower())
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonym_set.add(lemma.name().lower())
    return synonym_set


def contains_synonym(text, target_word):
    synonyms = get_synonym_set(target_word)

    words_in_text = set(text.lower().split())

    return not synonyms.isdisjoint(words_in_text)


sentence = "The film was visually stunning, but the plot was a headache."
target_word = "visual"

result = contains_synonym(sentence, target_word)
print(result)

False


In [8]:
def keyword_sentiment_analysis(keywords, reviews=review_df):
    pos, neg = 0, 0
    keywords_clean = re.sub(r"[^\w\s]", "", str(keywords)).lower().split()
    keyword_set = get_synonym_set(keywords_clean)

    for _, review in reviews.iterrows():
        review_content = review["Comments"].lower()

        keyword_presence = any(
            re.search(rf"\b{re.escape(word)}\b", review_content) for word in keyword_set
        )

        if keyword_presence:
            if review["Reviews"] == 1:
                pos += 1
            elif review["Reviews"] == 0:
                neg += 1

    total = pos + neg
    return pos / total if total > 0 else 0.5

In [9]:
def generic_sentiment_analysis(reviews=review_df):
    generic_keywords = [
        "good",
        "bad",
        "average",
        "excellent",
        "poor",
        "awesome",
        "terrible",
    ]

    return keyword_sentiment_analysis(generic_keywords, reviews)

In [None]:
movie_df["plot_sentiment"] = None
movie_df["visuals_sentiment"] = None
movie_df["generic_sentiment"] = None

visuals_keywords_default = "animation visuals cinematography"

for index, row in tqdm(movie_df.iterrows(), total=movie_df.shape[0]):
    plot_keywords = row.get("plot_keywords", "")
    print(plot_keywords)
    plot_keywords = str(plot_keywords).strip()

    plot_sentiment = keyword_sentiment_analysis(plot_keywords) if plot_keywords else 0.5
    visuals_sentiment = keyword_sentiment_analysis(visuals_keywords_default)

    movie_df.at[index, "plot_sentiment"] = plot_sentiment
    movie_df.at[index, "visuals_sentiment"] = visuals_sentiment

    generic_sentiment = 1 - (plot_sentiment + visuals_sentiment) / 2
    movie_df.at[index, "generic_sentiment"] = generic_sentiment

In [19]:
movie_df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,comb,cast_sentiment,plot_sentiment,visuals_sentiment,generic_sentiment
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...,"{'James Cameron': 0.5, 'CCH Pounder': 0.5, 'Jo...",0.5,0.5,0.5
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...,"{'Gore Verbinski': 0.5, 'Johnny Depp': 0.5, 'O...",0.5,0.5,0.5
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...,"{'Sam Mendes': 0.5, 'Christoph Waltz': 0.5, 'R...",0.5,0.5,0.5
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...,"{'Christopher Nolan': 0.5, 'Tom Hardy': 0.5, '...",0.5,0.5,0.5
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...,"{'Doug Walker': 0.5, 'Rob Walker': 0.5, 'unkno...",0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...
6133,Robert Rodriguez,Priyanka Chopra Jonas,Pedro Pascal,YaYa Gosselin,Family Action Fantasy Comedy,we can be heroes,Priyanka Chopra Jonas Pedro Pascal YaYa Gossel...,"{'Robert Rodriguez': 0.5, 'Priyanka Chopra Jon...",0.5,0.5,0.5
6134,Paul Greengrass,Tom Hanks,Helena Zengel,unknown,Drama Western Adventure,news of the world,Tom Hanks Helena Zengel unknown Paul Greengras...,"{'Paul Greengrass': 0.5, 'Tom Hanks': 0.5, 'He...",0.5,0.5,0.5
6135,Regina King,Kingsley Ben-Adir,Eli Goree,Aldis Hodge,Drama,one night in miami...,Kingsley Ben-Adir Eli Goree Aldis Hodge Regina...,"{'Regina King': 0.5, 'Kingsley BenAdir': 0.5, ...",0.5,0.5,0.5
6136,Emerald Fennell,Carey Mulligan,Bo Burnham,Alison Brie,Thriller Crime Drama,promising young woman,Carey Mulligan Bo Burnham Alison Brie Emerald ...,"{'Emerald Fennell': 0.5, 'Carey Mulligan': 0.5...",0.5,0.5,0.5


In [22]:
movie_df.to_csv("../datasets/movie_sentiment_results.csv", index=False)