In [None]:
!pip install vaderSentiment
!pip install transformers
!pip install sentence_transformers




In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VaderSentiment
from nltk.corpus import sentiwordnet as swn
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Read data

In [None]:
movies = pd.read_csv(r"movies_data_updated.csv", encoding='utf-8', engine='python')
reviews = pd.read_csv(r"reviews_data.csv", encoding='utf-8', engine='python')

In [None]:
movies = movies.drop(columns=['keywords'])

In [None]:
movies.isna().sum()

id                         0
title                      1
vote_average               0
vote_count                 0
budget                     0
overview                  51
tagline                 2091
genres                     2
production_companies       2
production_countries       2
spoken_languages           2
cast                      62
director                  43
dtype: int64

# Preprocess data

In [None]:

# Convert budget to numeric; unparseable values become NaN and are then treated as 0.
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce').fillna(0)

# Bin edges and the label assigned to each bin.
bins = [0, 2000000, 49000000, float('inf')]
labels = ['low budget', 'mid budget', 'high budget']

# include_lowest=True closes the first bin on the left, so a budget of exactly 0
# (which includes every value filled in above) lands in 'low budget' instead of NaN.
movies['budget_category'] = pd.cut(
    movies['budget'], bins=bins, labels=labels, include_lowest=True
)

In [None]:
def split_and_remove_spaces(value, delimiter=' '):
    """Split a string on ``delimiter`` and delete every space inside each piece.

    Non-string values (for example NaN or an already-split list) are returned
    unchanged.
    """
    if not isinstance(value, str):
        return value
    pieces = value.split(delimiter)
    return [piece.replace(" ", "") for piece in pieces]


# Free-text columns: split into words on single spaces.
space_split_columns = ['overview']

# List-like text columns: split on ", " so each entry becomes one item.
comma_space_split_columns = [
    'cast',
    'director',
    'tagline',
    'production_companies',
    'production_countries',
    'spoken_languages',
]

# Pair every column with its delimiter, then split them all in one loop
# (space-split columns first, comma-split columns second).
split_plan = (
    [(name, ' ') for name in space_split_columns]
    + [(name, ', ') for name in comma_space_split_columns]
)
for column, delimiter in split_plan:
    movies[column] = movies[column].apply(split_and_remove_spaces, delimiter=delimiter)

In [None]:
# Apply the transformations
# NOTE(review): this repeats the two loops from the previous cell. When a cell value is
# no longer a str (e.g. it is already a list), split_and_remove_spaces returns it
# unchanged, so a second pass over already-split columns leaves them as they were.
for column in space_split_columns:
    movies[column] = movies[column].apply(lambda x: split_and_remove_spaces(x, ' '))

for column in comma_space_split_columns:
    movies[column] = movies[column].apply(lambda x: split_and_remove_spaces(x, ', '))

In [None]:
def remove_spaces(value):
    """Delete every space from a string, or from each item of a list.

    Values that are neither ``list`` nor ``str`` are handed back untouched.
    """
    strip_spaces = lambda s: s.replace(" ", "")
    if isinstance(value, list):
        return list(map(strip_spaces, value))
    return strip_spaces(value) if isinstance(value, str) else value

In [None]:

# Columns whose values (strings or lists of strings) get every space removed.
columns_to_clean = [
    'cast', 'director', 'tagline', 'production_companies',
    'production_countries', 'spoken_languages'
]

# remove_spaces leaves values that are neither list nor str (e.g. NaN) unchanged.
for column in columns_to_clean:
    movies[column] = movies[column].apply(remove_spaces)

In [None]:
# Ensure id columns are of the same type, convert to int.
# Values that cannot be parsed as numbers become NaN and are then replaced by 0.
# NOTE(review): because both sides map bad ids to 0, a movie with an unparseable id
# would be joined to every review with an unparseable movie_id — confirm that neither
# file contains such rows.
movies['id'] = pd.to_numeric(movies['id'], errors='coerce').fillna(0).astype(int)
reviews['movie_id'] = pd.to_numeric(reviews['movie_id'], errors='coerce').fillna(0).astype(int)

# Merge movie and review data (pd.merge default join: only ids present on both sides are kept)
combined_data = pd.merge(movies, reviews, left_on='id', right_on='movie_id')

In [None]:
def preprocess_text(text):
    """Normalise review text before sentiment scoring.

    String input is lower-cased, tokenised, filtered (English stop words and words
    without any SentiWordNet entry are dropped), Porter-stemmed and re-joined with
    single spaces. Any non-string input (for example a float) is returned as is.
    """
    if not isinstance(text, str):
        return text

    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    kept_stems = []
    for token in tokens:
        if token in english_stop_words:
            continue
        # Keep only words that have at least one SentiWordNet synset.
        if not any(list(swn.senti_synsets(token))):
            continue
        kept_stems.append(porter.stem(token))

    return ' '.join(kept_stems)

In [None]:
# Aggregate reviews for each movie into one text blob, then normalise it.
# Missing review bodies (NaN) are dropped before joining: str.join raises TypeError
# as soon as one item is not a string.
aggregated_reviews = (
    combined_data.groupby('id')['content']
    .apply(lambda contents: ' '.join(contents.dropna().astype(str)))
    .reset_index()
)
aggregated_reviews['content'] = aggregated_reviews['content'].apply(preprocess_text)


In [None]:
aggregated_reviews.head()

Unnamed: 0,id,content
0,11,write review theme music begin build mind well...
1,12,one best anim film ever seen great charact amu...
2,13,best movi ever best movi ever
3,14,film american beauti film purpos purpos life f...
4,15,greatest movi time gener opinion movi agre gre...


In [None]:
# Replaces the in-memory aggregated_reviews frame with one loaded from a CSV file.
# NOTE(review): the path is absolute (/content/...) and no cell visible here writes
# this file — confirm where it is produced.
aggregated_reviews=pd.read_csv("/content/aggregated_reviews.csv")

# Calculate sentiment score

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis", model="siebert/sentiment-roberta-large-english")

# Created once and reused, instead of constructing a new analyzer for every review.
vader_analyzer = VaderSentiment()

def enhanced_sentiment_model(text):
    """Blend three sentiment scores for ``text`` into one number.

    The result is 0.3 * TextBlob polarity + 0.3 * VADER compound score
    + 0.4 * signed transformer score (the pipeline score, negated when the
    predicted label is 'NEGATIVE').

    Parameters
    ----------
    text : str or any
        Review text. Non-string values are returned unchanged.

    Returns
    -------
    float, or the input itself when it is not a string.
    """
    if not isinstance(text, str):
        return text  # Handle non-string inputs

    # TextBlob sentiment
    textblob_sentiment = TextBlob(text).sentiment.polarity

    # VADER sentiment (compound score)
    vader_sentiment = vader_analyzer.polarity_scores(text)['compound']

    # Hugging Face Transformers sentiment.
    # Let the tokenizer truncate to 512 *tokens*; slicing the string with text[:512]
    # would cut at 512 characters and discard most of a long review.
    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]
    huggingface_sentiment = result['score']
    if result['label'] == 'NEGATIVE':
        huggingface_sentiment = -huggingface_sentiment  # Adjust score for negative sentiment

    # Combine sentiment scores (weights can be adjusted as needed)
    return 0.3 * textblob_sentiment + 0.3 * vader_sentiment + 0.4 * huggingface_sentiment

In [None]:

# Apply sentiment analysis to the aggregated reviews (one combined score per movie id)
aggregated_reviews['sentiment'] = aggregated_reviews['content'].apply(enhanced_sentiment_model)

# Left-merge the scores onto the movie table so movies without reviews are kept.
# The suffixes only matter when `movies` already has a 'sentiment' column: that existing
# column is then renamed 'sentiment_original' and the new score keeps the plain name.
movies_model = pd.merge(movies, aggregated_reviews[['id', 'sentiment']], on='id', how='left', suffixes=('_original', ''))

# Fill any NaN values in the sentiment column with 0 (movies that had no score)
movies_model['sentiment'] = movies_model['sentiment'].fillna(0)

In [None]:
movies_model.to_csv("sentiment_reviews.csv", index=False)

In [None]:
movies_model=pd.read_csv("/content/sentiment_reviews.csv")

In [None]:
movies_model.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],"[OwenTeague(Noa), FreyaAllan(Nova/Mae), KevinD...",[WesBall],high budget,0.761463
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],"[KirstenDunst(Lee), WagnerMoura(Joel), CaileeS...",[AlexGarland],high budget,0.114547
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],"[RebeccaHall(Dr.IleneAndrews), BrianTyreeHenry...",[AdamWingard],high budget,0.711782
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],"[HarrietSlater(Haley), AdainBradley(Grant), Av...","[SpenserCohen, AnnaHalberg]",mid budget,-0.640762
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],"[JenniferLopez(AtlasShepherd), SimuLiu(HarlanS...",[BradPeyton],low budget,-0.036062


# Combine movie tags

In [None]:
# Work on a copy: a plain assignment would only alias movies_model, so the 'tags'
# column written below would also appear in movies_model and in every other name
# bound to that same frame.
movies = movies_model.copy()

Tag with cast and director

In [None]:
# Build one "tags" string per movie: the listed columns, each cast to str,
# joined in this order with a single space between them.
tag_source_columns = [
    'overview', 'genres', 'cast', 'tagline', 'director',
    'production_companies', 'production_countries', 'spoken_languages',
    'budget', 'sentiment',
]
tag_text = movies[tag_source_columns[0]].astype(str)
for column_name in tag_source_columns[1:]:
    tag_text = tag_text + ' ' + movies[column_name].astype(str)
movies['tags'] = tag_text

In [None]:
movies.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment_original,sentiment,tags
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],"[OwenTeague(Noa), FreyaAllan(Nova/Mae), KevinD...",[WesBall],high budget,0.985279,0.998688,"['Several', 'generations', 'in', 'the', 'futur..."
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],"[KirstenDunst(Lee), WagnerMoura(Joel), CaileeS...",[AlexGarland],high budget,0.978631,0.998684,"['In', 'the', 'near', 'future,', 'a', 'group',..."
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],"[RebeccaHall(Dr.IleneAndrews), BrianTyreeHenry...",[AdamWingard],high budget,0.518629,0.998841,"['Following', 'their', 'explosive', 'showdown,..."
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],"[HarrietSlater(Haley), AdainBradley(Grant), Av...","[SpenserCohen, AnnaHalberg]",mid budget,0.999226,0.998179,"['When', 'a', 'group', 'of', 'friends', 'reckl..."
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],"[JenniferLopez(AtlasShepherd), SimuLiu(HarlanS...",[BradPeyton],low budget,0.997686,0.990291,"['A', 'brilliant', 'counterterrorism', 'analys..."


In [None]:
# Turn the stringified lists inside 'tags' into plain sentences: every quote, comma
# and square bracket becomes a space. One vectorised regex pass replaces the previous
# row-by-row loop (which also contained a stray backtick that made the cell unparseable).
movies['tags'] = movies['tags'].str.replace(r"[',\[\]]", " ", regex=True)

In [None]:
movies.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment_original,sentiment,tags
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],"[OwenTeague(Noa), FreyaAllan(Nova/Mae), KevinD...",[WesBall],high budget,0.985279,0.998688,Several generations in the futur...
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],"[KirstenDunst(Lee), WagnerMoura(Joel), CaileeS...",[AlexGarland],high budget,0.978631,0.998684,In the near future a group ...
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],"[RebeccaHall(Dr.IleneAndrews), BrianTyreeHenry...",[AdamWingard],high budget,0.518629,0.998841,Following their explosive showdown ...
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],"[HarrietSlater(Haley), AdainBradley(Grant), Av...","[SpenserCohen, AnnaHalberg]",mid budget,0.999226,0.998179,When a group of friends reckl...
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],"[JenniferLopez(AtlasShepherd), SimuLiu(HarlanS...",[BradPeyton],low budget,0.997686,0.990291,A brilliant counterterrorism analys...


In [None]:
ps=PorterStemmer()
def stem(text):
    """Return ``text`` with each whitespace-separated word replaced by its Porter stem.

    Words are re-joined with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


In [None]:
movies['tags']=movies['tags'].apply(stem)

In [None]:
movies['tags'][0]

'sever gener in the futur follow "caesar s" reign ape are now the domin speci and live harmoni while human have been reduc to live in the shadows. as a new tyrann ape leader build hi empir one young ape undertak a harrow journey that will caus him to question all that he ha known about the past and to make choic that will defin a futur for ape and human alike. scienc fiction adventur action owenteague(noa) freyaallan(nova/mae) kevindurand(proximuscaesar) petermacon(raka) williamh.macy(trevathan) ekadarville(sylva) travisjeffery(anaya) lydiapeckham(soona) neilsandilands(koro) "ras-samuelwelda abzgi(lightning)" sarawiseman(dar) kadenhartcher(oda/rust) andymcphee(honoredelder) ninagallas(youngster#1) samuelfalé(youngster#2) dichenlachman(korina) virginielaverdure(leadtech) markushamilton(tech#1) benjaminscott(tech#2) nirishbhatsurambadka(youngster#3) francesberry(laika) peterhayes(feralhuman#1) shereedacosta(feralhuman#2) souleymanediasse(feralhuman#3) olgamiller(feralhuman#4) dmitriymill

In [None]:
movies.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment_original,sentiment,tags
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],"[OwenTeague(Noa), FreyaAllan(Nova/Mae), KevinD...",[WesBall],high budget,0.985279,0.998688,"sever gener in the futur follow ""caesar s"" rei..."
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],"[KirstenDunst(Lee), WagnerMoura(Joel), CaileeS...",[AlexGarland],high budget,0.978631,0.998684,in the near futur a group of war journalist at...
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],"[RebeccaHall(Dr.IleneAndrews), BrianTyreeHenry...",[AdamWingard],high budget,0.518629,0.998841,follow their explos showdown godzilla and kong...
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],"[HarrietSlater(Haley), AdainBradley(Grant), Av...","[SpenserCohen, AnnaHalberg]",mid budget,0.999226,0.998179,when a group of friend recklessli violat the s...
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],"[JenniferLopez(AtlasShepherd), SimuLiu(HarlanS...",[BradPeyton],low budget,0.997686,0.990291,a brilliant counterterror analyst with a deep ...


# Train model

Cosine similarity

In [None]:
# Vectorize the tags for cosine similarity calculation
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['tags'])
# Pairwise similarity between all movies; row i corresponds to row position i of `movies`.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_movies(title, movies_df=movies, cosine_sim=cosine_sim, top_n=10):
    """Recommend the ``top_n`` movies whose TF-IDF tags are closest to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up. If it is not an exact match, the movie whose tags
        are most similar to the typed text is used instead and a hint is printed.
    movies_df : DataFrame
        Must contain 'title' and 'sentiment'; rows are assumed to be in the same
        order as the rows of ``cosine_sim``.
    cosine_sim : 2-D array
        Pairwise similarity matrix.
    top_n : int
        Number of recommendations.

    Returns
    -------
    Series of titles, ordered by descending sentiment score.
    """
    # Work with row *positions* throughout: cosine_sim is positional, so using index
    # labels would pick the wrong row whenever movies_df does not have a 0..n-1 index.
    exact_matches = np.flatnonzero((movies_df['title'] == title).to_numpy())
    if exact_matches.size:
        query_pos = int(exact_matches[0])
    else:
        # Find the closest movie by comparing the typed text with the tag vectors
        title_tfidf = tfidf.transform([title])
        title_cosine_sim = cosine_similarity(title_tfidf, tfidf_matrix)
        query_pos = int(title_cosine_sim.argmax())
        similar_title = movies_df.iloc[query_pos]['title']
        print(f"Movie '{title}' not found. Did you mean '{similar_title}'?")

    scores = np.asarray(cosine_sim[query_pos], dtype=float)
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query movie explicitly rather than assuming it is ranked first:
    # another row with identical tags (e.g. a duplicate entry) can tie with it.
    ranked = ranked[ranked != query_pos][:top_n]
    recommended_movies = movies_df.iloc[ranked]

    # Further rank the recommended movies based on sentiment analysis score
    recommended_movies = recommended_movies.sort_values(by='sentiment', ascending=False)

    return recommended_movies['title']

SentenceTransformer

In [None]:
# Imported here because this is the first cell that uses SentenceTransformer;
# without it the cell raises NameError on a fresh kernel.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L12-v2')
# One embedding per movie, in the same row order as `movies`.
embeddings = model.encode(movies['tags'].tolist(), show_progress_bar=True)

In [None]:
def recommend_movies(prompt_description, model, embeddings, df, top_n=10):
    """Recommend movies whose tag embeddings are closest to a given movie.

    Parameters
    ----------
    prompt_description : str
        A movie title. If it is not an exact match, the title with the most similar
        embedding is used instead and a hint is printed.
    model : object with an ``encode(list_of_str)`` method
        Sentence-embedding model.
    embeddings : 2-D array
        Tag embeddings; row i is assumed to belong to row position i of ``df``
        (TODO confirm at the call site).
    df : DataFrame
        Must contain 'title' and 'tags'.
    top_n : int
        Number of recommendations.

    Returns
    -------
    Series of recommended titles, most similar first. The query movie itself is
    never part of the result.
    """
    titles = df['title']
    exact_matches = np.flatnonzero((titles == prompt_description).to_numpy())
    if exact_matches.size:
        query_pos = int(exact_matches[0])
    else:
        # Find the closest title by embedding similarity on the 'title' column.
        # Missing titles are encoded as empty strings so encode() only receives str.
        title_embeddings = model.encode(titles.fillna('').astype(str).tolist())
        prompt_embedding = model.encode([prompt_description])
        title_cosine_sim = cosine_similarity(prompt_embedding, title_embeddings).flatten()
        query_pos = int(title_cosine_sim.argmax())
        most_similar_title = df.iloc[query_pos]['title']
        print(f"Movie '{prompt_description}' not found. Did you mean '{most_similar_title}'?")

    # query_pos is a row position (argmax / flatnonzero), so it is read with iloc;
    # df.loc would treat it as an index label.
    query_tags = df.iloc[query_pos]['tags']

    # Generate embedding for the movie's tags
    prompt_embedding = model.encode([query_tags])

    # Compute cosine similarity between the prompt and all movie descriptions
    cosine_similarities = cosine_similarity(prompt_embedding, embeddings).flatten()

    # Rank by similarity and drop the query movie, which would otherwise be its own
    # best match.
    ranked = np.argsort(-cosine_similarities, kind='stable')
    top_indices = ranked[ranked != query_pos][:top_n]

    # Retrieve the corresponding titles
    return df['title'].iloc[top_indices]

# Load model

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Rebinds `model` to a freshly loaded sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L12-v2')

# Load previously saved embeddings from disk (nothing is generated in this cell).
# NOTE(review): no cell visible here writes 'movie_embeddings.npy' — confirm that its
# rows match the rows of the CSV loaded below.
embeddings = np.load('movie_embeddings.npy')
# NOTE(review): absolute path on a local Windows drive; this will not resolve on
# another machine.
df = pd.read_csv(r'e:\program\pythonProject\Movie sreaching alo\CleanedTMDB1000.csv')

# RUN

raw **hti**


In [None]:


# NOTE(review): this one-argument call only matches the TF-IDF definition of
# recommend_movies; the later definitions in this notebook also require model,
# embeddings and df, so the result depends on which definition was run last.
recommended_movies = recommend_movies('Civil War')
print(recommended_movies)

6412                   La Dolce Vita
7950    Mr. Smith Goes to Washington
318                       Casablanca
7939           The War of the Worlds
7941           The War of the Worlds
7068                  Ocean's Eleven
8131                        Saboteur
788        Spider-Man: Far From Home
5675                As the Gods Will
88                      Citizen Kane
Name: title, dtype: object


In [None]:


# NOTE(review): same one-argument call as the previous cell. The TF-IDF
# recommend_movies returns a Series of titles that is already ordered by sentiment;
# sort_values(by=...) is a DataFrame argument — verify this line against the pandas
# version in use.
recommended_movies = recommend_movies('Civil War')
print(recommended_movies.sort_values(by='sentiment', ascending=False))

318                       Casablanca
88                      Citizen Kane
7068                  Ocean's Eleven
7939           The War of the Worlds
7941           The War of the Worlds
6412                   La Dolce Vita
7950    Mr. Smith Goes to Washington
8131                        Saboteur
788        Spider-Man: Far From Home
5675                As the Gods Will
Name: title, dtype: object


In [None]:
reviews.head()

Unnamed: 0,movie_id,author,content,rating
0,653346,Manuel São Bento,FULL SPOILER-FREE REVIEW @ https://fandomwire....,7.0
1,653346,CinemaSerf,Quick question. So it was a virus that led to ...,7.0
2,653346,r96sk,"Has its moments, though overall I kinda found ...",6.0
3,653346,Midi-chlorian_Count,Just seen this and thought it was a pretty goo...,6.0
4,653346,Hotplix,"""Kingdom of the Planet of the Apes"" is a thril...",8.0


# **Tags without cast and director**

In [None]:
movies_none=pd.read_csv("/content/sentiment_reviews.csv")

In [None]:
# Work on a copy: a plain assignment would only alias movies_model, so the columns
# rewritten in this section ('tags', 'cast', 'director') would also change in every
# other name bound to that same frame.
movies_none = movies_model.copy()

In [None]:
# Tags for the "without cast and director" variant: cast and director are left out of
# the tag text because this section embeds them separately (embeddings_cast /
# embeddings_director) and weights them on their own.
movies_none['tags'] = (movies_none['overview'].astype(str) + ' ' +
                  movies_none['genres'].astype(str) + ' ' +
                  movies_none['tagline'].astype(str) + ' ' +
                  movies_none['production_companies'].astype(str) + ' ' +
                  movies_none['production_countries'].astype(str) + ' ' +
                  movies_none['spoken_languages'].astype(str) + ' ' +
                  movies_none['budget'].astype(str) + ' ' +
                  movies_none['sentiment'].astype(str))

In [None]:
movies_none['tags']=movies_none['tags'].apply(stem)

In [None]:
# Replace every quote, comma and square bracket in 'tags' with a space.
# A single vectorised regex pass does the same substitutions as the row-by-row
# iterrows() loop without the per-row overhead.
movies_none['tags'] = movies_none['tags'].str.replace(r"[',\[\]]", " ", regex=True)

In [None]:
movies_none.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment,tags
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],"[OwenTeague(Noa), FreyaAllan(Nova/Mae), KevinD...",[WesBall],high budget,0.761463,several generations in the futur...
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],"[KirstenDunst(Lee), WagnerMoura(Joel), CaileeS...",[AlexGarland],high budget,0.114547,in the near future a group ...
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],"[RebeccaHall(Dr.IleneAndrews), BrianTyreeHenry...",[AdamWingard],high budget,0.711782,following their explosive showdown ...
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],"[HarrietSlater(Haley), AdainBradley(Grant), Av...","[SpenserCohen, AnnaHalberg]",mid budget,-0.640762,when a group of friends reckl...
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],"[JenniferLopez(AtlasShepherd), SimuLiu(HarlanS...",[BradPeyton],low budget,-0.036062,a brilliant counterterrorism analys...


In [None]:
# NOTE(review): `ps` and `stem` are defined with the same body earlier in this
# notebook; this cell re-declares them.
ps=PorterStemmer()
def stem(text):
    """Return ``text`` with each whitespace-separated word Porter-stemmed, joined by spaces."""
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)


In [None]:
# Clean a column of stringified lists into plain text
def preprocess_column(column):
    """Strip list punctuation and parenthesised parts from a string Series.

    Applied in order: square brackets are deleted, commas become spaces, each
    ``(...)`` group (shortest match) is deleted, and leading/trailing whitespace is
    trimmed. Whitespace inside the text is left as is.
    """
    return (
        column
        .str.replace(r"[\[\]]", "", regex=True)
        .str.replace(",", " ", regex=False)
        .str.replace(r"\(.*?\)", "", regex=True)
        .str.strip()
    )

In [None]:
# Fill missing values *before* casting to str: astype(str) turns NaN into the literal
# text 'nan', after which fillna('') has nothing left to replace.
movies_none['cast'] = movies_none['cast'].fillna('').astype(str)
movies_none['director'] = movies_none['director'].fillna('').astype(str)

In [None]:
movies_none['tags']=movies_none['tags'].apply(stem)
movies_none['cast']=preprocess_column(movies_none['cast'])
movies_none['director']=preprocess_column(movies_none['director'])

In [None]:
movies_none['tags'][0]

'sever gener in the futur follow "caesar s" reign ape are now the domin speci and live harmoni while human have been reduc to live in the shadows. as a new tyrann ape leader build hi empir one young ape undertak a harrow journey that will caus him to question all that he ha known about the past and to make choic that will defin a futur for ape and human alike. scienc fiction adventur action owenteague(noa) freyaallan(nova/mae) kevindurand(proximuscaesar) petermacon(raka) williamh.macy(trevathan) ekadarville(sylva) travisjeffery(anaya) lydiapeckham(soona) neilsandilands(koro) "ras-samuelwelda abzgi(lightning)" sarawiseman(dar) kadenhartcher(oda/rust) andymcphee(honoredelder) ninagallas(youngster#1) samuelfalé(youngster#2) dichenlachman(korina) virginielaverdure(leadtech) markushamilton(tech#1) benjaminscott(tech#2) nirishbhatsurambadka(youngster#3) francesberry(laika) peterhayes(feralhuman#1) shereedacosta(feralhuman#2) souleymanediasse(feralhuman#3) olgamiller(feralhuman#4) dmitriymill

In [None]:
movies_none.head()

Unnamed: 0,id,title,vote_average,vote_count,budget,overview,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,budget_category,sentiment,tags
0,653346,Kingdom of the Planet of the Apes,6.924,805,160000000.0,"[Several, generations, in, the, future, follow...",[Noonecanstopthereign.],"['Science Fiction', 'Adventure', 'Action']","[['20thCenturyStudios', 'OddballEntertainment'...",[['UnitedStatesofAmerica']],[['English']],'OwenTeague' 'FreyaAllan' 'KevinDurand' 'Pe...,'WesBall',high budget,0.761463,"sever gener in the futur follow ""caesar s"" rei..."
1,929590,Civil War,7.075,1285,50000000.0,"[In, the, near, future,, a, group, of, war, jo...",[Welcometothefrontline.],"['War', 'Action', 'Drama']","[['DNAFilms', 'IPR.VC', 'A24']]","[['Finland', 'UnitedKingdom', 'UnitedStatesofA...",[['English']],'KirstenDunst' 'WagnerMoura' 'CaileeSpaeny' ...,'AlexGarland',high budget,0.114547,in the near futur a group of war journalist at...
2,823464,Godzilla x Kong: The New Empire,7.23,2506,150000000.0,"[Following, their, explosive, showdown,, Godzi...",[Risetogetherorfallalone.],"['Science Fiction', 'Action', 'Adventure']",[['LegendaryPictures']],[['UnitedStatesofAmerica']],[['English']],'RebeccaHall' 'BrianTyreeHenry' 'DanStevens'...,'AdamWingard',high budget,0.711782,follow their explos showdown godzilla and kong...
3,719221,Tarot,6.497,334,8000000.0,"[When, a, group, of, friends, recklessly, viol...",[Yourfateisinthecards.],"['Horror', 'Thriller']","[['ScreenGems', 'AlloyEntertainment', 'GroundC...",[['UnitedStatesofAmerica']],[['English']],'HarrietSlater' 'AdainBradley' 'Avantika' '...,'SpenserCohen' 'AnnaHalberg',mid budget,-0.640762,when a group of friend recklessli violat the s...
4,614933,Atlas,6.733,600,100.0,"[A, brilliant, counterterrorism, analyst, with...",[Thefutureofhumanityisinherhands.],"['Science Fiction', 'Action']","[['SafehousePictures', 'ASAPEntertainment', 'N...",[['UnitedStatesofAmerica']],[['English']],'JenniferLopez' 'SimuLiu' 'SterlingK.Brown' ...,'BradPeyton',low budget,-0.036062,a brilliant counterterror analyst with a deep ...


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

In [None]:
# Load the model and encode the tags, cast, and actors
model = SentenceTransformer('all-MiniLM-L12-v2')
embeddings_tags = model.encode(movies_none['tags'].tolist(), show_progress_bar=True)
embeddings_cast = model.encode(movies_none['cast'].tolist(), show_progress_bar=True)
embeddings_director = model.encode(movies_none['director'].tolist(), show_progress_bar=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [None]:
def recommend_movies(prompt_description, model, embeddings_tags, embeddings_cast, embeddings_actors, df, top_n=10,
                     weights=(0.0, 0.0, 1.0)):
    """Recommend movies by a weighted mix of cast, director and tag similarity.

    Parameters
    ----------
    prompt_description : str
        A movie title. If it is not an exact match, the title with the most similar
        embedding is used instead and a hint is printed.
    model : object with an ``encode(list_of_str)`` method
        Sentence-embedding model.
    embeddings_tags, embeddings_cast : 2-D arrays
        Embeddings of the 'tags' and 'cast' columns.
    embeddings_actors : 2-D array
        Embeddings compared against the movie's 'director' text (the parameter name
        is kept for existing callers, which pass the director embeddings here).
    df : DataFrame
        Must contain 'title', 'tags', 'cast', 'director' and 'sentiment'. Rows are
        assumed to be in the same order as the rows of each embedding matrix
        (TODO confirm at the call site).
    top_n : int
        Number of recommendations.
    weights : tuple of three floats
        ``(cast_weight, director_weight, tags_weight)``. The default ranks by tags
        only, which is what the previously hard-coded 0 / 0 / 1 factors did.

    Returns
    -------
    DataFrame with columns 'title' and 'sentiment', most similar first. The query
    movie itself is never part of the result.
    """
    titles = df['title']
    exact_matches = np.flatnonzero((titles == prompt_description).to_numpy())
    if exact_matches.size:
        query_pos = int(exact_matches[0])
    else:
        # Find the closest title by embedding similarity on the 'title' column.
        # Missing titles are encoded as empty strings so encode() only receives str.
        title_embeddings = model.encode(titles.fillna('').astype(str).tolist())
        prompt_embedding = model.encode([prompt_description])
        title_cosine_sim = cosine_similarity(prompt_embedding, title_embeddings).flatten()
        query_pos = int(title_cosine_sim.argmax())
        most_similar_title = df.iloc[query_pos]['title']
        print(f"Movie '{prompt_description}' not found. Did you mean '{most_similar_title}'?")

    # query_pos is a row position, so the row is read with iloc rather than loc.
    query_row = df.iloc[query_pos]

    cast_weight, director_weight, tags_weight = weights
    # (weight, query text, candidate embeddings) for each field. The director field is
    # compared against the embeddings_actors argument, not a notebook-level global.
    field_plan = (
        (tags_weight, query_row['tags'], embeddings_tags),
        (cast_weight, query_row['cast'], embeddings_cast),
        (director_weight, query_row['director'], embeddings_actors),
    )

    combined_similarities = np.zeros(len(df), dtype=float)
    for weight, query_text, candidate_embeddings in field_plan:
        if weight == 0:
            continue  # a zero-weight field cannot change the ranking; skip encoding it
        field_embedding = model.encode([query_text])
        field_similarities = cosine_similarity(field_embedding, candidate_embeddings).flatten()
        combined_similarities += weight * field_similarities

    # Rank by combined similarity and drop the query movie, which would otherwise be
    # its own best match.
    ranked = np.argsort(-combined_similarities, kind='stable')
    top_indices = ranked[ranked != query_pos][:top_n]

    # Retrieve the corresponding titles and sentiment scores
    return df.iloc[top_indices][['title', 'sentiment']]

In [None]:
recommendations = recommend_movies('Civil War', model, embeddings_tags, embeddings_cast, embeddings_director, movies_none, top_n=10)
print(recommendations.sort_values(by='sentiment', ascending=False))


in the near futur a group of war journalist attempt to surviv while report the truth as the unit state stand on the brink of civil war. war action drama kirstendunst(lee) wagnermoura(joel) caileespaeny(jessie) stephenmckinleyhenderson(sammy) nelsonlee(tony) nickofferman(president) jeffersonwhite(dave) evanlai(bohai) vincepisani(concierge) justinjamesboykin(americansoldier(middleeast)) jessmatney(checkpointsoldier) greghill(pete) edmunddonovan(eddie) sonoyamizuno(anya) timjames(hangingcaptive) simeonfreeman(commercialsoldiermike) jamesyaegashi(commercialcorporal) deangrimes(commericalsoldier#1) alexamansour(aidworkerrefugeecamp) marthab.knighton(elderlywoman) melissasaint-amand(shopassistant) karlglusman(spotter) jinha(sniper) jojot.gibbs(wfwhitehousesergeant) jaredshaw(wfwhitehouseassaulter#1) justingarza(wfwhitehouseassaulter#2) brianphilpot(wfwhitehouseassaulter#3) tywauntornes(wfwhitehouseassaulter#4) juanifeliz(joybutler) jesseplemons(militiasoldier(uncredited)) jeffbosley(sfcbrown