In [208]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [235]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [236]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [233]:
movies.columns

Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
       'tags'],
      dtype='object')

In [211]:
movies = movies.merge(credits, on='title')

In [212]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [213]:
movies.duplicated().sum()

np.int64(0)

In [214]:
movies.dropna(inplace=True)

In [None]:
# extract_Name a genre theke name nisi 
# extract_director a crew theke director ar Num nisi

In [215]:
def extract_names(obj, limit=None):
    """Extract names from JSON-like strings"""
    try:
        items = ast.literal_eval(obj)
        names = [item['name'] for i, item in enumerate(items) if limit is None or i < limit]
        return names
    except:
        return []

def extract_directors(obj):
    """Extract director names from crew JSON"""
    try:
        crew = ast.literal_eval(obj)
        return [item['name'] for item in crew if item['job'] == 'Director']
    except:
        return []

def clean_spaces(items):
    """Remove spaces from list of strings"""
    return [item.replace(" ", "") for item in items]

In [216]:
movies['genres'] = movies['genres'].apply(lambda x: extract_names(x))

In [217]:
movies['keywords'] = movies['keywords'].apply(lambda x: extract_names(x))

In [218]:
movies['cast'] = movies['cast'].apply(lambda x: extract_names(x, limit=3))

In [219]:
movies['crew'] = movies['crew'].apply(extract_directors)

In [220]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [221]:
movies['crew'] = movies['crew'].apply(clean_spaces)
movies['genres'] = movies['genres'].apply(clean_spaces)
movies['cast'] = movies['cast'].apply(clean_spaces)
movies['keywords'] = movies['keywords'].apply(clean_spaces)

In [222]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4806 non-null   int64 
 1   title     4806 non-null   object
 2   overview  4806 non-null   object
 3   genres    4806 non-null   object
 4   keywords  4806 non-null   object
 5   cast      4806 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB


CREW , genres , Cast , Keywords , Overview Use kore amara sudhu akta tags name column banabo


In [223]:
movies['tags'] = movies['crew'] + movies['genres'] + movies['cast'] + movies['keywords'] + movies['overview']

Movie dataFrame theke sudhu amra movie_id title tags use kore dataframe nisi

In [224]:
df = movies[['movie_id', 'title','tags']]

In [225]:
df['tags'] = df['tags'].apply(lambda x: " ".join(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: " ".join(x).lower())


Apply stemming to text

In [None]:
ps = PorterStemmer()

def stem_text(text):
    return " ".join([ps.stem(word) for word in text.split()])

df['tags'] = df['tags'].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem_text)


Bag-of-Words (BoW)

In [227]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()

In [228]:
similarity = cosine_similarity(vectors)

In [229]:
df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,jamescameron action adventur fantasi sciencefi...
1,285,Pirates of the Caribbean: At World's End,goreverbinski adventur fantasi action johnnyde...
2,206647,Spectre,sammend action adventur crime danielcraig chri...
3,49026,The Dark Knight Rises,christophernolan action crime drama thriller c...
4,49529,John Carter,andrewstanton action adventur sciencefict tayl...
...,...,...,...
4804,9367,El Mariachi,robertrodriguez action crime thriller carlosga...
4805,72766,Newlyweds,edwardburn comedi romanc edwardburn kerrybishé...
4806,231617,"Signed, Sealed, Delivered",scottsmith comedi drama romanc tvmovi ericmabi...
4807,126186,Shanghai Calling,danielhsia danielhenney elizacoup billpaxton w...


In [230]:
def recommend(movie, num_recommendations=5):
    """
    Get movie recommendations based on similarity
    
    Args:
        movie (str): Movie title to find recommendations for
        num_recommendations (int): Number of recommendations to return (default: 5)
    
    Returns:
        list: List of recommended movie titles
    """
    matching_movies = df[df['title'].str.lower() == movie.lower()]
    
    if matching_movies.empty:
        print(f"Movie '{movie}' not found in database")
        return []
    
    movie_index = matching_movies.index[0]
    distances = similarity[movie_index]
    movies_list = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])[1:num_recommendations+1]
    
    recommendations = [df.iloc[idx]['title'] for idx, _ in movies_list]
    return recommendations

def display_recommendations(movie, num_recommendations=5):
    """Display recommendations in a formatted way"""
    recommendations = recommend(movie, num_recommendations)
    
    if recommendations:
        print(f"\nMovies similar to '{movie}':\n")
        for i, rec_movie in enumerate(recommendations, 1):
            print(f"{i}. {rec_movie}")
    else:
        print(f"Could not find recommendations for '{movie}'")

In [240]:
display_recommendations("Newlyweds")


Movies similar to 'Newlyweds':

1. Bad Grandpa
2. How to Fall in Love
3. The R.M.
4. Time Changer
5. After Earth
