In [26]:
import pandas as pd
import pickle

# Load the data
df = pd.read_csv('interpret.csv')

# Ensure that 'rating' is numeric (float)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Drop rows where rating might be NaN after conversion (if any)
df.dropna(subset=['rating'], inplace=True)

# Sort the DataFrame by 'rating' in descending order
sorted_df = df.sort_values(by='rating', ascending=False)

# Select the top 500 movies
top_500 = sorted_df.head(500)

# Extract only the 'movieId' and 'tmdbId'
result = top_500[['title', 'tmdbId', 'rating']]

# Serialize the result using pickle
with open('top_500_movies.pkl', 'wb') as file:
    pickle.dump(result, file)

print("Data has been serialized to 'top_500_movies.pkl'")


Data has been serialized to 'top_500_movies.pkl'


In [27]:
result

Unnamed: 0,title,tmdbId,rating
3359,"Farmer's Wife, The",143750.0,10.000
4028,Promise Her Anything,265966.0,10.000
772,To Cross the Rubicon,277270.0,9.000
357,"Shawshank Redemption, The",278.0,8.704
1264,"Godfather, The",238.0,8.696
...,...,...,...
3202,Out of the Past,678.0,7.587
4447,"Longest Day, The",9289.0,7.587
1365,"Wizard of Oz, The",630.0,7.585
4777,"Raisin in the Sun, A",29478.0,7.585


In [5]:
import pandas as pd

# Assuming df is your existing DataFrame already loaded
df=pd.read_csv('interpret.csv')
# Load the movie data
movies_df = pd.read_csv('../Datasets/movie.csv')

# Extract year from the 'title' column
movies_df['year'] = movies_df['title'].str.extract('(\(\d{4}\))')

# Remove parentheses around the year
movies_df['year'] = movies_df['year'].str.replace('[()]', '', regex=True)

# Merge this with the existing DataFrame df
df = pd.merge(df, movies_df[['movieId', 'year']], on='movieId', how='left')

# Check the first few rows to verify
print(df.head())


   movieId                        title  \
0        1                    Toy Story   
1        2                      Jumanji   
2        3             Grumpier Old Men   
3        4            Waiting to Exhale   
4        5  Father of the Bride Part II   

                                        genres  imdbId   tmdbId  rating  year  
0  Adventure|Animation|Children|Comedy|Fantasy  114709    862.0   7.971  1995  
1                   Adventure|Children|Fantasy  113497   8844.0   7.240  1995  
2                               Comedy|Romance  113228  15602.0   6.500  1995  
3                         Comedy|Drama|Romance  114885  31357.0   6.300  1995  
4                                       Comedy  113041  11862.0   6.249  1995  


In [6]:
df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.971,1995
1,2,Jumanji,Adventure|Children|Fantasy,113497,8844.0,7.240,1995
2,3,Grumpier Old Men,Comedy|Romance,113228,15602.0,6.500,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357.0,6.300,1995
4,5,Father of the Bride Part II,Comedy,113041,11862.0,6.249,1995
...,...,...,...,...,...,...,...
18380,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18381,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18382,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18383,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,


In [7]:
df.to_csv('updated.csv',index=False)

In [16]:
import pandas as pd

# Assuming 'df' is your DataFrame already loaded with movieId, title, genres, imdbId, tmdbId, rating, and year
df=pd.read_csv('updated.csv')
# Convert 'year' to integers if it's not already

# Check for null values in the DataFrame
print("Null value count before removal:")
print(df.isnull().sum())

# Remove rows with any null values
df.dropna(inplace=True)

# Make sure the 'year' is a numeric column for accurate sorting
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Also ensure 'rating' is numeric if it hasn't been converted yet
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Sort the DataFrame by 'year' descending, then by 'rating' descending
sorted_movies = df.sort_values(by=['year', 'rating'], ascending=[False, False])

# After sorting, take the top 500 movies
top_500_sorted_movies = sorted_movies.head(500)

# Display or use the top 500 sorted movies
print("\nTop 500 sorted movies after removing null values:")
print(top_500_sorted_movies)


Empty DataFrame
Columns: [movieId, title, genres, imdbId, tmdbId, rating, year]
Index: []


In [19]:
df

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,rating,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.971,1995.0
1,2,Jumanji,Adventure|Children|Fantasy,113497,8844.0,7.240,1995.0
2,3,Grumpier Old Men,Comedy|Romance,113228,15602.0,6.500,1995.0
3,4,Waiting to Exhale,Comedy|Drama|Romance,114885,31357.0,6.300,1995.0
4,5,Father of the Bride Part II,Comedy,113041,11862.0,6.249,1995.0
...,...,...,...,...,...,...,...
18380,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18381,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18382,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,
18383,128734,Polskie gówno,Comedy|Musical,4438688,,0.000,


In [28]:
import pickle

# Load the dumped data from latest_movies.pkl
with open('../latest_movies.pkl', 'rb') as file:
    latest_movies = pickle.load(file)

# Load the dumped data from top_rated_movies.pkl
with open('./top_500_movies.pkl', 'rb') as file:
    top_rated_movies = pickle.load(file)

# Display or use the loaded data
print("Latest Movies:")
print(latest_movies)

print("\nTop Rated Movies:")
print(top_rated_movies)


Latest Movies:
       movieId                                  title   imdbId    tmdbId  year
26504   127160                   In Football We Trust  1780871  319074.0  2015
26502   127154                   The Mask You Live In  3983674  224972.0  2015
26505   127162                 Most Likely to Succeed  4267108  319078.0  2015
26506   127164            What Happened, Miss Simone?  4284010  318044.0  2015
26928   129428  The Second Best Exotic Marigold Hotel  2555736  268238.0  2015
...        ...                                    ...      ...       ...   ...
24494   115996                                 Stereo  3348102  256311.0  2014
24818   117340                On a marché sur Bangkok  3985434  295087.0  2014
24600   116413                          Life Partners  2870808  260001.0  2014
24875   117511                Hello Ladies: The Movie  3762944  303623.0  2014
24592   116397                     Stonehearst Asylum  1772264  207933.0  2014

[500 rows x 5 columns]

Top Rated Mo

In [29]:
latest_movies

Unnamed: 0,movieId,title,imdbId,tmdbId,year
26504,127160,In Football We Trust,1780871,319074.0,2015
26502,127154,The Mask You Live In,3983674,224972.0,2015
26505,127162,Most Likely to Succeed,4267108,319078.0,2015
26506,127164,"What Happened, Miss Simone?",4284010,318044.0,2015
26928,129428,The Second Best Exotic Marigold Hotel,2555736,268238.0,2015
...,...,...,...,...,...
24494,115996,Stereo,3348102,256311.0,2014
24818,117340,On a marché sur Bangkok,3985434,295087.0,2014
24600,116413,Life Partners,2870808,260001.0,2014
24875,117511,Hello Ladies: The Movie,3762944,303623.0,2014


In [30]:
top_rated_movies

Unnamed: 0,title,tmdbId,rating
3359,"Farmer's Wife, The",143750.0,10.000
4028,Promise Her Anything,265966.0,10.000
772,To Cross the Rubicon,277270.0,9.000
357,"Shawshank Redemption, The",278.0,8.704
1264,"Godfather, The",238.0,8.696
...,...,...,...
3202,Out of the Past,678.0,7.587
4447,"Longest Day, The",9289.0,7.587
1365,"Wizard of Oz, The",630.0,7.585
4777,"Raisin in the Sun, A",29478.0,7.585


In [23]:
latest = latest_movies['title'].tolist()

In [24]:
latest

['In Football We Trust',
 'The Mask You Live In',
 'Most Likely to Succeed',
 'What Happened, Miss Simone?',
 'The Second Best Exotic Marigold Hotel',
 'Power/Rangers',
 'Spring',
 'Jönssonligan - Den perfekta stöten',
 'Muck',
 'Advantageous',
 'The Bronze',
 'The D Train',
 'The Diary of a Teenage Girl',
 'Dope',
 'I Smile Back',
 'Me and Earl and the Dying Girl',
 'The Overnight',
 'People, Places, Things',
 'The Gunman',
 "Prophet's Prey",
 'Going Clear: Scientology and the Prison of Belief',
 'Songs My Brothers Taught Me',
 'The Hunting Ground',
 'SpongeBob Movie: Sponge Out of Water, The',
 'Last Days in the Desert',
 'Get Hard',
 "I'll See You in My Dreams",
 'Seoul Searching',
 'Mississippi Grind',
 'Mistress America',
 'Demonic',
 'Zipper',
 'A Walk in the Woods',
 'True Story',
 'Ten Thousand Saints',
 'Sleeping with Other People',
 'Justice League: Throne of Atlantis',
 'Beaver Trilogy Part IV',
 'Drunk Stoned Brilliant Dead: The Story of the National Lampoon',
 'Kurt Cobain

In [25]:
import pickle

with open('../latest.pkl', 'wb') as file:
    pickle.dump(latest, file)

In [31]:
top_rate=top_rated_movies['title'].tolist()

In [34]:
top_rate[0]

"Farmer's Wife, The"

In [33]:
import pickle

with open('../top_rate.pkl', 'wb') as file:
    pickle.dump(top_rate, file)

In [6]:
import pandas as pd

rating = pd.read_csv('../Datasets/rating.csv')


In [7]:
ratings=rating.sample(n=27000)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
10458149,72341,51540,3.0,2015-03-27 02:44:00
14251932,98434,2502,3.5,2005-01-23 04:19:37
256916,1763,7101,2.0,2005-11-06 23:59:51
19736345,136642,2529,3.5,2006-10-07 17:25:31
18186803,125794,46972,3.0,2007-04-29 05:43:27
...,...,...,...,...
19834435,137287,350,4.0,1996-05-24 03:35:43
7982883,55027,5952,5.0,2008-06-15 02:00:35
701432,4649,1179,3.0,1997-11-14 10:23:48
5621625,38694,1610,5.0,1999-11-29 20:05:45


In [8]:
user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

In [9]:
user_movie_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,115147,115149,115617,116797,116823,118246,118702,118760,118997,119141
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
import pickle
with open('../user_movie_matrix.pkl', 'wb') as file:
    pickle.dump(user_movie_matrix, file)

with open('../movies_data.pkl', 'rb') as file:
    movies_data=pickle.load(file)

In [17]:
import numpy as np
def recommend_movies_based_on_movie(movie_id, num_recommendations=14):
    # Find all users who watched the given movie
    users_who_watched = user_movie_matrix[user_movie_matrix[movie_id] > 0].index.tolist()
    
    # Get the subset of the matrix corresponding to these users
    subset_matrix = user_movie_matrix.loc[users_who_watched]
    
    # Count how many times each movie was watched by these users
    movie_popularity = subset_matrix.apply(np.sum, axis=0)
    
    # Remove the original movie from the list
    movie_popularity = movie_popularity.drop(movie_id)
    
    # Sort movies by the most watched
    recommended_movies = movie_popularity.sort_values(ascending=False).head(num_recommendations).index
    
    # Get movie titles
    recommended_movie_titles = movies_data[movies_data['movieId'].isin(recommended_movies)]['title'].tolist()
    
    return recommended_movie_titles

In [18]:
recommend_movies_based_on_movie(1)

['Johnny Mnemonic',
 'Little Women',
 'Godfather, The',
 'Last of the Mohicans, The',
 'U Turn',
 'Wag the Dog',
 'Stalag 17',
 'Man with the Golden Gun, The',
 'Lord of the Rings: The Return of the King, The',
 'Calendar Girls',
 'Fog of War: Eleven Lessons from the Life of Robert S. McNamara, The',
 'House of Sand and Fog',
 'Monster',
 'Kick-Ass']