In [1]:
!pip install surprise
import os
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
movie_titles = pd.read_csv('/content/drive/MyDrive/recomendation_system/movie_titles.csv',encoding = 'ISO-8859-1',header = None,names = ['movie_id', 'year', 'movie_name']).set_index('movie_id')
movie_titles.head()


Unnamed: 0_level_0,year,movie_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [None]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          7
movie_name    0
dtype: int64

In [None]:
movie_titles['year'] = movie_titles['year'].fillna('')


In [None]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          0
movie_name    0
dtype: int64

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movie_titles['movie_name'] = movie_titles['movie_name'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movie_titles['movie_name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(17770, 11527)

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape


(17770, 17770)

In [None]:
indices = pd.Series(movie_titles.index, index=movie_titles['movie_name']).drop_duplicates()


In [None]:
# Function that takes  movie name  and return most recommended movie to that name
#using content based filter
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return movie_titles['movie_name'].iloc[movie_indices]


In [None]:
get_recommendations('Speed').head(10)


3513                    Full Speed
5978                 Speed of Life
2024               Legend of Speed
16194                  Speed: IMAX
6018         Speed: Bonus Material
2420     With All Deliberate Speed
3545                   Speed Racer
12148      Speed 2: Cruise Control
0                  Dinosaur Planet
Name: movie_name, dtype: object

In [None]:
#get all text files and store it in one csv file with movie_id  and all user data 
if not os.path.isfile('/content/drive/MyDrive/recomendation_system/data.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in the dataset and store them in one big file ('data.csv')
    # We're reading from each of the four files and appending each rating to a global file 'data.csv'
    data = open('/content/drive/MyDrive/recomendation_system/data.csv', mode='w')
    
    row = list()
    files = [
        '/content/drive/MyDrive/recomendation_system/combined_data_1.txt',
        '/content/drive/MyDrive/recomendation_system/combined_data_2.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_3.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_4.txt'
    ]
    for file in files:
        print("Reading ratings from {}\n".format(file))
        with open(file) as f:
            for line in f: 
                line = line.strip()
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
    data.close()


Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_1.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_2.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_3.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_4.txt



In [4]:
user_data=pd.read_csv('/content/drive/MyDrive/recomendation_system/data.csv', sep=',',names=['movie_id', 'user', 'rating', 'date'])
user_data.head()

Unnamed: 0,movie_id,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [None]:
user_data.isnull().sum().sort_values(ascending = False)

movie_id    0
user        0
rating      0
date        0
dtype: int64

In [5]:
#merge data with movie title to get movie name in same csv
df = pd.merge(user_data,movie_titles,on='movie_id')


In [7]:
df.head()

Unnamed: 0,movie_id,user,rating,date,year,movie_name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [10]:
display(df.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movie_id,100480507.0,9070.915,5131.890697,1.0,4677.0,9051.0,13635.0,17770.0
user,100480507.0,1322489.0,764536.800275,6.0,661198.0,1319012.0,1984455.0,2649429.0
rating,100480507.0,3.60429,1.085219,1.0,3.0,4.0,4.0,5.0
year,100479542.0,1993.912,12.400832,1896.0,1990.0,1998.0,2002.0,2005.0


In [11]:
#due to crash session i will work with first 1000000 record
df2= df.head(10000000)


In [None]:
#using collabrative filter to find recommended films to specific user
#using surbrise library
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(df2[['movie_id', 'user', 'rating']], reader)

In [None]:
trainset = data.build_full_trainset()
svd = SVD()


In [None]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c00962f10>

In [None]:
df_movies_cpy= df2.copy()

In [None]:
df_movies_cpy = df_movies_cpy.reset_index(drop=False)
df_movies_cpy.head()

Unnamed: 0,index,movie_id,user,rating,date,year,movie_name
0,0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [None]:
df_movies_cpy.shape 

(10000000, 7)

In [None]:
#method take userId and return the recommendation movie using surbrise library
def get_recommend_foruser(userId):
    df_movies_cpy['Estimate'] = df_movies_cpy['index'].apply(lambda x: svd.predict(userId,x).est)
    df_new_frame=df_movies_cpy.drop(['user', 'date','year','movie_id','rating','index'],axis=1)
    return df_new_frame.sort_values('Estimate', ascending=False).head(10)

In [None]:
get_recommend_foruser(904250)

Unnamed: 0,movie_name,Estimate
844526,Taking Lives,5.0
2307226,Mississippi Burning,5.0
447759,Lucio Fulci: The Beyond,5.0
642384,Reservoir Dogs,5.0
27061,7 Seconds,5.0
1732425,Dogma,5.0
2304126,Mississippi Burning,5.0
1860235,The Taming of the Shrew,5.0
794999,X2: X-Men United,5.0
293718,Congo,5.0


In [6]:
#due to crash session i will work with first 1000000 record
df2= df.head(1000000)

In [None]:
#another way to recommend movie using correlattion matrix
#collabritive filter based on correlattion matrix
df_user_ratings = df2.pivot_table(index='user', columns=['movie_name'], values='rating')
df_user_ratings.head(3)

movie_name,6ixtynin9,7 Seconds,8 Man,A Fishy Story,A Killer Within,A Little Princess,A Yank in the R.A.F.,ABC Primetime: Mel Gibson's The Passion of the Christ,Adam-12: Season 1,Airplane II: The Sequel,...,WWE: Armageddon 2003,WWE: Royal Rumble 2005,We're Not Married,What the #$*! Do We Know!?,Where Sleeping Dogs Lie,Winston Churchill: The Wilderness Years,Winter Kills,X2: X-Men United,Yellow,Zatoichi's Conspiracy
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,2.0,,
7,,,,,,,,,,,...,,,,5.0,,,,4.0,,
10,,,,,,,,,,,...,,,,,,,,4.0,,


In [None]:
#method take movie name and recommend movies depend on user rating that film
def movie_recommender(movie):
  rated_movie = df_user_ratings[movie]
  similar_movies = df_user_ratings.corrwith(rated_movie)
  similar_movies.dropna(inplace=True)
  similar_movies = pd.DataFrame(similar_movies, columns=['correlation'])
  return similar_movies.sort_values(by='correlation', ascending=False).head(10)





In [None]:
movie_recommender('8 Man')

Unnamed: 0_level_0,correlation
movie_name,Unnamed: 1_level_1
One Last Dance,1.0
8 Man,1.0
MTV: Making the Band 2: Best of Season 1,0.969891
Complete Shamanic Princess,0.961375
WWE: Armageddon 2003,0.935414
Tai Chi: The 24 Forms,0.933948
Yellow,0.923579
Allergies: A Natural Approach,0.923579
Drowning on Dry Land,0.923381
Goddess of Mercy,0.918085
