In [1]:
!pip install surprise
import os
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
movie_titles = pd.read_csv('/content/drive/MyDrive/recomendation_system/movie_titles.csv',encoding = 'ISO-8859-1',header = None,names = ['movie_id', 'year', 'movie_name']).set_index('movie_id')
movie_titles.head()


Unnamed: 0_level_0,year,movie_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [None]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          7
movie_name    0
dtype: int64

In [None]:
movie_titles['year'] = movie_titles['year'].fillna('')


In [None]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          0
movie_name    0
dtype: int64

In [None]:
#using content based filter 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
movie_titles['movie_name'] = movie_titles['movie_name'].fillna('')
tfidf_matrix = tfidf.fit_transform(movie_titles['movie_name'])
tfidf_matrix.shape

(17770, 11527)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape


(17770, 17770)

In [None]:
indices = pd.Series(movie_titles.index, index=movie_titles['movie_name']).drop_duplicates()


In [None]:
# Function that takes  movie name  and return most recommended movie to that name
#content based filter
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return movie_titles['movie_name'].iloc[movie_indices]


In [None]:
get_recommendations('Speed').head(10)


3513                    Full Speed
5978                 Speed of Life
2024               Legend of Speed
16194                  Speed: IMAX
6018         Speed: Bonus Material
2420     With All Deliberate Speed
3545                   Speed Racer
12148      Speed 2: Cruise Control
0                  Dinosaur Planet
Name: movie_name, dtype: object

In [None]:
#get all text files and store it in one csv file with movie_id  and all user data 
if not os.path.isfile('/content/drive/MyDrive/recomendation_system/data.csv'):
    # Create a file 'data.csv' before reading it
    data = open('/content/drive/MyDrive/recomendation_system/data.csv', mode='w')
    
    row = list()
    files = [
        '/content/drive/MyDrive/recomendation_system/combined_data_1.txt',
        '/content/drive/MyDrive/recomendation_system/combined_data_2.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_3.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_4.txt'
    ]
    for file in files:
        print("Reading ratings from {}\n".format(file))
        with open(file) as f:
            for line in f: 
                line = line.strip()
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
    data.close()


Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_1.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_2.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_3.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_4.txt



In [4]:
user_data=pd.read_csv('/content/drive/MyDrive/recomendation_system/data.csv', sep=',',names=['movie_id', 'user', 'rating', 'date'])
user_data.head()

Unnamed: 0,movie_id,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [None]:
user_data.isnull().sum().sort_values(ascending = False)

movie_id    0
user        0
rating      0
date        0
dtype: int64

In [5]:
#merge data with movie title to get movie name in same dataframe
df = pd.merge(user_data,movie_titles,on='movie_id')


In [6]:
df.head()

Unnamed: 0,movie_id,user,rating,date,year,movie_name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [None]:
display(df.describe().transpose())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movie_id,100480507.0,9070.915,5131.890697,1.0,4677.0,9051.0,13635.0,17770.0
user,100480507.0,1322489.0,764536.800275,6.0,661198.0,1319012.0,1984455.0,2649429.0
rating,100480507.0,3.60429,1.085219,1.0,3.0,4.0,4.0,5.0
year,100479542.0,1993.912,12.400832,1896.0,1990.0,1998.0,2002.0,2005.0


In [6]:
#mean rating for movies
df.groupby('movie_name')['rating'].mean().head()

movie_name
'Allo 'Allo!: Series 1                    3.664319
'Allo 'Allo!: Series 2                    3.970968
'Allo 'Allo!: Series 3                    3.933333
'N Sync: 'N the Mix                       2.655936
'N Sync: Live at Madison Square Garden    2.890884
Name: rating, dtype: float64

In [10]:
df.groupby('movie_name')['rating'].mean().sort_values(ascending=False).head()

movie_name
Lord of the Rings: The Return of the King: Extended Edition            4.723270
The Lord of the Rings: The Fellowship of the Ring: Extended Edition    4.716611
Lord of the Rings: The Two Towers: Extended Edition                    4.702611
Lost: Season 1                                                         4.670989
Battlestar Galactica: Season 1                                         4.638809
Name: rating, dtype: float64

In [11]:
#total number of rating
df.groupby('movie_name')['rating'].count().sort_values(ascending=False).head()

movie_name
Miss Congeniality         232944
Independence Day          216596
The Patriot               211764
The Godfather             206551
The Day After Tomorrow    196397
Name: rating, dtype: int64

In [6]:
#due to crash session i will work with first 1000000 record
df2= df.head(1000000)


In [8]:
#using collobrative filter 
#memory based collobrative filter
#item based filter
from scipy import sparse
userRatings = df2.pivot_table(index=['user'],columns=['movie_name'],values='rating')
userRatings.head()

movie_name,6ixtynin9,7 Seconds,8 Man,A Fishy Story,A Killer Within,A Little Princess,A Yank in the R.A.F.,ABC Primetime: Mel Gibson's The Passion of the Christ,Adam-12: Season 1,Airplane II: The Sequel,...,WWE: Armageddon 2003,WWE: Royal Rumble 2005,We're Not Married,What the #$*! Do We Know!?,Where Sleeping Dogs Lie,Winston Churchill: The Wilderness Years,Winter Kills,X2: X-Men United,Yellow,Zatoichi's Conspiracy
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,2.0,,
7,,,,,,,,,,,...,,,,5.0,,,,4.0,,
10,,,,,,,,,,,...,,,,,,,,4.0,,
25,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,


In [9]:
userRatings = userRatings.fillna(0,axis=1)


In [10]:
userRatings.head()

movie_name,6ixtynin9,7 Seconds,8 Man,A Fishy Story,A Killer Within,A Little Princess,A Yank in the R.A.F.,ABC Primetime: Mel Gibson's The Passion of the Christ,Adam-12: Season 1,Airplane II: The Sequel,...,WWE: Armageddon 2003,WWE: Royal Rumble 2005,We're Not Married,What the #$*! Do We Know!?,Where Sleeping Dogs Lie,Winston Churchill: The Wilderness Years,Winter Kills,X2: X-Men United,Yellow,Zatoichi's Conspiracy
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
corrMatrix = userRatings.corr(method='pearson')
corrMatrix.head(10)


movie_name,6ixtynin9,7 Seconds,8 Man,A Fishy Story,A Killer Within,A Little Princess,A Yank in the R.A.F.,ABC Primetime: Mel Gibson's The Passion of the Christ,Adam-12: Season 1,Airplane II: The Sequel,...,WWE: Armageddon 2003,WWE: Royal Rumble 2005,We're Not Married,What the #$*! Do We Know!?,Where Sleeping Dogs Lie,Winston Churchill: The Wilderness Years,Winter Kills,X2: X-Men United,Yellow,Zatoichi's Conspiracy
movie_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6ixtynin9,1.0,0.005769,0.029253,0.034289,0.007694,0.006521,0.01869,0.002271,0.021857,0.01389,...,0.015448,0.008543,0.016404,0.017413,0.029546,0.013838,0.033924,0.00968,0.037603,0.037869
7 Seconds,0.005769,1.0,0.011479,0.002743,0.055554,0.001561,0.007845,0.031864,0.017133,0.005242,...,0.010989,0.019657,0.000729,-0.007585,0.007947,0.000417,0.005327,0.025752,0.002071,0.008588
8 Man,0.029253,0.011479,1.0,0.049668,0.024884,0.010505,0.042562,0.023396,0.051195,0.014381,...,0.045724,0.026252,0.028034,0.002728,0.086146,0.024376,0.039317,0.007017,0.042189,0.036826
A Fishy Story,0.034289,0.002743,0.049668,1.0,0.033264,0.007921,0.041019,0.003694,0.020925,0.008674,...,0.019676,0.018338,0.038551,0.004337,0.046455,0.020023,0.03031,0.000967,0.046743,0.019042
A Killer Within,0.007694,0.055554,0.024884,0.033264,1.0,0.003874,0.023,0.013514,0.015591,0.003434,...,0.021976,0.017007,0.021691,0.002855,0.024647,0.00502,0.01273,-0.001552,0.013423,0.007981
A Little Princess,0.006521,0.001561,0.010505,0.007921,0.003874,1.0,0.031392,0.05959,0.021613,0.045705,...,0.007555,0.007764,0.017847,0.006815,0.014472,0.005065,0.006383,0.07377,0.002401,0.00365
A Yank in the R.A.F.,0.01869,0.007845,0.042562,0.041019,0.023,0.031392,1.0,0.021738,0.04322,0.026272,...,0.02941,0.017787,0.104299,0.00209,0.05013,0.041045,0.040686,0.007647,0.020027,0.025259
ABC Primetime: Mel Gibson's The Passion of the Christ,0.002271,0.031864,0.023396,0.003694,0.013514,0.05959,0.021738,1.0,0.038229,0.023033,...,0.016147,0.019079,0.004466,-0.002144,0.01411,0.007606,0.007664,0.027432,0.005008,0.007607
Adam-12: Season 1,0.021857,0.017133,0.051195,0.020925,0.015591,0.021613,0.04322,0.038229,1.0,0.027631,...,0.041713,0.029743,0.027146,0.006091,0.043176,0.029251,0.02835,0.010067,0.023086,0.03935
Airplane II: The Sequel,0.01389,0.005242,0.014381,0.008674,0.003434,0.045705,0.026272,0.023033,0.027631,1.0,...,0.012548,0.019532,0.011126,0.018738,0.016606,0.00585,0.017941,0.149887,0.006786,0.016525


In [12]:
def get_recommend_movie(movie_name):
    similar_ratings = corrMatrix[movie_name]
    similar_ratings = similar_ratings.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_ratings

In [19]:
recomended_movie = get_recommend_movie('8 Man')


In [20]:
print(recomended_movie.head(10))

movie_name
8 Man                      1.000000
Barbarian Queen 2          0.101700
Iron Monkey 2              0.086235
Where Sleeping Dogs Lie    0.086146
Sam the Iron Bridge        0.077875
Spirit Lost                0.066670
ECW: Cyberslam '99         0.061064
Arachnid                   0.060770
Onmyoji                    0.059592
Horror Vision              0.059460
Name: 8 Man, dtype: float64


In [7]:
#using collobrative filter 
#memory based collobrative filter
#user based filter
user_rating = df2.pivot_table(index = 'user', columns = 'movie_name', values = 'rating', fill_value=0)
user_rating.head(5)

movie_name,6ixtynin9,7 Seconds,8 Man,A Fishy Story,A Killer Within,A Little Princess,A Yank in the R.A.F.,ABC Primetime: Mel Gibson's The Passion of the Christ,Adam-12: Season 1,Airplane II: The Sequel,...,WWE: Armageddon 2003,WWE: Royal Rumble 2005,We're Not Married,What the #$*! Do We Know!?,Where Sleeping Dogs Lie,Winston Churchill: The Wilderness Years,Winter Kills,X2: X-Men United,Yellow,Zatoichi's Conspiracy
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,5,0,0,0,4,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
user_rating = user_rating.T
user_rating.head()


user,6,7,10,25,33,42,59,79,87,94,...,2649370,2649375,2649376,2649378,2649388,2649401,2649404,2649409,2649426,2649429
movie_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6ixtynin9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7 Seconds,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
8 Man,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A Fishy Story,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A Killer Within,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
import sklearn
from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(user_rating)
decomposed_matrix.shape

(225, 10)

In [13]:
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape


(225, 225)

In [37]:
#method take movie name and return recommended films
def get_recommend(movie_name):
   movie_names = list(user_rating.index)
   movie_id = movie_names.index(movie_name)
   Recommend_film = list(user_rating.index[correlation_matrix[movie_id] > 0.80])
   return Recommend_film[0:10]


In [38]:
get_recommend('6ixtynin9')

['6ixtynin9',
 '8 Man',
 'Airplane II: The Sequel',
 'Antarctica: IMAX',
 'Arliss: The Best of Arliss',
 "Ashtanga Yoga: Beginner's Practice with Nicki Doane",
 'At Home Among Strangers',
 'Bear Cub',
 'Billy Blanks: Tae Bo: Cardio Circuit 1',
 'Bollywood Bound']

In [None]:
#using collabrative filter to find recommended films to specific user
#model based collobrative filtering
#using surbrise library
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(df2[['movie_id', 'user', 'rating']], reader)

In [None]:
trainset = data.build_full_trainset()
svd = SVD()


In [None]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c00962f10>

In [None]:
df_movies_cpy= df2.copy()

In [None]:
df_movies_cpy = df_movies_cpy.reset_index(drop=False)
df_movies_cpy.head()

Unnamed: 0,index,movie_id,user,rating,date,year,movie_name
0,0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [None]:
df_movies_cpy.shape 

(10000000, 7)

In [None]:
#method take userId and return the recommendation movie using surbrise library
def get_recommend_foruser(userId):
    df_movies_cpy['Estimate'] = df_movies_cpy['index'].apply(lambda x: svd.predict(userId,x).est)
    df_new_frame=df_movies_cpy.drop(['user', 'date','year','movie_id','rating','index'],axis=1)
    return df_new_frame.sort_values('Estimate', ascending=False).head(10)

In [None]:
get_recommend_foruser(904250)

Unnamed: 0,movie_name,Estimate
844526,Taking Lives,5.0
2307226,Mississippi Burning,5.0
447759,Lucio Fulci: The Beyond,5.0
642384,Reservoir Dogs,5.0
27061,7 Seconds,5.0
1732425,Dogma,5.0
2304126,Mississippi Burning,5.0
1860235,The Taming of the Shrew,5.0
794999,X2: X-Men United,5.0
293718,Congo,5.0
