In [1]:
!pip install surprise
import os
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 32.9 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633980 sha256=ed4f26867efca70635eb61ae25f0a831a527d1048cdfd7e2ff9e946f9cea27ab
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
movie_titles = pd.read_csv('/content/drive/MyDrive/recomendation_system/movie_titles.csv',encoding = 'ISO-8859-1',header = None,names = ['movie_id', 'year', 'movie_name']).set_index('movie_id')
movie_titles.head()


Unnamed: 0_level_0,year,movie_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movie_titles['movie_name'] = movie_titles['movie_name'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movie_titles['movie_name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(17770, 11527)

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape


(17770, 17770)

In [None]:
indices = pd.Series(movie_titles.index, index=movie_titles['movie_name']).drop_duplicates()


In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return movie_titles['movie_name'].iloc[movie_indices]


In [None]:
get_recommendations('Speed').head(10)


3513                    Full Speed
5978                 Speed of Life
2024               Legend of Speed
16194                  Speed: IMAX
6018         Speed: Bonus Material
2420     With All Deliberate Speed
3545                   Speed Racer
12148      Speed 2: Cruise Control
0                  Dinosaur Planet
Name: movie_name, dtype: object

In [None]:
#get all text files and store it in one csv file with movie_id  and all user data 
if not os.path.isfile('/content/drive/MyDrive/recomendation_system/data.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in the dataset and store them in one big file ('data.csv')
    # We're reading from each of the four files and appending each rating to a global file 'data.csv'
    data = open('/content/drive/MyDrive/recomendation_system/data.csv', mode='w')
    
    row = list()
    files = [
        '/content/drive/MyDrive/recomendation_system/combined_data_1.txt',
        '/content/drive/MyDrive/recomendation_system/combined_data_2.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_3.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_4.txt'
    ]
    for file in files:
        print("Reading ratings from {}\n".format(file))
        with open(file) as f:
            for line in f: 
                line = line.strip()
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
    data.close()


Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_1.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_2.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_3.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_4.txt



In [None]:
user_data=pd.read_csv('/content/drive/MyDrive/recomendation_system/data.csv', sep=',',names=['movie_id', 'user', 'rating', 'date'])
user_data.head()

Unnamed: 0,movie_id,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [None]:
user_data.isnull().sum().sort_values(ascending = False)

movie_id    0
user        0
rating      0
date        0
dtype: int64

In [None]:
#merge data with movie title to get movie name in same csv
df = pd.merge(user_data,movie_titles,on='movie_id')


In [None]:
df.head()

Unnamed: 0,movie_id,user,rating,date,year,movie_name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [None]:
# method to find the best rated movie for specific user
def user_fav_film(user_id):
  df_user=df[(df['user'] == user_id) & (df['rating'] == 5)]
  print(df_user['movie_name'])
   

In [None]:
user_fav_film(822109)

1                                             Dinosaur Planet
3014052                                       American Beauty
3216157                                                 Speed
4855982                                             The Mummy
5645341                                  Fried Green Tomatoes
6204109                                           Man on Fire
9688031     Pirates of the Caribbean: The Curse of the Bla...
15632078                  T-Rex: Back to the Cretaceous: IMAX
15914259                                                Ghost
18004711                                        Forever Young
18463157                                              Beaches
22714709                                      The Sixth Sense
22936652                              While You Were Sleeping
24335089                                      Steel Magnolias
27313183                               Angels in the Outfield
28744527                                    Miss Congeniality
32555990

In [None]:
#because the session crash every  time i try work with one text file 
if not os.path.isfile('/content/drive/MyDrive/recomendation_system/data_1.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in the dataset and store them in one big file ('data.csv')
    # We're reading from each of the four files and appending each rating to a global file 'data.csv'
    data = open('/content/drive/MyDrive/recomendation_system/data_1.csv', mode='w')
    
    row = list()
    files = [
        '/content/drive/MyDrive/recomendation_system/combined_data_1.txt',
      
    ]
    for file in files:
        print("Reading ratings from {}\n".format(file))
        with open(file) as f:
            for line in f: 
                line = line.strip()
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
    data.close()


Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_1.txt



In [4]:
df1=pd.read_csv('/content/drive/MyDrive/recomendation_system/data_1.csv', sep=',',names=['movie_id', 'user', 'rating', 'date'])
df1.head()

Unnamed: 0,movie_id,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [5]:
#merge the movie title with csv file to put movie name in the same csv file
df2 = pd.merge(df1,movie_titles,on='movie_id')


In [6]:
df2.head()

Unnamed: 0,movie_id,user,rating,date,year,movie_name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [7]:
#using collabrative filter to find recommended films to specific user
#using surbrise library
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(df2[['movie_id', 'user', 'rating']], reader)

In [8]:
trainset = data.build_full_trainset()
svd = SVD()


In [9]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe77b10b4d0>

In [10]:
df_movies3= df2.copy()

In [14]:
df_movies3 = df_movies3.reset_index(drop=False)
df_movies3.head()


Unnamed: 0,index,movie_id,user,rating,date,year,movie_name
0,0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [16]:
#method take userId and return the recommendation movie using surbrise library
def collaborative(userId):
    df_movies3['est'] = df_movies3['index'].apply(lambda x: svd.predict(userId,x).est)
    return df_movies3.sort_values('est', ascending=False).head(10)


In [17]:
collaborative(904250)

Unnamed: 0,index,movie_id,user,rating,date,year,movie_name,est
1366229,1366229,295,2553437,4,2005-02-12,1995.0,Ace Ventura: When Nature Calls,5.0
2304126,2304126,442,1019937,4,2005-04-05,1988.0,Mississippi Burning,5.0
2496577,2496577,463,1391586,4,2003-01-19,1962.0,The Twilight Zone: Vol. 12,5.0
882170,882170,197,1011393,5,2004-12-17,2004.0,Taking Lives,5.0
1745577,1745577,330,1224253,3,2005-01-27,1998.0,Wild Things,5.0
1925900,1925900,357,566587,1,2005-06-20,2003.0,House of Sand and Fog,5.0
1482568,1482568,311,429493,4,2005-10-21,1994.0,Ed Wood,5.0
1309838,1309838,290,447410,4,2005-01-19,2004.0,Harold and Kumar Go to White Castle,5.0
1928368,1928368,357,1830820,5,2005-10-17,2003.0,House of Sand and Fog,5.0
1342568,1342568,290,126257,3,2005-03-21,2004.0,Harold and Kumar Go to White Castle,5.0
