In [1]:
!pip install surprise
import os
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 14.7 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633964 sha256=a4112f983acf58e8494f9954f85f1af2a9ec2f7859e34a94e2fdec870a509090
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
movie_titles = pd.read_csv('/content/drive/MyDrive/recomendation_system/movie_titles.csv',encoding = 'ISO-8859-1',header = None,names = ['movie_id', 'year', 'movie_name']).set_index('movie_id')
movie_titles.head()


Unnamed: 0_level_0,year,movie_name
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [4]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          7
movie_name    0
dtype: int64

In [5]:
movie_titles['year'] = movie_titles['year'].fillna('')


In [6]:
movie_titles.isnull().sum().sort_values(ascending = False)

year          0
movie_name    0
dtype: int64

In [None]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movie_titles['movie_name'] = movie_titles['movie_name'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movie_titles['movie_name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(17770, 11527)

In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
cosine_sim.shape


(17770, 17770)

In [None]:
indices = pd.Series(movie_titles.index, index=movie_titles['movie_name']).drop_duplicates()


In [None]:
# Function that takes  movie name  and return most recommended movie to that name
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:10]
    movie_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar movies
    return movie_titles['movie_name'].iloc[movie_indices]


In [None]:
get_recommendations('Speed').head(10)


3513                    Full Speed
5978                 Speed of Life
2024               Legend of Speed
16194                  Speed: IMAX
6018         Speed: Bonus Material
2420     With All Deliberate Speed
3545                   Speed Racer
12148      Speed 2: Cruise Control
0                  Dinosaur Planet
Name: movie_name, dtype: object

In [None]:
#get all text files and store it in one csv file with movie_id  and all user data 
if not os.path.isfile('/content/drive/MyDrive/recomendation_system/data.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in the dataset and store them in one big file ('data.csv')
    # We're reading from each of the four files and appending each rating to a global file 'data.csv'
    data = open('/content/drive/MyDrive/recomendation_system/data.csv', mode='w')
    
    row = list()
    files = [
        '/content/drive/MyDrive/recomendation_system/combined_data_1.txt',
        '/content/drive/MyDrive/recomendation_system/combined_data_2.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_3.txt', 
        '/content/drive/MyDrive/recomendation_system/combined_data_4.txt'
    ]
    for file in files:
        print("Reading ratings from {}\n".format(file))
        with open(file) as f:
            for line in f: 
                line = line.strip()
                if line.endswith(':'):
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
    data.close()


Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_1.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_2.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_3.txt

Reading ratings from /content/drive/MyDrive/recomendation_system/combined_data_4.txt



In [7]:
user_data=pd.read_csv('/content/drive/MyDrive/recomendation_system/data.csv', sep=',',names=['movie_id', 'user', 'rating', 'date'])
user_data.head()

Unnamed: 0,movie_id,user,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [8]:
user_data.isnull().sum().sort_values(ascending = False)

movie_id    0
user        0
rating      0
date        0
dtype: int64

In [9]:
#merge data with movie title to get movie name in same csv
df = pd.merge(user_data,movie_titles,on='movie_id')


In [10]:
df.head()

Unnamed: 0,movie_id,user,rating,date,year,movie_name
0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [38]:
#due to crash session i will work with first 1000000 record
df2= df.head(10000000)


In [39]:
#using collabrative filter to find recommended films to specific user
#using surbrise library
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(df2[['movie_id', 'user', 'rating']], reader)

In [40]:
trainset = data.build_full_trainset()
svd = SVD()


In [41]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2c00962f10>

In [42]:
df_movies_cpy= df2.copy()

In [43]:
df_movies_cpy = df_movies_cpy.reset_index(drop=False)
df_movies_cpy.head()

Unnamed: 0,index,movie_id,user,rating,date,year,movie_name
0,0,1,1488844,3,2005-09-06,2003.0,Dinosaur Planet
1,1,1,822109,5,2005-05-13,2003.0,Dinosaur Planet
2,2,1,885013,4,2005-10-19,2003.0,Dinosaur Planet
3,3,1,30878,4,2005-12-26,2003.0,Dinosaur Planet
4,4,1,823519,3,2004-05-03,2003.0,Dinosaur Planet


In [44]:
df_movies_cpy.shape 

(10000000, 7)

In [45]:
#method take userId and return the recommendation movie using surbrise library
def collaborative(userId):
    df_movies_cpy['Estimate'] = df_movies_cpy['index'].apply(lambda x: svd.predict(userId,x).est)
    df_new_frame=df_movies_cpy.drop(['user', 'date','year','movie_id','rating','index'],axis=1)
    return df_new_frame.sort_values('Estimate', ascending=False).head(10)

In [46]:
collaborative(904250)

Unnamed: 0,movie_name,Estimate
844526,Taking Lives,5.0
2307226,Mississippi Burning,5.0
447759,Lucio Fulci: The Beyond,5.0
642384,Reservoir Dogs,5.0
27061,7 Seconds,5.0
1732425,Dogma,5.0
2304126,Mississippi Burning,5.0
1860235,The Taming of the Shrew,5.0
794999,X2: X-Men United,5.0
293718,Congo,5.0
