# Model-based Collaborative Filtering Systems

## SVD Matrix Factorization

In [1]:
#import required libraries
import pandas as pd
import numpy as np

import sklearn
from sklearn.decomposition import TruncatedSVD  # TruncatedSVD is used for dimensionality reduction


In [2]:
# First look at the raw ratings file using read_csv's default settings, to see
# how the file is laid out before choosing parsing options.
frame = pd.read_csv(filepath_or_buffer="../data/ml-100k/u.data")
frame.head(n=5)

Unnamed: 0,196	242	3	881250949
0,186\t302\t3\t891717742
1,22\t377\t1\t878887116
2,244\t51\t2\t880606923
3,166\t346\t1\t886397596
4,298\t474\t4\t884182806


## Preparing the Data

The first read shows that the file is tab-separated and has no header row, so we re-read it with a tab separator and explicit column names.

In [3]:
# Re-read the ratings file as tab-separated text, supplying the column names
# explicitly via `names`.
columns = ["user_id", "item_id", "rating", "timestamp"]
frame = pd.read_csv(
    "../data/ml-100k/u.data",
    sep="\t",
    names=columns,
)
frame.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Column names for u.item: item metadata first, then the genre columns.
columns = [
    "item_id", "movie title", "release date", "video release date", "IMDb URL",
] + [
    "unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# The item file is pipe-delimited and latin-1 encoded.
movies = pd.read_csv(
    "../data/ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    names=columns,
)

# Keep only the id -> title lookup; the remaining columns are not used below.
movie_names = movies.loc[:, ["item_id", "movie title"]]
movie_names.head()


Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
# Attach each rating's movie title by joining on item_id (inner join, which is
# the pd.merge default, spelled out here for clarity).
# The title lookup is sliced straight from `movies` instead of going through
# `movie_names`, because `movie_names` is rebound to the utility-matrix column
# index later in the notebook; this keeps the cell re-runnable at any point.
combined = pd.merge(
    frame,
    movies[['item_id', 'movie title']],
    on='item_id',
    how='inner',
)
combined.head()

Unnamed: 0,user_id,item_id,rating,timestamp,movie title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [6]:
# Count how many ratings each item received and list the most-rated items first.
(
    combined
    .groupby("item_id")["rating"]
    .count()
    .sort_values(ascending=False)
    .head()
)

item_id
50     583
258    509
100    508
181    507
294    485
Name: rating, dtype: int64

The result shows that item_id 50 has the highest number of ratings. To see which movie that is, we filter on that id. Because many users have rated the same movie, the filter returns one row per rating, so we take the unique title instead of the full repeated list.

In [7]:
# Title of the most-rated item (item_id 50). The filter matches one row per
# rating, so unique() collapses the repeated title into a single entry.
combined.loc[combined["item_id"] == 50, "movie title"].unique()


array(['Star Wars (1977)'], dtype=object)

## Build Utility Matrix

* To see each user's rating for each movie, we build a utility matrix: one row per user, one column per movie title, with the rating as the cell value.

In [36]:
# Utility matrix: one row per user, one column per movie title, rating as value.
# Unrated (user, movie) pairs are filled with 0 up front, because the model
# fitting step below raises an error on NaN values.
# NOTE(review): pivoting on the title (not item_id) merges any items that share
# a title, combining their ratings with pivot_table's default aggregation
# (mean) — confirm that is intended.
rating_summary = combined.pivot_table(
    values='rating',
    index='user_id',
    columns='movie title',
    fill_value=0,
)
rating_summary.head(20)

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,2,5,0,0,3,4,0,0,...,0,0,0,5,3,0,0,0,4,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,2,0,0,0,0,4,0,0,...,0,0,0,4,0,0,0,0,4,0
6,0,0,0,4,0,0,0,5,0,0,...,0,0,0,4,0,0,0,0,0,0
7,0,0,0,4,0,0,5,5,0,4,...,0,0,0,5,3,0,3,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,5,0,0,0,5,0,4,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Dimensions of the utility matrix: (number of users, number of movie titles).
rating_summary.shape


(943, 1664)

## Transposing the matrix

* Next we transpose the matrix so that movie titles become the rows and user ids become the columns. TruncatedSVD reduces the column dimension, so this keeps one row per movie title and compresses the user dimension instead.

In [38]:
# Flip the utility matrix so each row is a movie title and each column a user.
transposed = rating_summary.transpose()
transposed.shape

(1664, 943)

## Decomposing the matrix using TruncatedSVD

In [52]:
# Reduce every movie's per-user rating vector to 10 components.
# random_state is fixed so repeated runs give the same decomposition.
decomposing_SVD = TruncatedSVD(
    n_components=10,
    random_state=1,
)

resultant_matrix = decomposing_SVD.fit_transform(transposed)
resultant_matrix.shape

(1664, 10)

## Generating a Correlation Matrix

In [54]:
# Pearson correlation between every pair of rows (movies) of the reduced
# matrix; rowvar=True is np.corrcoef's default and is spelled out for clarity.
corr_matrix = np.corrcoef(resultant_matrix, rowvar=True)
corr_matrix.shape


(1664, 1664)

## Isolating Star Wars From the Correlation Matrix

In [65]:
# Column labels of the utility matrix, i.e. the movie titles in the same order
# as the rows/columns of corr_matrix.
# NOTE: this rebinds `movie_names`, which earlier held the item_id/title frame.
movie_names = rating_summary.columns
movies_list = movie_names.tolist()

# Positional index of Star Wars within that title ordering.
star_wars = movies_list.index("Star Wars (1977)")
star_wars

1398

In [72]:
# Row of the correlation matrix for Star Wars: its correlation with every
# movie title. Indexed with the `star_wars` position computed above rather than
# a hard-coded number, so it stays correct if the data or column order changes.
corr_star_wars = corr_matrix[star_wars]
corr_star_wars.shape

(1664,)

## Recommending a highly correlated movie

In [76]:
# Titles whose correlation with Star Wars exceeds 0.9; the `< 1.0` bound is
# intended to leave out Star Wars itself.
movie_names[(corr_star_wars > 0.9) & (corr_star_wars < 1.0)].tolist()


['Aliens (1986)',
 'Blade Runner (1982)',
 'Die Hard (1988)',
 'Empire Strikes Back, The (1980)',
 'Fugitive, The (1993)',
 'Indiana Jones and the Last Crusade (1989)',
 "Jackie Chan's First Strike (1996)",
 'Raiders of the Lost Ark (1981)',
 'Return of the Jedi (1983)',
 'Star Trek: First Contact (1996)',
 'Strange Days (1995)',
 'Terminator 2: Judgment Day (1991)',
 'Terminator, The (1984)',
 'Toy Story (1995)']

In [81]:
# Same filter with a stricter threshold: only titles correlated above 0.95
# with Star Wars, again keeping the `< 1.0` bound meant to skip the movie itself.
movie_names[(corr_star_wars > 0.95) & (corr_star_wars < 1.0)].tolist()

['Return of the Jedi (1983)']