#####   
# Recommendation System (Collaborative Filtering & KNN)
#####   

# 1) Loading Libraries

In [1]:
import numpy  as np
import pandas as pd
pd.set_option('display.max_rows',2000)
pd.set_option('display.max_columns',500)

from matplotlib import pyplot as plt
import seaborn as sns

import datetime
from   datetime import date

#####   
# 2) Loading and Exploring the Data

In [2]:
# Load the movie metadata and preview the first rows.
# low_memory=False makes pandas parse the file in a single pass and infer each
# column's dtype from all of its rows. With the default chunked parsing, a column
# whose chunks infer different types raises a DtypeWarning; the recorded output of
# this cell echoes the read_csv line, presumably from that warning.
movies_details = pd.read_csv('movies_metadata.csv', low_memory=False)
movies_details.head()

  movies_details = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [7]:
# Work on a copy so that filling the 'overview' column further down does not modify movies_details.
# NOTE(review): df_merged is rebound later by the movies/ratings merge, which discards this copy.
df_merged = movies_details.copy()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# TF-IDF vectoriser for the movie overviews:
#   min_df=3            - ignore terms that occur in fewer than 3 documents
#   max_features=5      - keep only the 5 most frequent terms of the corpus
#   strip_accents       - strip accents via Unicode normalisation
#   token_pattern       - a token is any run of one or more word characters
#   ngram_range=(1,3)   - build unigrams, bigrams and trigrams
#   stop_words          - drop scikit-learn's built-in English stop words
# NOTE(review): max_features=5 yields a 5-column matrix, which carries very little
# signal for a text-similarity model - confirm this cap is intentional.
tfv = TfidfVectorizer( min_df=3, max_features=5, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range=(1,3), stop_words='english')

In [23]:
# Replace missing overviews with an empty string so every row can be vectorised.
df_merged['overview'] = df_merged['overview'].fillna('')

In [24]:
# Fit the vectoriser on the overviews and build the document-term TF-IDF matrix (one row per overview).
tfv_matrix = tfv.fit_transform(df_merged['overview'])

In [25]:
# (number of overviews, number of TF-IDF features kept)
tfv_matrix.shape

(45466, 5)

In [26]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [27]:
# Sigmoid kernel between every pair of rows of the TF-IDF matrix.
# NOTE(review): the recorded run of this cell ended in a MemoryError. The kernel holds
# one value per pair of rows (45,466 x 45,466 for the matrix shape shown above), so it
# is unlikely to fit in memory as written; consider scoring one query row against the
# matrix on demand instead. `sig` is not used by any later cell in this notebook.
sig = sigmoid_kernel(tfv_matrix,tfv_matrix)
sig

MemoryError: Unable to allocate 2.88 GiB for an array with shape (387177214,) and data type float64

In [3]:
# Load the user ratings and preview the last rows.
ratings_details = pd.read_csv('ratings.csv')
ratings_details.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [None]:
# Attach each rating to its movie record by joining the two tables on 'movieId'.
# NOTE(review): this rebinds df_merged, replacing the copy of movies_details that the
# TF-IDF cells worked on.
# NOTE(review): the preview of movies_metadata.csv shows an 'id' column but no
# 'movieId', so this merge presumably raises a KeyError with that file. The later cells
# also look up titles such as 'Sin City (2005)', while the previewed titles carry no
# year suffix. Confirm which movie table (and which id mapping) is intended here.
df_merged = movies_details.merge( ratings_details, on='movieId')
df_merged.tail()

#####   
# 3) Extracting New Columns

In [None]:
# Extracting Year from Timestamp
# 'timestamp' holds Unix-epoch seconds; convert the whole column at once instead of
# looping row by row. This also removes the per-row label lookup
# df_merged['timestamp'][i], which only works while the index is exactly 0..n-1.
# The year is taken in UTC, so the result no longer depends on the time zone of the
# machine running the notebook (datetime.fromtimestamp converts to local time).
df_merged['timestamp_year'] = pd.to_datetime(df_merged['timestamp'], unit='s').dt.year
df_merged.tail()

#####   
# 4) Grouping & Combining
 - Group1: **Title** & **Rating (average)**  
 - Group2: **Title** & **Rating (count)** 
 - Combo : **Group1** + **Group2**

In [None]:
# Average rating per title, highest average first
G = (
    df_merged
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
)

# Keep an independent copy under a descriptive name; G is reused as scratch below
G_rating_mean = G.copy()
G_rating_mean

In [None]:
# Number of ratings per title, most-rated first
G = (
    df_merged
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
)

# Keep an independent copy under a descriptive name
G_rating_count = G.copy()
G_rating_count

In [None]:
# Combining the Groups
# The mean-rating series seeds the frame, so it fixes the row order (titles sorted by
# mean rating); the counts are then aligned to those rows by title.
df_Grouped = G_rating_mean.to_frame('rating_mean')
df_Grouped['rating_count'] = G_rating_count

df_Grouped.tail()

In [None]:
# Weighting the Rating
# Each title's score is a weighted average of two values:
#   - its own mean rating, weighted by how many ratings it received
#   - the mean of all per-title mean ratings, weighted by the 70th-percentile rating count
# Titles with few ratings are therefore pulled towards the overall mean.
counts = df_Grouped['rating_count']
means  = df_Grouped['rating_mean']

prior_weight = counts.quantile(0.70)
prior_mean   = means.mean()

weighted = ((means * counts) + (prior_mean * prior_weight)) / (counts + prior_weight)
df_Grouped['weighted_rating'] = np.round(weighted, 2)
df_Grouped.tail()

#####   
# 5) EDA of the Combo

In [None]:
# Joint plot: number of ratings per title against the title's mean rating
joint_grid = sns.jointplot(
    data=df_Grouped,
    x='rating_count',
    y='rating_mean',
    color='cyan',
    edgecolor='b',
    alpha=0.35,
)
joint_grid.savefig('jointplot.png')
plt.show()

In [None]:
# Histogram of the weighted rating (one observation per title)
fig, ax = plt.subplots()
ax.hist( df_Grouped['weighted_rating' ], bins=16, linewidth=1, edgecolor='blue', facecolor='cyan' )
ax.set( title='Distribution of weighted ratings', xlabel='weighted_rating', ylabel='number of titles' )
# Use a dedicated file name so this figure does not overwrite 'jointplot.png',
# which the joint-plot cell writes.
fig.savefig('histogram.png')
plt.show()

#####   
# 6) Pivot Table
 - Making a **Pivot Table** of **Ratings** with one row per **User** and one column per **Title**

In [None]:
# User x title matrix of ratings; a cell is NaN where the user has not rated the title
df_Pivot = df_merged.pivot_table(values='rating', index='userId', columns='title')

# Same matrix with the missing ratings replaced by 0 (used as input for KNN)
df_Pivot_Zero = df_Pivot.fillna(0)
df_Pivot_Zero.tail()

#####   
# 7) Making KNN

In [None]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Transpose so that each row is a title and each column a user, then store sparsely
CSR_Matrix = csr_matrix( df_Pivot_Zero.T.values )

# Brute-force nearest-neighbour search over the title rows, using cosine distance
Model_KNN = NearestNeighbors( algorithm='brute', metric='cosine' )
Model_KNN.fit( CSR_Matrix )

#####   
# 8A) Recommending ( Ratings + KNN )
 - Finding the **Item of Interest** in the **Pivot Table**
 - Finding the **Neighbors** of **Item of Interest**  Using  **KNN**
 - **Recommending Neighbors**

In [None]:
Item_of_Interest = 'Sin City (2005)'

# Ratings of the item of interest across all users, shaped as a single query row
query_row = df_Pivot_Zero[Item_of_Interest].values.reshape(1, -1)

# 21 nearest titles to the query row.
# The item of interest is itself one of the fitted rows, so it is presumably returned
# as its own nearest neighbour - hence 21 rather than 20.
distance, indices = Model_KNN.kneighbors( query_row, n_neighbors=21 )

# One row per neighbouring title with its rounded distance
neighbour_titles = list( df_Pivot_Zero.columns[ indices[0] ] )
new = pd.DataFrame( {'Distance': np.round( distance[0], 3 )}, index=neighbour_titles )
new.style.bar( subset=['Distance'], color='yellow' )


#####   

#####   
# 8B) Recommending ( Ratings + Correlations )
 - Finding the **Item of Interest** in the **Pivot Table**
 - Finding the **Correlation** of **Item of Interest** with the **Pivot Table**
 - **Recommending** Highly Correlated Items

In [None]:
# Rating column of the item of interest (NaN where a user has not rated it)
Item_Ratings = df_Pivot[ Item_of_Interest ]

# Correlate every title's rating column with that of the item of interest,
# drop titles for which no correlation could be computed, and name the column
r = (
    df_Pivot
    .corrwith( Item_Ratings )
    .dropna()
    .to_frame('Correlations')
)


In [None]:
# Recommending Highly Correlated Items
# Attach the per-title weighted rating and rating count to the correlation table
new = r.join( df_Grouped[['weighted_rating', 'rating_count']], on='title' )

new['Correlations'] = np.round( new['Correlations'], 2 )

# Keep titles that are strongly correlated, well rated and rated often enough
is_recommended = (
      ( new['Correlations']    > .75 )
    & ( new['weighted_rating'] > 3.0 )
    & ( new['rating_count']    > 40  )
)
new = new[ is_recommended ]

# Best match first; ties are broken by weighted rating, then by rating count
new = new.sort_values(
    by=['Correlations', 'weighted_rating', 'rating_count'],
    ascending=False,
)

print('\nItem of Interest =',Item_of_Interest)

np.random.seed(3)
cm = sns.light_palette("cyan", as_cmap=True)

# Styled table (append .to_excel('styled.xlsx', engine='openpyxl') to export it)
(
    new.head(2000).style
    .background_gradient(subset=['Correlations'], cmap=cm)
    .bar(subset=['weighted_rating'], color='deeppink')
    .bar(subset=['rating_count'], color='lime')
    .highlight_null(color="green")
    .format(precision=3)
)

#####   

#####   