#### Import required Libraries

In [14]:
import pandas as pd
import scipy as sp
from scipy import spatial
import numpy as np

#### Import the source data

In [15]:
# Load the MovieLens "ml-latest-small" ratings table from the local Data folder.
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated literal was only a valid relative path on Windows.
ratingsData = pd.read_csv("./Data/ml-latest-small/ratings.csv", sep=",")

In [16]:
# Preview the first ten rows of the loaded ratings to confirm the columns parsed as expected.
ratingsData.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


##### Remove the Timestamp from the data

In [17]:
# Keep only the three columns the analysis uses; this drops the timestamp.
keptColumns = ['userId', 'movieId', 'rating']
ratingsData = ratingsData[keptColumns]

###### Calculate the average rating of each movie; it is subtracted from the actual ratings to center the data before calculating similarity based on Adjusted Cosine Distance

In [18]:
# Mean rating per movie, indexed by movieId, with a single 'rating' column.
AverageMovieRating = ratingsData.groupby('movieId')[['rating']].mean()
AverageMovieRating.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.87247
2,3.401869
3,3.161017
4,2.384615
5,3.267857


#### Use each movie Id and its corresponding average to create a DataFrame, which is later joined to the actual rating table

In [19]:
# Flatten the per-movie averages into a plain two-column frame
# (MovieId, MovieRating) with a 0..n-1 integer index, ready to be merged
# back onto the ratings table.
MovieId = AverageMovieRating.index
MovieRating = AverageMovieRating['rating'].values.copy()
rowLabels = np.arange(len(MovieId))
AvgMovieRatingDf = pd.DataFrame(
    {'MovieId': MovieId, 'MovieRating': MovieRating},
    index=rowLabels,
)
AvgMovieRatingDf.tail(10)

Unnamed: 0,MovieId,MovieRating
9056,161084,2.5
9057,161155,0.5
9058,161594,3.0
9059,161830,1.0
9060,161918,1.5
9061,161944,5.0
9062,162376,4.5
9063,162542,5.0
9064,162672,3.0
9065,163949,5.0


#### Merge the average rating with the master rating table

In [20]:
# Left-join the per-movie average onto every individual rating row.
ratingsDataWithAverage = ratingsData.merge(
    AvgMovieRatingDf,
    how='left',
    left_on='movieId',
    right_on='MovieId',
)
# Positional rename: this assumes the merged column order is
# userId, movieId, rating, MovieId, MovieRating.
mergedColumnNames = ['UserId', 'MovieId', 'ActualRating', 'm_MovieId', 'AverageRating']
ratingsDataWithAverage.columns = mergedColumnNames
# Spot-check one movie: every one of its rows should carry the same AverageRating.
isMovie31 = ratingsDataWithAverage['MovieId'] == 31
ratingsDataWithAverage[isMovie31].head(10)

Unnamed: 0,UserId,MovieId,ActualRating,m_MovieId,AverageRating
0,1,31,2.5,31,3.178571
498,7,31,3.0,31,3.178571
6059,31,31,4.0,31,3.178571
6130,32,31,4.0,31,3.178571
6526,36,31,3.0,31,3.178571
6773,39,31,3.0,31,3.178571
10223,73,31,3.5,31,3.178571
13516,88,31,3.0,31,3.178571
14810,96,31,2.5,31,3.178571
16867,110,31,4.0,31,3.178571


### Center the Data

In [21]:
# Center each rating on its movie's mean:
#     CenteredRating = ActualRating - AverageRating
# assign() returns a new frame, so ratingsDataWithAverage is left untouched.
# The plain `=` used previously only aliased the same object, so adding the
# column silently modified ratingsDataWithAverage as well.
ratingsDataCentered = ratingsDataWithAverage.assign(
    CenteredRating=ratingsDataWithAverage['ActualRating'] - ratingsDataWithAverage['AverageRating']
)
ratingsDataCentered.head(10)

Unnamed: 0,UserId,MovieId,ActualRating,m_MovieId,AverageRating,CenteredRating
0,1,31,2.5,31,3.178571,-0.678571
1,1,1029,3.0,1029,3.702381,-0.702381
2,1,1061,3.0,1061,3.545455,-0.545455
3,1,1129,2.0,1129,3.3125,-1.3125
4,1,1172,4.0,1172,4.26087,-0.26087
5,1,1263,2.0,1263,3.864583,-1.864583
6,1,1287,2.0,1287,3.891304,-1.891304
7,1,1293,2.0,1293,3.978261,-1.978261
8,1,1339,3.5,1339,3.298077,0.201923
9,1,1343,2.0,1343,3.74359,-1.74359


#### Test the centering: the centered ratings of every movie should sum to zero

In [22]:
# Sanity check: after centering, the ratings of each movie must sum to zero
# (rounded to 10 decimals to hide floating-point noise).
# Uses the native GroupBy.sum() rather than passing the Python builtin `sum`
# to aggregate(), which recent pandas versions deprecate.
ratingsDataCentered.groupby('MovieId')[['CenteredRating']].sum().round(10).head(10)

Unnamed: 0_level_0,CenteredRating
MovieId,Unnamed: 1_level_1
1,0.0
2,0.0
3,-0.0
4,0.0
5,0.0
6,0.0
7,-0.0
8,0.0
9,0.0
10,-0.0


## **VERY IMPORTANT STEP - ITEM-USER REPRESENTATION MATRIX**

###### Let us first extract unique movies and users and hold them in an array

In [25]:
# Distinct movie and user ids, in order of first appearance in the ratings.
movies = ratingsDataCentered['MovieId'].unique()
users = ratingsDataCentered['UserId'].unique()
# Item-user matrix: one row per movie, one column per user, 0.0 meaning
# "no rating". It is initialised with a float because the cells will hold
# centered (fractional) ratings; an integer 0 would create int64 columns and
# force an implicit dtype upcast when the floats are written in.
item_user_df = pd.DataFrame(0.0, columns=users, index=movies)

In [26]:
# Preview the item-user matrix before it is filled (rows: movie ids, columns: user ids).
item_user_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1129,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1172,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Fill the item-user matrix with the centered ratings in one vectorised step.
# Each rating row is translated to a (row, column) position in the matrix and
# all values are written at once, instead of re-filtering the whole ratings
# frame once per user inside a Python loop.
movie_rows = item_user_df.index.get_indexer(ratingsDataCentered['MovieId'])
user_cols = item_user_df.columns.get_indexer(ratingsDataCentered['UserId'])
# get_indexer reports unknown labels as -1; fail loudly rather than letting a
# negative position write into the wrong cell.
if (movie_rows < 0).any() or (user_cols < 0).any():
    raise KeyError("ratingsDataCentered contains a MovieId or UserId that is missing from item_user_df")
# Work on an explicit float copy so fractional ratings are stored correctly
# even if the matrix was created with an integer dtype. Cells without a
# rating keep their existing value.
item_user_values = item_user_df.values.astype(float)
item_user_values[movie_rows, user_cols] = ratingsDataCentered['CenteredRating'].values
item_user_df = pd.DataFrame(item_user_values, index=item_user_df.index, columns=item_user_df.columns)
item_user_df.head()


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
31,-0.678571,0.0,0.0,0.0,0.0,0.0,-0.178571,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1029,-0.702381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.297619,0.0,0.0,0.0,0.0,0.0,0.0
1061,-0.545455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.545455,0.0,0.0,0.0,0.0,0.0,0.0
1129,-1.3125,0.0,0.0,0.0,0.0,0.0,-0.3125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1172,-0.26087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### As we are trying to do model-based collaborative filtering by finding similarities with the Adjusted Cosine Similarity approach, we have to measure the vector distance of each item from all the other items. If we have 10 items we get a 10x10 array and need to perform 100 calculations. In our case we have about 9,000 movies, which gives a 9,000x9,000 matrix with 81,000,000 calculations. Computed one pair at a time, it would effectively run forever. And this is just a sample data set, which is not even 1% of all movies.

#### We have to take a different approach to find the similarities. We can do clustering.