In [2]:
import pandas as pd
import numpy as np
import os
import pandas_datareader.data as web
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

In [18]:
# setting up directory
# The dataset location can be overridden with the ML100K_DIR environment variable,
# so the notebook is not tied to one machine; the original path remains the fallback.
os.chdir( os.environ.get( "ML100K_DIR", "/Users/rishisinha/ml-100k" ) )

In [19]:

# u.data is read as a tab-separated file with no header row, so columns are numbered 0..3
rating_df = pd.read_csv( "u.data", sep = "\t", header = None )

In [20]:
rating_df.head(4)

Unnamed: 0,0,1,2,3
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923


*** Name the columns ***

In [21]:
# Replace the default integer column labels (0..3) with descriptive names
rating_df.columns = ["userid", "movieid", "rating", "timestamp"]

In [22]:

rating_df.head( 10 )

Unnamed: 0,userid,movieid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [23]:
#Number of unique users

In [24]:
# Count the distinct user ids present in the ratings
len( pd.unique( rating_df["userid"] ) )

943

In [25]:
#Number of unique movies

In [26]:
# Count the distinct movie ids present in the ratings
len( pd.unique( rating_df["movieid"] ) )

1682

In [27]:
# The timestamp is not used for similarity, so drop it.
# Rebinding (instead of inplace=True) with errors="ignore" keeps this cell safe to
# re-run: a second execution no longer raises KeyError once the column is gone.
rating_df = rating_df.drop( "timestamp", axis = 1, errors = "ignore" )

In [28]:

rating_df.head( 10 )

Unnamed: 0,userid,movieid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
5,298,474,4
6,115,265,2
7,253,465,5
8,305,451,3
9,6,86,3


In [29]:
# Load Movie Data

In [30]:
# u.item is pipe-separated with no header row.
# A literal "|" (rather than the regex-style '\|', which is also an invalid string
# escape) lets pandas use its default C parser and removes the parser warning.
# NOTE(review): MovieLens 100k's u.item contains ISO-8859-1 characters in some titles,
# hence the explicit encoding — confirm against your copy of the file.
movies_df = pd.read_csv( "u.item", sep = "|", header = None, encoding = "latin-1" )

  """Entry point for launching an IPython kernel.


In [31]:
# Keep only the first two columns of u.item and give them descriptive names;
# 'movieid' matches the column name used in rating_df so the two frames can be merged.
movies_df = movies_df.iloc[:,:2]
movies_df.columns = ['movieid', 'title']

In [32]:
movies_df.head( 10 )

Unnamed: 0,movieid,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,7,Twelve Monkeys (1995)
7,8,Babe (1995)
8,9,Dead Man Walking (1995)
9,10,Richard III (1995)


In [33]:
# Finding User Similarities

In [34]:
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [35]:
# Create Pivot

In [36]:
# One row per user, one column per movie, cells hold the rating (NaN where unrated).
# The userid index is discarded, so rows are labelled by 0-based position afterwards.
user_movies_df = (
    rating_df
    .pivot( index = "userid", columns = "movieid", values = "rating" )
    .reset_index( drop = True )
)

In [37]:
#Fill '0' for ratings not given by users

In [38]:
# Missing ratings become 0 so every user is a fully populated vector
user_movies_df = user_movies_df.fillna( 0 )

In [39]:
user_movies_df.shape

(943, 1682)

In [40]:
user_movies_df.iloc[10:20, 20:30]

movieid,21,22,23,24,25,26,27,28,29,30
10,0.0,4.0,0.0,3.0,3.0,0.0,0.0,5.0,3.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
12,3.0,4.0,5.0,1.0,1.0,0.0,3.0,5.0,2.0,0.0
13,0.0,3.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
15,0.0,5.0,0.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,5.0,4.0,0.0,3.0,4.0,0.0,3.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the distances
Based on the ratings users have given to different items, we can calculate the distances between them. The smaller the distance, the more similar the users are.

For example, the following users have given different ratings to different books.



Now, we can find similar users based on the distance between users, which depends on how they have rated the items. The dimensions are the books and the scale is the ratings users have provided.



For calculating distances, many similarity coefficients can be calculated. Most widely used similarity coefficients are Euclidean, Cosine, Pearson Correlation etc.

We will use cosine distance here. However, we are interested in similarity, where a higher value means the users are more similar. Since the function gives us the distance, we will subtract it from 1.

In [41]:
# pairwise_distances returns the cosine *distance* between every pair of user rows;
# subtracting it from 1 turns it into a similarity (higher = more alike).
# DataFrame.as_matrix() was removed in pandas 1.0, so the underlying array is taken
# via .values, which works on both old and current pandas versions.
user_sim = 1 - pairwise_distances( user_movies_df.values, metric = "cosine" )

In [42]:
# Wrap the user-by-user similarity matrix in a DataFrame (default 0-based labels)
user_sim_df = pd.DataFrame( data = user_sim )

In [43]:
user_sim_df[0:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
2,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


Who is similar to who?
Users with highest similarity values can be treated as similar users.

In [46]:
# For each of the first five rows, the column label holding the largest similarity
user_sim_df.idxmax( axis = "columns" ).iloc[:5]

0    0
1    1
2    2
3    3
4    4
dtype: int64

The above results show that users are most similar to themselves. But this is not what we want. So, we will fill the diagonal of the matrix (which represents each user's similarity with themselves) with 0.

<h3>Setting similarity with self to 0</h3>

In [47]:
# Overwrite the self-similarity entries on the diagonal with 0; this modifies user_sim in place.
# NOTE(review): a DataFrame built earlier from user_sim may share this buffer and so
# reflect the change as well — confirm before relying on the earlier frame.
np.fill_diagonal( user_sim, 0 )

In [48]:
# Rebuild the DataFrame from the similarity matrix whose diagonal is now 0
user_sim_df = pd.DataFrame( data = user_sim )

In [49]:
user_sim_df[0:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,0.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
1,0.166931,0.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
2,0.04746,0.110591,0.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
3,0.064358,0.178121,0.344151,0.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
4,0.378475,0.072979,0.021245,0.031804,0.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [50]:
# Finding user similarities

In [51]:

# Most similar other user (0-based label) for ten randomly chosen rows; seed fixed for repeatability
user_sim_df.idxmax( axis = "columns" ).sample( n = 10, random_state = 10 )

544    756
309    246
448    893
628    537
284    413
572    693
225    866
567    311
75     176
726    496
dtype: int64

<pre><h4>This shows which users are most similar to each other. The actual user id is the index number + 1, because the index of the user-movie table was reset to start at 0. That means user 545 is most similar to user 757, and so on.
Movies similar users like or dislike
We can find the actual movie names and check whether the similar users have rated them similarly or differently.</h4></pre>

In [52]:
def get_user_similar_movies( user1, user2 ):
    """Return the movies rated by both users, with each user's rating and the title.

    Parameters
    ----------
    user1, user2 : values compared against ``rating_df.userid``.

    Returns
    -------
    DataFrame
        One row per movie present in both users' ratings. Columns from ``user1``
        carry the ``_x`` suffix, columns from ``user2`` the ``_y`` suffix, and the
        title is joined in from ``movies_df``.
    """
    first_ratings = rating_df.loc[rating_df["userid"] == user1]
    second_ratings = rating_df.loc[rating_df["userid"] == user2]

    # Inner join keeps only the movies that both users have rated
    shared = pd.merge( first_ratings, second_ratings, on = "movieid", how = "inner" )

    # Attach human-readable titles
    return pd.merge( shared, movies_df, on = 'movieid' )

<h3>User 310 Vs. User 247</h3>

In [53]:

get_user_similar_movies( 310, 247 )

Unnamed: 0,userid_x,movieid,rating_x,userid_y,rating_y,title
0,310,258,3,247,5,Contact (1997)
1,310,257,5,247,4,Men in Black (1997)
2,310,251,5,247,4,Shall We Dance? (1996)
3,310,1022,5,247,4,"Fast, Cheap & Out of Control (1997)"
4,310,222,3,247,3,Star Trek: First Contact (1996)
5,310,181,4,247,4,Return of the Jedi (1983)
6,310,50,5,247,5,Star Wars (1977)


<pre>
Challenges with User similarity
The challenge with calculating user similarity is that the user needs to have made some prior purchases and to have rated them. This recommendation technique does not work for new users. The system needs to wait until the user makes some purchases and rates them. Only then can similar users be found and recommendations be made. This is called the cold start problem.
This can be avoided by calculating item similarities based on how users buy these items and rate them together. Here the items are the entities and the users are the dimensions.