In [1]:
import re
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# read 1 million ratings data
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ratings.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# read users file
u_cols = ['user_id', 'sex', 'age', 'occupation', 'zip_code']
users = pd.read_csv('users.dat', sep='::', names=u_cols, usecols=range(0,4),encoding='latin-1', engine='python', 
                    dtype={'age':object,'occupation':object})
users.head(5)

Unnamed: 0,user_id,sex,age,occupation
0,1,F,1,10
1,2,M,56,16
2,3,M,25,15
3,4,M,45,7
4,5,M,25,20


In [4]:
# add a new column about like or not like, 4 is the threshold
ratings['like'] = 0
for index, row in ratings.iterrows():
    if row['rating'] >= 4 :
        ratings.set_value(index,'like',1)
    else:
        ratings.set_value(index,'like',0)

ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like
0,1,1193,5,978300760,1
1,1,661,3,978302109,0
2,1,914,3,978301968,0
3,1,3408,4,978300275,1
4,1,2355,5,978824291,1


In [5]:
users.index = users.user_id
del users['user_id']
users.head(5)

Unnamed: 0_level_0,sex,age,occupation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,F,1,10
2,M,56,16
3,M,25,15
4,M,45,7
5,M,25,20


In [6]:
user_matrix = pd.get_dummies(users)
user_matrix.head(5)

Unnamed: 0_level_0,sex_F,sex_M,age_1,age_18,age_25,age_35,age_45,age_50,age_56,occupation_0,...,occupation_19,occupation_2,occupation_20,occupation_3,occupation_4,occupation_5,occupation_6,occupation_7,occupation_8,occupation_9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,0,1,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
user_matrix.T

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
sex_F,1,0,0,0,0,1,0,0,0,1,...,1,0,0,0,1,1,1,1,1,0
sex_M,0,1,1,1,1,0,1,1,1,0,...,0,1,1,1,0,0,0,0,0,1
age_1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
age_18,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
age_25,0,0,1,0,1,0,0,1,1,0,...,0,0,0,1,1,1,0,0,0,1
age_35,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
age_45,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
age_50,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
age_56,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
occupation_0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [8]:
# create a "like" matrix
ratings_matrix = ratings.pivot(index = 'user_id', columns = 'movie_id', values = 'like').fillna(0)

ratings_matrix.T.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
enhance_user_matrix = pd.concat([ratings_matrix.T,user_matrix.T])
enhance_user_matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# use apriori to get "movie basket"
frequent_movieset = apriori(enhance_user_matrix, min_support=0.1, use_colnames=True)

NameError: name 'enhance_user_matrix' is not defined

In [40]:
frequent_movieset.head(5)

Unnamed: 0,support,itemsets
0,0.144272,[53]
1,0.10546,[148]
2,0.115632,[149]
3,0.105996,[173]
4,0.15257,[195]


In [41]:
# get the association rules
rules = association_rules(frequent_movieset,metric='confidence',min_threshold=0.1)

In [42]:
rules.head(5)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(53),(4277),0.144272,0.742115,1.928054
1,(4277),(53),0.384904,0.278164,1.928054
2,(195),(4277),0.15257,0.714035,1.855101
3,(4277),(195),0.384904,0.283032,1.855101
4,(424),(549),0.211724,0.542351,2.655603


In [43]:
# substract movie id from antecedants, only reserve the number itself
rules['antecedants_movie'] = rules['antecedants'].apply(lambda x: re.match(".+\\(\\{(.+)\\}\\)",str(x)).group(1))
rules['consequents_movie'] = rules['consequents'].apply(lambda x: re.match(".+\\(\\{(.+)\\}\\)",str(x)).group(1))
rules.head(5)

Unnamed: 0,antecedants,consequents,support,confidence,lift,antecedants_movie,consequents_movie
0,(53),(4277),0.144272,0.742115,1.928054,53,4277
1,(4277),(53),0.384904,0.278164,1.928054,4277,53
2,(195),(4277),0.15257,0.714035,1.855101,195,4277
3,(4277),(195),0.384904,0.283032,1.855101,4277,195
4,(424),(549),0.211724,0.542351,2.655603,424,549


In [44]:
# subset rules of which antecedants only got one movie
rules['ant_len'] = rules['antecedants'].apply(lambda x: len(x))
rules_1movie = rules[rules['ant_len']==1]
rules_1movie.shape,rules.shape

((307, 8), (340, 8))

In [45]:
# create a dictionary, map consequents to each unique antecedants
rec_1movie_integrated = {}
for index,row in rules_1movie.iterrows():
    movie_ant = row['antecedants_movie']
    if movie_ant not in rec_1movie_integrated.keys():
        rec_1movie_integrated[movie_ant] = row['consequents_movie']
    else:
        rec_1movie_integrated[movie_ant] = rec_1movie_integrated[movie_ant] + "," + row['consequents_movie']

In [46]:
list(rec_1movie_integrated.items())[:2]

[('53', '4277'),
 ('4277',
  '53,195,424,524,549,550,678,692,710,839,1015,1088,1117,1285,1317,1448,1451,1635,1680,1698,1835,1880,1884,1899,1980,2304,2507,2793,2909,3029,3032,3163,3224,3272,3292,3391,3483,3539,3705,3841,4169,4344,4386,4448,4482,4508,4543,4802,5046,5100,5312,5511,5614,5795,5831,424, 1285,424, 4169,424, 4448,424, 5831,1285, 1015,4169, 1015,4169, 1285,1285, 5831,4448, 2909,4169, 5831,4448, 5831')]

In [47]:
# append associated movies to each movie
ratings['recommend'] = ''
for index,row in ratings.iterrows():
    if str(row['movie_id']) in rec_1movie_integrated.keys():
        ratings.set_value(index,'recommend',rec_1movie_integrated.get(str(row['movie_id']),""))

In [48]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
0,1,1193,5,978300760,1,
1,1,661,3,978302109,0,
2,1,914,3,978301968,0,
3,1,3408,4,978300275,1,
4,1,2355,5,978824291,1,


In [49]:
# get movies which have recomnendations
ratings_valid = ratings[ratings['recommend']!='']
ratings_valid.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
333,5,3163,5,978244852,1,42442775831
336,5,1635,4,978245314,1,4277
418,5,1884,3,978246576,0,41694277
504,6,1088,5,978236670,1,41694277
604,8,524,5,978230611,1,41694277


In [50]:
len(set(ratings_valid['user_id']))  # only 2947 users get recommendations

2947

In [51]:
# get users who have recommendations
test_ratings = ratings[ratings['user_id'].isin(set(ratings_valid['user_id']))]
test_ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
254,5,2987,4,978243170,1,
255,5,2333,4,978242607,1,
256,5,1175,5,978244759,1,
257,5,39,3,978245037,0,
258,5,288,2,978246585,0,


In [52]:
# define a function to get correct predictions, actualy liked movies and predicted liked movies
def get_correct_pred(user_id):
    liked = test_ratings[(test_ratings['user_id']==user_id) & (test_ratings['like']==1)]
    recommended = liked[liked['recommend']!='']
    recommended['recommend'] = recommended['recommend'].apply(lambda x: x.split(','))
    rec_list = list(recommended['recommend'])
    unique_rec  = [val for sublist in rec_list for val in sublist]
    correct = sum(liked['movie_id'].isin(set(unique_rec)))
    total_liked = len(liked)
    pred_liked = len(set(unique_rec))
    return correct,total_liked,pred_liked

In [54]:
# for user 5, correct=0, actual=82, pred= 3
get_correct_pred(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


(0, 82, 3)

In [55]:
# get correct,total_liked,pred_liked for each user
test_userid = set(ratings_valid['user_id'])
test_result = pd.DataFrame(list(test_userid),columns=['user_id'])
test_result['user_id'] = test_result['user_id'].apply(lambda x: pd.to_numeric(x))
test_result['user_id'].apply(lambda x : get_correct_pred(x))
pred = test_result['user_id'].apply(lambda x : get_correct_pred(x))
test_result['correct'] = pred.apply(lambda x: x[0])
test_result['total_liked'] = pred.apply(lambda x: x[1])
test_result['pred_liked'] = pred.apply(lambda x: x[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [56]:
# calculate precision and recall
precision = test_result['correct'].sum()/test_result['total_liked'].sum()
recall = test_result['correct'].sum()/test_result['pred_liked'].sum()
print("Precision: " + str(precision*100) + "%" + "\n" + 
      "Recall: " + str(recall*100) + "%")

Precision: 0.08596858323311389%
Recall: 2.0475743694464374%
