In [4]:
import re
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import os

os.chdir("C:\\Users\\Leo\\OneDrive\\文档\\17-18 Semester1\\EB5202 Web Analysis\\MovieLens\\1millionfile\\ml-1m")

In [5]:
# read 1 million ratings data
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ratings.dat', sep='::', names=r_cols, encoding='latin-1', engine='python')
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [9]:
# add a new column about like or not like, 4 is the threshold
ratings['like'] = 0
for index, row in ratings.iterrows():
    if row['rating'] >= 4 :
        ratings.set_value(index,'like',1)
    else:
        ratings.set_value(index,'like',0)

ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like
0,1,1193,5,978300760,1
1,1,661,3,978302109,0
2,1,914,3,978301968,0
3,1,3408,4,978300275,1
4,1,2355,5,978824291,1


In [8]:
# create a "like" matrix
ratings_matrix = ratings.pivot(index = 'user_id', columns = 'movie_id', values = 'like').fillna(0)

ratings_matrix.head(5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# use apriori to get "movie basket"
frequent_movieset = apriori(ratings_matrix, min_support=0.1, use_colnames=True)

In [17]:
frequent_movieset.head(5)

Unnamed: 0,support,itemsets
0,0.274007,[1]
1,0.107947,[6]
2,0.115563,[11]
3,0.104636,[17]
4,0.130298,[21]


In [13]:
# get the association rules
rules = association_rules(frequent_movieset,metric='confidence',min_threshold=0.1)

In [15]:
rules.head(5)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(1),(34),0.274007,0.407251,1.993351
1,(34),(1),0.204305,0.546191,1.993351
2,(1),(50),0.274007,0.40423,1.518375
3,(50),(1),0.266225,0.416045,1.518375
4,(1),(110),0.274007,0.455589,1.391886


In [28]:
# substract movie id from antecedants, only reserve the number itself
rules['antecedants_movie'] = rules['antecedants'].apply(lambda x: re.match(".+\\(\\{(.+)\\}\\)",str(x)).group(1))
rules['consequents_movie'] = rules['consequents'].apply(lambda x: re.match(".+\\(\\{(.+)\\}\\)",str(x)).group(1))
rules.head(5)

Unnamed: 0,antecedants,consequents,support,confidence,lift,antecedants_movie,consequents_movie,ant_len
0,(1),(34),0.274007,0.407251,1.993351,1,34,1
1,(34),(1),0.204305,0.546191,1.993351,34,1,1
2,(1),(50),0.274007,0.40423,1.518375,1,50,1
3,(50),(1),0.266225,0.416045,1.518375,50,1,1
4,(1),(110),0.274007,0.455589,1.391886,1,110,1


In [26]:
# subset rules of which antecedants only got one movie
rules['ant_len'] = rules['antecedants'].apply(lambda x: len(x))
rules_1movie = rules[rules['ant_len']==1]
rules_1movie.shape,rules.shape

((9736, 8), (20688, 8))

In [29]:
# create a dictionary, map consequents to each unique antecedants
rec_1movie_integrated = {}
for index,row in rules_1movie.iterrows():
    movie_ant = row['antecedants_movie']
    if movie_ant not in rec_1movie_integrated.keys():
        rec_1movie_integrated[movie_ant] = row['consequents_movie']
    else:
        rec_1movie_integrated[movie_ant] = rec_1movie_integrated[movie_ant] + "," + row['consequents_movie']

In [45]:
list(rec_1movie_integrated.items())[:2]

[('1',
  '34,50,110,260,296,318,356,457,480,527,588,589,593,608,858,919,1036,1097,1136,1196,1197,1198,1200,1210,1214,1240,1259,1265,1270,1291,1307,1580,1617,2028,2355,2396,2571,2716,2762,2858,2997,3114,1196, 260,260, 1197,260, 1198,1210, 260,260, 1270,260, 2028,2571, 260,593, 318,1196, 589,2571, 589,593, 2762,593, 2858,1196, 1197,1196, 1198,1210, 1196,1196, 1270,1196, 2028,2571, 1196,1197, 1198,1210, 1198,1198, 1270,2571, 1198,1196, 260, 1198,1210, 1196, 260,2571, 1196, 260'),
 ('34', '1,260,318,527,593,608,1196,1198,1265,2396,2858')]

In [48]:
# append associated movies to each movie
ratings['recommend'] = ''
for index,row in ratings.iterrows():
    if str(row['movie_id']) in rec_1movie_integrated.keys():
        ratings.set_value(index,'recommend',rec_1movie_integrated.get(str(row['movie_id']),""))

In [49]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113..."
1,1,661,3,978302109,0,
2,1,914,3,978301968,0,
3,1,3408,4,978300275,1,2858
4,1,2355,5,978824291,1,126011962396276228583114


In [50]:
# get movies which have recomnendations
ratings_valid = ratings[ratings['recommend']!='']
ratings_valid.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113..."
3,1,3408,4,978300275,1,2858
4,1,2355,5,978824291,1,126011962396276228583114
5,1,1197,3,978302268,0,"1,50,110,260,296,318,356,457,480,527,541,589,5..."
7,1,2804,5,978300719,1,26031859360811961197119812702858


In [54]:
# get users who have recommendations
test_ratings = ratings[ratings['user_id'].isin(set(ratings_valid['user_id']))]
test_ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113..."
1,1,661,3,978302109,0,
2,1,914,3,978301968,0,
3,1,3408,4,978300275,1,2858
4,1,2355,5,978824291,1,126011962396276228583114


In [65]:
# define a function to get correct predictions, actualy liked movies and predicted liked movies
def get_correct_pred(user_id):
    liked = test_ratings[(test_ratings['user_id']==user_id) & (test_ratings['like']==1)]
    recommended = liked[liked['recommend']!='']
    recommended['recommend'] = recommended['recommend'].apply(lambda x: x.split(','))
    rec_list = list(recommended['recommend'])
    unique_rec  = [val for sublist in rec_list for val in sublist]
    correct = sum(liked['movie_id'].isin(set(unique_rec)))
    total_liked = len(liked)
    pred_liked = len(set(unique_rec))
    return correct,total_liked,pred_liked

In [66]:
# for user 200, correct=0, actual=13, pred= 45
get_correct_pred(200)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


(0, 13, 45)

In [67]:
# get correct,total_liked,pred_liked for each user
test_userid = set(ratings_valid['user_id'])
test_result = pd.DataFrame(list(test_userid),columns=['user_id'])
test_result['user_id'] = test_result['user_id'].apply(lambda x: pd.to_numeric(x))
test_result['user_id'].apply(lambda x : get_correct_pred(x))
pred = test_result['user_id'].apply(lambda x : get_correct_pred(x))
test_result['correct'] = pred.apply(lambda x: x[0])
test_result['total_liked'] = pred.apply(lambda x: x[1])
test_result['pred_liked'] = pred.apply(lambda x: x[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [69]:
# calculate precision and recall
precision = test_result['correct'].sum()/test_result['total_liked'].sum()
recall = test_result['correct'].sum()/test_result['pred_liked'].sum()
print("Precision: " + str(precision*100) + "%" + "\n" + 
      "Recall: " + str(recall*100) + "%")

Precision: 27.27544540651982%
Recall: 17.7122266912154%


The next part only gets the top10 associated movies set for each movie, for example, if movie 1 has 500 asscoiated movies, here we only use the top 10 highest confidence movies to be the recommendations for movie1.

In [77]:
# get top 10 associated movies for each movie
ant_movieid = set(rules_1movie['antecedants_movie'])
top10_rec = pd.DataFrame(columns=rules_1movie.columns)
for i in ant_movieid:
    temp = rules_1movie[rules_1movie['antecedants_movie']==i].nlargest(10,'confidence')
    top10_rec = top10_rec.append(temp)

top10_rec.head(5)

Unnamed: 0,antecedants,consequents,support,confidence,lift,antecedants_movie,consequents_movie,ant_len
490,(2529),(260),0.118212,0.848739,1.955144,2529,260,1
353,(541),(260),0.245861,0.736027,1.695501,541,260,1
1027,(541),(1196),0.245861,0.727946,1.751711,541,1196,1
1051,(541),(2571),0.245861,0.626936,1.744216,541,2571,1
4695,(541),"(1196, 260)",0.245861,0.626263,1.929911,541,"1196, 260",1


In [76]:
# create another association dictionary for this part
rec_1movie_top10 = {}
for index,row in top10_rec.iterrows():
    movie_ant = row['antecedants_movie']
    if movie_ant not in rec_1movie_top10.keys():
        rec_1movie_top10[movie_ant] = row['consequents_movie']
    else:
        rec_1movie_top10[movie_ant] = rec_1movie_top10[movie_ant] + "," + row['consequents_movie']

list(rec_1movie_top10.items())

[('2529', '260'),
 ('541', '260,1196,2571,1196, 260,1214,1198,1240,589,1200,1210'),
 ('1214', '260,1196,1196, 260,1240,1200,2571,1198,589,1210,541'),
 ('733', '2571'),
 ('1136', '260,1196,1198,2858,1197,1196, 260,593,858,608,2571'),
 ('1247', '2858,858,608,260,1196,593,912,1198'),
 ('1291',
  '1198,1196,260,1196, 1198,260, 1198,1210,1196, 260,1210, 1196,1210, 1198,1196, 260, 1198'),
 ('1073', '260,1196'),
 ('904', '260'),
 ('2987', '1196,260,1198,2858,2716,1196, 260,2762,1210,1197'),
 ('2028', '260,1196,2858,593,527,110,1198,2571,318,589'),
 ('2599', '2858,2997,608,2858, 2997,2762,2396,593,296'),
 ('2692', '2858'),
 ('923', '858,912,260'),
 ('47', '593,50,2858,608,2762,296'),
 ('2115', '1198'),
 ('1242', '1196,260,1198,2028'),
 ('1200', '1196,260,1214,1240,2571,589,1196, 260,1198,1210,260, 1214'),
 ('1784', '2858,593,318,356'),
 ('1704', '318,2858,593,2028,527,608,296,2762,1196,593, 318'),
 ('1374',
  '1196,260,1196, 260,1210,1198,2571,1240,1210, 1196,1210, 260,1196, 1198'),
 ('3175', 

In [78]:
ratings['recommend_top10'] = ''
for index,row in ratings.iterrows():
    if str(row['movie_id']) in rec_1movie_top10.keys():
        ratings.set_value(index,'recommend_top10',rec_1movie_top10.get(str(row['movie_id']),""))

ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend,recommend_top10
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113...",2858858260608593318119652711982028
1,1,661,3,978302109,0,,
2,1,914,3,978301968,0,,
3,1,3408,4,978300275,1,2858,2858
4,1,2355,5,978824291,1,126011962396276228583114,285812762260119631142396


In [79]:
top10_valid = ratings[ratings['recommend_top10']!='']
top10_valid.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend,recommend_top10
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113...",2858858260608593318119652711982028
3,1,3408,4,978300275,1,2858,2858
4,1,2355,5,978824291,1,126011962396276228583114,285812762260119631142396
5,1,1197,3,978302268,0,"1,50,110,260,296,318,356,457,480,527,541,589,5...","1196,260,1198,1196, 260,1210,2571,2858,1270,11..."
7,1,2804,5,978300719,1,26031859360811961197119812702858,11962858260119711983185936081270


In [81]:
top10_ratings = ratings[ratings['user_id'].isin(set(top10_valid['user_id']))]
top10_ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,like,recommend,recommend_top10
0,1,1193,5,978300760,1,"50,110,260,296,318,527,593,608,858,912,919,113...",2858858260608593318119652711982028
1,1,661,3,978302109,0,,
2,1,914,3,978301968,0,,
3,1,3408,4,978300275,1,2858,2858
4,1,2355,5,978824291,1,126011962396276228583114,285812762260119631142396


In [88]:
def get_top10_pred(user_id):
    liked = top10_ratings[(top10_ratings['user_id']==user_id) & (top10_ratings['like']==1)]
    recommended = liked[liked['recommend_top10']!='']
    recommended['recommend_top10'] = recommended['recommend_top10'].apply(lambda x: x.split(','))
    rec_list = list(recommended['recommend_top10'])
    unique_rec  = [val for sublist in rec_list for val in sublist]
    correct = sum(liked['movie_id'].isin(set(unique_rec)))
    total_liked = len(liked)
    pred_liked = len(set(unique_rec))
    return correct,total_liked,pred_liked

In [89]:
top10_userid = set(top10_valid['user_id'])
top10_result = pd.DataFrame(list(top10_userid),columns=['user_id'])
top10_result['user_id'] = top10_result['user_id'].apply(lambda x: pd.to_numeric(x))
top10_pred = top10_result['user_id'].apply(lambda x : get_top10_pred(x))
top10_result['correct'] = top10_pred.apply(lambda x: x[0])
top10_result['total_liked'] = top10_pred.apply(lambda x: x[1])
top10_result['pred_liked'] = top10_pred.apply(lambda x: x[2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [90]:
top10_precision = top10_result['correct'].sum()/top10_result['total_liked'].sum()
top10_recall = top10_result['correct'].sum()/top10_result['pred_liked'].sum()
print("Precision: " + str(top10_precision*100) + "%" + "\n" + 
      "Recall: " + str(top10_recall*100) + "%")

Precision: 11.917720825851221%
Recall: 38.004168606842725%


We can see the precision is much lower but recall is much higher, it is reasonable as we decreased the denominator(total predcited movies) of recall while we decreased the numerator(correctly predicted movies) of precision.