In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
from datetime import datetime
import pickle as pk

In [2]:
ml_latest_path = "../../dataset/ml-latest-small/"

In [3]:
ratings_csv = ml_latest_path+'ratings.csv'
movies_csv = ml_latest_path+'movies.csv'

In [4]:
ratings_df = pd.read_csv(ratings_csv)
movies_df = pd.read_csv(movies_csv)

In [5]:
NUMBER_USER = ratings_df.loc[ratings_df['userId'].idxmax()]['userId'] # 610

In [6]:
#ratings_df

In [7]:
#input df and output two processed data(1 : pair with timestamp, 2 : dictionary)

def getSession_with_timestamp(ratings_df):
    training_pair_list = []
    user_session_dict = {}
    for index, row in ratings_df.iterrows():
        training_pair_list.append((int(row.userId), int(row.movieId), int(row.timestamp)))
        userId = int(row.userId)
        if userId not in user_session_dict:
            user_session_dict[userId] = []
            user_session_dict[userId].append(int(row.movieId))
        else:
            user_session_dict[userId].append(int(row.movieId))
    
    return training_pair_list, user_session_dict

In [8]:
training_pair_list, user_session_dict = getSession_with_timestamp(ratings_df)

In [9]:
#training_pair_list # user, item, time

In [10]:
#user_session_dict[1] # {user : [items]}

In [11]:
# for userId, item_list in user_session_dict.items():
#     print(userId, item_list)
#     break
    

In [12]:
movies_df.loc[movies_df.movieId == 1].genres.item()

'Adventure|Animation|Children|Comedy|Fantasy'

In [13]:
def genresParser(genres_str):
    return genres_str.split('|')

In [14]:
genresParser('Comedy')

['Comedy']

In [15]:
def get_user_genres_distribution(user_session_dict, movies_df):
    user_genres_dict = {}
    genres_dict = {}
    max_genres = 0
    all_genres = []
    for userId, item_list in user_session_dict.items():
        #user_session_len = len(item_list)
        #print(userId, item_list)
        
        item_sum = 0
        for item in item_list:
            #print(item)
            genres = movies_df.loc[movies_df.movieId == item].genres.item()
            genres_list = genresParser(genres)
            
            for g in genres_list:
                if g not in all_genres:
                    all_genres.append(g)
                item_sum += 1
                if g not in genres_dict.keys():
                    genres_dict[g] = 1
                else:
                    genres_dict[g] += 1
        if len(genres_dict) > max_genres:
            max_genres = len(genres_dict)
            
        div_genres_dict = {k: v/item_sum for k,v in genres_dict.items()}
        user_genres_dict[userId] = div_genres_dict
        genres_dict = {}
    
    #print(max_genres)
    
    return user_genres_dict, all_genres

In [16]:
user_genres_distri_dict, all_genres = get_user_genres_distribution(user_session_dict, movies_df)

In [17]:
def set_user_all_genres_distribution(user_genres_distri_dict, all_genres):
    
    for user in user_genres_distri_dict:
        if len(user_genres_distri_dict[user]) == 20:
            #user_genres_distri_dict[user].sort()
            print(user)
            continue
        else:
            missing_genres_list =  list(set(all_genres) - set(user_genres_distri_dict[user].keys()))
            missing_num = len(missing_genres_list)
            proportion = 1 - 0.005*missing_num
            user_genres_distri_dict[user] = {k : v*proportion for k, v in user_genres_distri_dict[user].items()}
            for g in missing_genres_list:
                user_genres_distri_dict[user][g] = 0.005
            
#             print(user)
#             print(len(user_genres_distri_dict[user]))
            

In [18]:
set_user_all_genres_distribution(user_genres_distri_dict, all_genres)

50
105
125
318
380
414
448
514
517
525
560
567
596
599


In [19]:
#user_genres_distri_dict

In [20]:
def dict_dict_to_df(user_genres_distri_dict, all_genres):
    df_dict = {}
    for user in user_genres_distri_dict:
        # user col
        if 'user' not in df_dict.keys():
            df_dict['user'] = [user]
        else:
            df_dict['user'].append(user)
            
        # genres col
        for genres in all_genres:
            if genres not in df_dict.keys():
                df_dict[genres] = [user_genres_distri_dict[user][genres]]
            else:
                df_dict[genres].append(user_genres_distri_dict[user][genres])
                
    user_genres_df = pd.DataFrame.from_dict(df_dict)
    return user_genres_df

In [21]:
user_genres_df = dict_dict_to_df(user_genres_distri_dict, all_genres)

In [22]:
user_genres_df.to_csv('./processed_data/user_genres_df.csv')

In [23]:
#user_genres_df

In [24]:
all_genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Action',
 'Crime',
 'Thriller',
 'Mystery',
 'Horror',
 'Drama',
 'War',
 'Western',
 'Sci-Fi',
 'Musical',
 'Film-Noir',
 'IMAX',
 'Documentary',
 '(no genres listed)']

In [25]:
len(movies_df.loc[movies_df.genres == '(no genres listed)'])

34

In [26]:
movies_df[movies_df.movieId == 1].genres.item().split('|')

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']

In [27]:
all_items = movies_df['movieId'].tolist()

In [28]:
user_session_dict[1][-1]

5060

In [29]:
def random_sampling(all_items, user_session_dict):
    training_data = []
    neg_list = []
    testing_data = {}
    testing_items = []
    
    for uid in user_session_dict:
        user_session = user_session_dict[uid]
        neg_item_list = set(all_items) - set(user_session)
        testing_item = user_session[-1]
        testing_items.append(testing_item) # keep for weighted sampling
        
        
        for pos_item in user_session[:-1]:
            neg_list = random.sample(neg_item_list, 4)
            for neg_item in neg_list:
                training_data.append((uid, pos_item, neg_item))
        
        testing_list = []
        test_neg_list = random.sample(neg_item_list, 99)
        for neg_test in test_neg_list:
            testing_list.append((uid, testing_item, neg_test))
        testing_data[uid] = testing_list
        
    return training_data, testing_data, testing_items

In [30]:
random_training_data, testing_data, testing_items = random_sampling(all_items, user_session_dict)

In [31]:
movies_df.movieId.to_dict()[2991]

4006

In [32]:
item_look_up_table = {v: k for k, v in movies_df.movieId.to_dict().items()}


In [33]:
temp_list = []
for tr in random_training_data:
    temp_list.append((tr[0], item_look_up_table[tr[1]], item_look_up_table[tr[2]]))
random_training_data = temp_list


In [34]:
temp_dict = {}
for user in testing_data:
    #print(user)
    temp_list = []
    for t in testing_data[user]:
        temp_list.append((t[0], item_look_up_table[t[1]], item_look_up_table[t[2]]))
    temp_dict[user] = temp_list
    temp_list = []

testing_data = temp_dict

In [35]:
#random_training_data

In [36]:
#testing_data[1]

In [37]:
len(testing_items)

610

In [38]:
time = datetime.now().strftime('%Y%m%d_%H%M%S')
print(time)

20190919_175344


In [39]:
with open('./processed_data/ml_latest_'+time+'_random_training'+'.pkl', 'wb') as f:
    pk.dump(random_training_data, f)
    
with open('./processed_data/ml_latest_'+time+'_testing'+'.pkl', 'wb') as f:
    pk.dump(testing_data, f)

In [40]:
def get_genres_item_list(movies_df):
    
    genres_item_dict = {}
    all_items = movies_df['movieId'].tolist()
    
    for item in all_items:
        item_genres_list = movies_df[movies_df.movieId == item].genres.item().split('|')
        for item_genres in item_genres_list:
            if item_genres not in genres_item_dict.keys():
                genres_item_dict[item_genres] = [item]
            else:
                 genres_item_dict[item_genres].append(item)
    
    return genres_item_dict

In [41]:
genres_item_dict = get_genres_item_list(movies_df)

In [42]:
user_genres_df.to_dict('records')[1]

{'(no genres listed)': 0.005,
 'Action': 0.14226666666666665,
 'Adventure': 0.0388,
 'Animation': 0.005,
 'Children': 0.005,
 'Comedy': 0.09053333333333334,
 'Crime': 0.12933333333333333,
 'Documentary': 0.0388,
 'Drama': 0.21986666666666665,
 'Fantasy': 0.005,
 'Film-Noir': 0.005,
 'Horror': 0.012933333333333333,
 'IMAX': 0.05173333333333333,
 'Musical': 0.005,
 'Mystery': 0.025866666666666666,
 'Romance': 0.012933333333333333,
 'Sci-Fi': 0.05173333333333333,
 'Thriller': 0.12933333333333333,
 'War': 0.012933333333333333,
 'Western': 0.012933333333333333,
 'user': 2.0}

In [43]:
#user_genres_df.to_dict('records')

In [44]:
def get_distribution(all_genres, user_genres_df):
    user_distribution_list = []
    distribution_list = []
    for user_genres_distri_dict in user_genres_df.to_dict('records'):
        for g in all_genres:
            distribution_list.append(user_genres_distri_dict[g])
        user_distribution_list.append(distribution_list)
        distribution_list = []
    
    return user_distribution_list

In [45]:
user_distribution_list = get_distribution(all_genres, user_genres_df)

In [46]:
for g in range(len(all_genres)):
    print(all_genres[g], end=', ')
    print(user_distribution_list[1][g])


Adventure, 0.0388
Animation, 0.005
Children, 0.005
Comedy, 0.09053333333333334
Fantasy, 0.005
Romance, 0.012933333333333333
Action, 0.14226666666666665
Crime, 0.12933333333333333
Thriller, 0.12933333333333333
Mystery, 0.025866666666666666
Horror, 0.012933333333333333
Drama, 0.21986666666666665
War, 0.012933333333333333
Western, 0.012933333333333333
Sci-Fi, 0.05173333333333333
Musical, 0.005
Film-Noir, 0.005
IMAX, 0.05173333333333333
Documentary, 0.0388
(no genres listed), 0.005


In [47]:
len(user_distribution_list)

610

In [48]:
t = np.random.choice(all_genres, 4, p=user_distribution_list[1])
list(t)

['Thriller', 'IMAX', 'Crime', 'Thriller']

In [49]:
# not with epsilon greedy policy
def topic_weighted_sampling(user_distribution_list, genres_item_dict, all_genres, user_session_dict, testing_items):
    training_data = []
    topic_testing_data = {}
    
    for uid in user_session_dict:
        user_session = user_session_dict[uid]
        dis_id = uid - 1
        distribution = user_distribution_list[dis_id]
        testing_item = testing_items[dis_id]
        
        #training
        for positive in user_session:
            if positive is testing_item:
                continue
        
            genres_choice_with_weight = list(np.random.choice(all_genres, 4, p=distribution))
    
            neg_list = []
            for genres in genres_choice_with_weight:
                genres_item_list = genres_item_dict[genres]
                sampled_item = random.sample(genres_item_list, 1)
                while sampled_item[0] in neg_list or sampled_item[0] in user_session_dict:
                    sampled_item = random.sample(genres_item_list, 1)
                neg_list.append(sampled_item[0])
        
            for neg in neg_list:
                training_data.append((uid, positive, neg))
        
        #testing
        testing_choice_with_weight = list(np.random.choice(all_genres, 99, p=distribution))
        neg_test_list = []
        for genres in testing_choice_with_weight:
            genres_item_list = genres_item_dict[genres]
            sampled_item = random.sample(genres_item_list, 1)
            while sampled_item[0] in neg_test_list or sampled_item[0] in user_session_dict:
                sampled_item = random.sample(genres_item_list, 1)
            neg_test_list.append(sampled_item[0])
        neg_test_list.append(testing_item)
        topic_testing_data[uid] = neg_test_list
    
    return training_data, topic_testing_data

In [50]:
# not with epsilon greedy policy
def topic_reverse_weighted_sampling(user_distribution_list, genres_item_dict, all_genres, user_session_dict, testing_items):
    training_data = []
    topic_testing_data = {}
        
    for uid in user_session_dict:
        user_session = user_session_dict[uid]
        dis_id = uid - 1
        distribution = user_distribution_list[dis_id]
        tmp_dis_list = [1-x for x in distribution]
        sum_re_dis = sum(tmp_dis_list)
        re_distribution = [x/sum_re_dis for x in tmp_dis_list]
        testing_item = testing_items[dis_id]
        
        for positive in user_session:
            if positive is testing_item:
                continue
        
            genres_choice_with_weight = list(np.random.choice(all_genres, 4, p=re_distribution))
            neg_list = []
            for genres in genres_choice_with_weight:
                genres_item_list = genres_item_dict[genres]
                sampled_item = random.sample(genres_item_list, 1)
                while sampled_item[0] in neg_list or sampled_item[0] in user_session_dict:
                    sampled_item = random.sample(genres_item_list, 1)
                neg_list.append(sampled_item[0])
        
            for neg in neg_list:
                training_data.append((uid, positive, neg))
    
        
        #testing
        testing_choice_with_weight = list(np.random.choice(all_genres, 99, p=re_distribution))
        neg_test_list = []
        for genres in testing_choice_with_weight:
            genres_item_list = genres_item_dict[genres]
            sampled_item = random.sample(genres_item_list, 1)
            while sampled_item[0] in neg_test_list or sampled_item[0] in user_session_dict:
                sampled_item = random.sample(genres_item_list, 1)
            neg_test_list.append(sampled_item[0])
        neg_test_list.append(testing_item)
        topic_testing_data[uid] = neg_test_list
    
    return training_data, topic_testing_data

In [51]:
#def topic_test_sampling(user_distribution_list, genres_item_dict, testing_items, )

In [52]:
topic_reverse_weighted_training_data, topic_reverse_testing_data = topic_reverse_weighted_sampling(user_distribution_list, genres_item_dict, all_genres, user_session_dict, testing_items)


In [53]:
#topic_reverse_weighted_training_data

In [54]:
len(topic_reverse_testing_data[1])

100

In [55]:
topic_weighted_training_data, topic_testing_data = topic_weighted_sampling(user_distribution_list, genres_item_dict, all_genres, user_session_dict, testing_items)

In [56]:
topic_weighted_training_data

[(1, 1, 8015),
 (1, 1, 1717),
 (1, 1, 69134),
 (1, 1, 45517),
 (1, 3, 62155),
 (1, 3, 69453),
 (1, 3, 7216),
 (1, 3, 122926),
 (1, 6, 3810),
 (1, 6, 4329),
 (1, 6, 5675),
 (1, 6, 4578),
 (1, 47, 1256),
 (1, 47, 35347),
 (1, 47, 3430),
 (1, 47, 818),
 (1, 50, 2048),
 (1, 50, 91485),
 (1, 50, 2394),
 (1, 50, 117922),
 (1, 70, 1405),
 (1, 70, 141544),
 (1, 70, 113453),
 (1, 70, 170957),
 (1, 101, 5630),
 (1, 101, 166635),
 (1, 101, 52973),
 (1, 101, 4804),
 (1, 110, 1592),
 (1, 110, 3764),
 (1, 110, 108090),
 (1, 110, 88267),
 (1, 151, 59615),
 (1, 151, 106489),
 (1, 151, 65514),
 (1, 151, 143031),
 (1, 157, 1251),
 (1, 157, 59016),
 (1, 157, 3910),
 (1, 157, 106883),
 (1, 163, 5094),
 (1, 163, 848),
 (1, 163, 4459),
 (1, 163, 84637),
 (1, 216, 3406),
 (1, 216, 2260),
 (1, 216, 51927),
 (1, 216, 8487),
 (1, 223, 4874),
 (1, 223, 2741),
 (1, 223, 7767),
 (1, 223, 72601),
 (1, 231, 4774),
 (1, 231, 8965),
 (1, 231, 170827),
 (1, 231, 7820),
 (1, 235, 95473),
 (1, 235, 2153),
 (1, 235, 2531)

In [57]:
#topic_testing_data[1]

In [58]:
temp_list = []
for tp in topic_weighted_training_data:
    temp_list.append((tp[0], item_look_up_table[tp[1]], item_look_up_table[tp[2]]))
topic_weighted_training_data = temp_list

In [59]:
temp_list = []
for re in topic_reverse_weighted_training_data:
    temp_list.append((re[0], item_look_up_table[re[1]], item_look_up_table[re[2]]))
topic_reverse_weighted_training_data = temp_list

In [60]:
temp_dict = {}
for user in topic_testing_data:
    #print(user)
    temp_list = []
    for t in topic_testing_data[user]:
        temp_list.append(item_look_up_table[t])
    temp_dict[user] = temp_list
    temp_list = []

topic_testing_data = temp_dict

In [61]:
temp_dict = {}
for user in topic_reverse_testing_data:
    #print(user)
    temp_list = []
    for t in topic_reverse_testing_data[user]:
        temp_list.append(item_look_up_table[t])
    temp_dict[user] = temp_list
    temp_list = []

topic_reverse_testing_data = temp_dict

In [62]:
#topic_weighted_training_data

In [63]:
with open('./processed_data/ml_latest_'+time+'_weighted_training'+'.pkl', 'wb') as f:
    pk.dump(topic_weighted_training_data, f)

In [64]:
with open('./processed_data/ml_latest_'+time+'_reverse_weighted_training'+'.pkl', 'wb') as f:
    pk.dump(topic_reverse_weighted_training_data, f)

In [65]:
with open('./processed_data/ml_latest_'+time+'_weighted_testing'+'.pkl', 'wb') as f:
    pk.dump(topic_testing_data, f)

with open('./processed_data/ml_latest_'+time+'_reverse_weighted_testing'+'.pkl', 'wb') as f:
    pk.dump(topic_reverse_testing_data, f)
    

In [66]:
new_dict = {}
temp = []
for k, v in genres_item_dict.items():
    new_dict[k] = [item_look_up_table[x] for x in v]

In [67]:
#new_dict

In [68]:
def check_distribution(genres_item_dict, training_data, all_genres):
    len_list = []
    max_num = 0
    user_dis_list = []
    user_genres_list = [0]*len(all_genres)
     
    start = 1
    for train_pair in training_data:
        if train_pair[0] > 610 :
            break
        if train_pair[0] == start:
            max_num += 1
            for g in genres_item_dict:
                if train_pair[2] in genres_item_dict[g]:
                    user_genres_list[all_genres.index(g)] = user_genres_list[all_genres.index(g)] + 1
            if train_pair[0] == 610:
                print(start)
                start += 1
                len_list.append(max_num)
                temp_list = [x/max_num for x in user_genres_list]
                user_dis_list.append(temp_list)
                temp_list = []
                user_genres_list = [0]*len(all_genres)
                max_num = 0
        else:
            print(start, end=", ")
            start += 1
            len_list.append(max_num)
            temp_list = [x/max_num for x in user_genres_list]
            user_dis_list.append(temp_list)
            temp_list = []
            user_genres_list = [0]*len(all_genres)
            max_num = 0
       
    return user_dis_list

In [69]:
random_distribution = check_distribution(new_dict, random_training_data, all_genres)
topic_distribution = check_distribution(new_dict, topic_weighted_training_data, all_genres)

1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222

ZeroDivisionError: division by zero

In [None]:

re_topic_distribution = check_distribution(new_dict, topic_reverse_weighted_training_data, all_genres)

In [None]:
for i in range(len(all_genres)):
    print(all_genres[i], end=',\t\t\t')
    print(user_distribution_list[0][i], end=',\t\t\t')
    print(random_distribution[0][i], end=',\t\t\t')
    print(topic_distribution[0][i])

In [None]:
random_distribution[0]

In [None]:
topic_distribution

In [None]:
sns.heatmap(random_distribution, vmin=0, vmax = 1)

In [None]:
sns.heatmap(topic_distribution, vmin=0, vmax = 1)

In [None]:
sns.heatmap(re_topic_distribution, vmin=0, vmax=1)

## Topic Clustering 

In [None]:
from sklearn.cluster import KMeans

In [None]:
len(random_distribution)

In [None]:
topic_dis_np = np.array(topic_distribution)

In [None]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(topic_dis_np)

In [None]:
kmeans.labels_