In [1]:
import json
from functools import lru_cache

import pandas as pd
import numpy as np
import sklearn as sk
import sklearn.ensemble
from progressbar import ProgressBar

In [2]:
# Load the training likes into a DataFrame (the preview below shows the
# columns user_id, item_id, channel and time).
# NOTE(review): the preview also shows an `Unnamed: 0` column, which suggests
# the CSV was written with its index — confirm whether `index_col=0` is wanted.
likes_df = pd.read_csv('train_likes.csv')

In [3]:
# Index the item records from items.json by their 'id' field.
# The context manager closes the file handle as soon as parsing is done;
# a bare `open(...)` passed straight to json.load is never closed explicitly.
with open('items.json') as items_file:
    items_data = {item['id']: item for item in json.load(items_file)}

In [4]:
# Preview the first rows of the likes table.
likes_df.head()

Unnamed: 0,user_id,item_id,channel,time
0,612d8e8eef05acff3278c061ec10f704,7aa5d00445cb9d61d1739dd0df9a0a88,1f0e3dad99908345f7439f8ffabdffc4,1389734000.0
1,71a7f1d1be96603971ba66e4a17e845c,5edaf734b432e5cc954a10b59cb97e70,ec5decca5ed3d6b8079e2e7e7bacc9f2,1390459000.0
2,6eaa117728d50265e6b2ac24a80e04ae,8ad97d075fce19c2d182eb2a4539aa1c,98f13708210194c475687be6106a3b84,1391064000.0
3,5d9db6ab742755197343505bccfad516,aa5f2ca699da42e467e550f9f071fb3f,98f13708210194c475687be6106a3b84,1391053000.0
4,3000a163610654f1fa181e74136d2d35,8142d0e687c1c7a317ed9673db9f11a4,c74d97b01eae257e44aa9d5bade97baf,1392381000.0


In [5]:
# Inspect one item record (this id is the item_id of row 1 in the preview above).
items_data['5edaf734b432e5cc954a10b59cb97e70']

{'duration': -0.6535735882045764,
 'f_106593': 1,
 'f_114306': 2,
 'f_122038': 1,
 'f_122097': 1,
 'f_125216': 1,
 'f_127794': 1,
 'f_131919': 1,
 'f_137909': 1,
 'f_143336': 1,
 'f_153548': 1,
 'f_161336': 1,
 'f_173035': 1,
 'f_173938': 1,
 'f_174560': 1,
 'f_177744': 1,
 'f_180790': 1,
 'f_183958': 1,
 'f_192481': 1,
 'f_193515': 1,
 'f_196983': 1,
 'f_205162': 1,
 'f_35526': 1,
 'f_38095': 1,
 'f_61462': 1,
 'f_63680': 1,
 'f_64513': 1,
 'f_68894': 1,
 'f_96274': 2,
 'genre': 2,
 'id': '5edaf734b432e5cc954a10b59cb97e70',
 'year': 0.13337536834806843}

-----

In [6]:
# Map every item id to a row position, in the iteration order of items_data.
items_index = {item_id: row for row, item_id in enumerate(items_data)}

In [7]:
# Collect every feature key (spelled 'f_<number>') seen on any item.
# Matching the full 'f_' prefix avoids picking up an unrelated key that merely
# starts with the letter 'f', which would then fail in int(...).
feature_names = {
    key
    for record in items_data.values()
    for key in record
    if key.startswith('f_')
}

# Assign each feature a matrix column, starting at 1 (column 0 is used for the
# genre when the matrix is built). Columns follow the rank of the numeric
# feature id, so they always fit a matrix that is len(features_index) + 1
# wide. When the ids are exactly 0..N-1 this equals the previous `id + 1`
# mapping; when ids have gaps, `id + 1` would index past the last column.
features_index = {
    name: column
    for column, name in enumerate(
        sorted(feature_names, key=lambda name: int(name[2:])),
        start=1,
    )
}

In [8]:
# Distinct user ids that appear in the likes table.
users = likes_df.user_id.unique()

-----

In [9]:
# Dense item-by-feature matrix: one row per item, column 0 for the genre and
# one column per entry of features_index.
n_items = len(items_index)
n_columns = len(features_index) + 1
items_matrix = np.zeros((n_items, n_columns))

for item, item_index in items_index.items():
    data = items_data[item]

    items_matrix[item_index, 0] = data['genre']

    # Features are stored as presence flags: the cell is set to 1 whatever
    # value the record carries for that feature.
    feature_columns = [features_index[key] for key in data if key[0] == 'f']
    if feature_columns:
        items_matrix[item_index, feature_columns] = 1

In [10]:
# Shape check: (number of items, number of features + 1 genre column).
items_matrix.shape

(138078, 213503)

In [11]:
# Wrap the matrix in a DataFrame so that rows and columns can be dropped by
# label; with no index/columns given, labels are the integer positions.
items_matrix_df = pd.DataFrame(items_matrix)

In [12]:
# Row positions of items that received fewer likes than the threshold.
items_unpopularity_threshold = 100

item_like_counts = likes_df.item_id.value_counts()
rarely_liked_ids = item_like_counts[item_like_counts < items_unpopularity_threshold].index

# Liked ids that have no entry in items_index are skipped.
# NOTE(review): items that were never liked do not appear in value_counts at
# all, so they are not counted as unpopular here — confirm this is intended.
unpopular_items = [
    items_index[item_id] for item_id in rarely_liked_ids if item_id in items_index
]
len(unpopular_items)

11646

In [None]:
# DataFrame.drop returns a new frame; without assigning the result no row is
# actually removed. errors='ignore' keeps the cell re-runnable once the rows
# are already gone.
items_matrix_df = items_matrix_df.drop(unpopular_items, axis=0, errors='ignore')

# Removing rows shifts row positions, while items_index still holds the
# pre-drop positions (and its length is used to size the label vector).
# Rebuild it for the surviving rows only: the remaining index labels are the
# original positions, which follow the key order of items_data.
# After this cell, recompute `unpopular_items` only from a freshly built
# items_index.
original_item_ids = list(items_data.keys())
items_index = {
    original_item_ids[row_label]: row_position
    for row_position, row_label in enumerate(items_matrix_df.index)
}

# Show the resulting size rather than rendering the whole frame.
items_matrix_df.shape

In [None]:
# Drop feature columns that are set on fewer items than the threshold.
features_unpopularity_threshold = 100

# Sum all columns in one pass instead of one pandas lookup per feature, then
# keep only the sums of feature columns (column 0 holds the genre and is not
# a feature). Restricting to columns that still exist means re-running this
# cell after the drop does not raise KeyError.
column_sums = items_matrix_df.sum(axis=0)
feature_sums = column_sums[column_sums.index.isin(list(features_index.values()))]
unpopular_features = feature_sums[
    feature_sums < features_unpopularity_threshold
].index.tolist()

# Reassign instead of mutating in place so the cell is idempotent.
items_matrix_df = items_matrix_df.drop(unpopular_features, axis=1)

In [None]:
# Back to a plain ndarray; get_train_data returns this array as X_batch.
items_matrix = items_matrix_df.values

In [None]:
# Shape of the matrix after the filtering cells above.
items_matrix.shape

-----

In [15]:
def get_train_data(user):
    """Build the training set for one user over the whole item catalogue.

    Args:
        user: value compared against ``likes_df.user_id``.

    Returns:
        ``(X_batch, y_batch)``. ``X_batch`` is the module-level
        ``items_matrix`` itself (not a copy). ``y_batch`` is a float vector of
        length ``len(items_index)`` with 1 at the position of every item the
        user liked and 0 elsewhere. Liked items that are missing from
        ``items_index`` are ignored.
    """
    user_likes = likes_df.loc[likes_df.user_id == user, 'item_id']

    y_batch = np.zeros(len(items_index))
    liked_positions = [
        items_index[item_id]
        for item_id in set(user_likes)
        if item_id in items_index
    ]
    if liked_positions:
        y_batch[liked_positions] = 1

    return items_matrix, y_batch

In [17]:
def create_predictor(user, random_state=0):
    """Fit a random-forest classifier on one user's likes.

    Args:
        user: user id passed to ``get_train_data``.
        random_state: seed forwarded to ``RandomForestClassifier`` so that
            fitting the same data twice gives the same forest. Pass ``None``
            for the previous unseeded behaviour.

    Returns:
        The fitted ``sklearn.ensemble.RandomForestClassifier``.
    """
    X_batch, y_batch = get_train_data(user)

    # An unseeded forest differs from run to run, which makes results
    # impossible to reproduce after a kernel restart.
    predictor = sk.ensemble.RandomForestClassifier(random_state=random_state)
    predictor.fit(X_batch, y_batch)

    return predictor

-----

In [None]:
# Train one predictor per user. The dict is filled incrementally, so models
# fitted before an interruption remain available in users_predictors.
users_predictors = {}
progress = ProgressBar()
for user_id in progress(users):
    users_predictors[user_id] = create_predictor(user_id)

  0% (    0 of 55863) |                  | Elapsed Time: 0:00:00 ETA:  --:--:--

1
2
