In [39]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from custom_estimators import preprocessors
from custom_estimators.multi_labels import MultiLabelBinarizer
import utils
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [207]:
X_train_full, y_train_full = utils.load_train_data("Visit")

In [3]:
users = list(set(X_train_full['user'].tolist()))
items = list(set(X_train_full['item'].tolist()))
u_map = dict(zip(users, range(0,len(users))))
i_map = dict(zip(items, range(0,len(items))))

In [211]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full.values.tolist(), y_train_full, test_size=0.2, random_state=12345)

In [5]:
train_data_matrix = np.zeros((len(users), len(items)))

In [6]:
train = list(zip(X_train, y_train))
val = list(zip(X_val, y_val))

In [7]:
for x, y in train:
    train_data_matrix[u_map[x[0]], i_map[x[1]]] = y

In [8]:
test_data_matrix = np.zeros((len(users), len(items)))
for x, y in val:
    test_data_matrix[u_map[x[0]], i_map[x[1]]] = y

In [9]:
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [217]:
item_similarity

array([[ 0.,  1.,  1., ...,  1.,  1.,  1.],
       [ 1.,  0.,  1., ...,  1.,  1.,  1.],
       [ 1.,  1.,  0., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  1., ...,  0.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  1.],
       [ 1.,  1.,  1., ...,  1.,  1.,  0.]])

In [11]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [12]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [14]:
item_prediction.shape

(18793, 21321)

In [216]:
len(item_prediction[item_prediction==0])

1301140

In [21]:
np.median(item_prediction)

0.00023464086347162454

In [212]:
X_val, y_val = utils.sample_negatives(pd.DataFrame(X_val, columns=['user', 'item']), y_val, 1)

In [213]:
X_val_list = X_val.values.tolist()

In [69]:
median = 0.0003064086347162454

In [214]:
pred = []
for u,i in X_val_list:
    index_u = u_map[u]
    index_i = i_map[i]
    estimated = item_prediction[index_u][index_i]
    if estimated > 0:
        pred += [1]
    else:
        pred += [-1]

In [215]:
accuracy_score(y_val, pred)

0.49988749999999998

In [72]:
confusion_matrix(y_val, pred)

array([[14085, 25915],
       [14340, 25660]])

In [195]:
X_train, y_train = utils.sample_negatives(pd.DataFrame(X_train, columns=['user', 'item']), y_train, 1)

In [141]:
y_train = [0 if i == -1 else 1 for i in y_train]

In [83]:
X_train_list = X_train.values.tolist()

In [85]:
est = []
for u,i in X_train_list:
    index_u = u_map[u]
    index_i = i_map[i]
    est += [item_prediction[index_u][index_i]]

In [87]:
X_train['est'] = pd.Series(est)

In [171]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import FunctionTransformer
from custom_estimators.multi_labels import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from custom_estimators.preprocessors import enrich_category, to_user_item_matrix
from fastFM import als, sgd

In [94]:
pipeline1 = Pipeline([('categorize', FunctionTransformer(enrich_category, validate=False)),
                            ('mapper', DataFrameMapper([('user', LabelBinarizer(sparse_output=True)),
                                                        ('item', LabelBinarizer(sparse_output=True)),
                                                        ('est', None),
                                                        ('category', MultiLabelBinarizer(sparse_output=True))],
                                                       input_df=True, sparse=True)),
                            ('lgst', RidgeClassifier())])

In [95]:
pipeline1.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('categorize', FunctionTransformer(accept_sparse=False,
          func=<function enrich_category at 0x1a0f366730>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('mapper', DataFrameMapper(default=False, df_out=False,
        feat...True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [101]:
est2 = []
for u,i in X_val_list:
    index_u = u_map[u]
    index_i = i_map[i]
    est2 += [item_prediction[index_u][index_i]]

In [102]:
X_val['est'] = pd.Series(est2)

In [104]:
train_pred = pipeline1.predict(X_train)

In [105]:
accuracy_score(y_train, train_pred)

0.77766250000000003

In [106]:
y_val = [0 if i == -1 else 1 for i in y_val]

In [108]:
val_pred = pipeline1.predict(X_val)

In [109]:
accuracy_score(y_val, val_pred)

0.40029999999999999

In [110]:
pipeline2 = Pipeline([('mapper', DataFrameMapper([('user', LabelBinarizer(sparse_output=True)),
                                                ('item', LabelBinarizer(sparse_output=True)),
                                                ('est', None)],
                                                       input_df=True, sparse=True)),
                            ('lgst', RidgeClassifier())])

In [112]:
pipeline2.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('user', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)), ('item', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)), ('est', None)],
        input_df=True, sparse=True)), ('lgst', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [113]:
train_pred2 = pipeline2.predict(X_train)

In [114]:
accuracy_score(y_train, train_pred2)

0.77767187500000001

In [115]:
val_pred2 = pipeline2.predict(X_val)

In [116]:
accuracy_score(y_val, val_pred2)

0.39996874999999998

In [117]:
X_raing, y_rating = utils.load_train_data("Rating")

In [118]:
X_rating = X_raing.values.tolist()

In [121]:
X_rating = [(i[0], i[1]) for i in X_rating]

In [123]:
rating_map = dict(zip(X_rating, y_rating))

In [129]:
avg_rating = np.mean(list(rating_map.values()))

In [149]:
def enrich_rating(X):
    rating = []
    xlist = X.values.tolist()
    for u,i in xlist:
        if (u,i) in rating_map: 
            rating += [rating_map[(u,i)]]
        else:
            rating += [avg_rating]
    X['rating'] = pd.Series(rating)
    return X

In [151]:
def enrich_rating2(X):
    rating = []
    xlist = X.values.tolist()
    for u,i in xlist:
        if (u,i) in rating_map: 
            rating += [1]
        else:
            rating += [0]
    return rating

In [152]:
X_test = utils.load_test_data("Visit")

In [154]:
in_train = enrich_rating2(X_test)

In [159]:
pipeline3 = Pipeline([('mapper', DataFrameMapper([('user', LabelBinarizer(sparse_output=True)),
                                                ('item', LabelBinarizer(sparse_output=True)),
                                                ('rating', None)],
                                                       input_df=True, sparse=True)),
                            ('lgst', RidgeClassifier())])

In [197]:
X_train = enrich_rating(X_train)

In [164]:
pipeline3.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('user', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)), ('item', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True)), ('rating', None)],
        input_df=True, sparse=True)), ('lgst', RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001))])

In [165]:
train_pred3 = pipeline3.predict(X_train)

In [166]:
accuracy_score(train_pred3, y_train)

0.77429375

In [175]:
pipeline4 = Pipeline([('categorize', FunctionTransformer(enrich_category, validate=False)),
                            ('mapper', DataFrameMapper([('user', LabelBinarizer(sparse_output=True)),
                                                        ('item', LabelBinarizer(sparse_output=True)),
                                                        ('rating', None),
                                                        ('category', MultiLabelBinarizer(sparse_output=True))],
                                                       input_df=True, sparse=True)),
                            ('als', als.FMClassification(n_iter=100, rank=3))])

In [199]:
pipeline4.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('categorize', FunctionTransformer(accept_sparse=False,
          func=<function enrich_category at 0x1a0f366730>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
          pass_y='deprecated', validate=False)), ('mapper', DataFrameMapper(default=False, df_out=False,
        feat...stdev=0.1, l2_reg=None, l2_reg_V=0.1, l2_reg_w=0.1,
         n_iter=100, random_state=123, rank=3))])

In [202]:
train_pred4 = pipeline4.predict(X_train)

In [203]:
accuracy_score(train_pred4, y_train)

0.94086250000000005

In [204]:
val_pred4 = pipeline4.predict(enrich_rating(X_val))

In [205]:
accuracy_score(val_pred4, y_val)

0.654725