In [14]:
import utils
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [2]:
# Load the training split for the "Rating" task through the project helper.
# X_train_full is indexed by its 'user' / 'item' columns below; y_train_full
# supplies the value stored for each (user, item) pair.
X_train_full, y_train_full = utils.load_train_data("Rating")

In [3]:
# Load the test split; later it is iterated as (user, item) pairs that each
# need a predicted rating.
X_test = utils.load_test_data("Rating")

In [4]:
# Distinct user / item ids in order of first appearance.
# Series.unique() preserves that order, so the index maps are identical on
# every run; iterating a set() of strings can change order between kernel
# restarts (hash randomization), which made matrix rows/columns unstable.
# Assumes X_train_full['user'] / ['item'] are pandas Series — TODO confirm.
users = X_train_full['user'].unique().tolist()
items = X_train_full['item'].unique().tolist()

# id -> row index (users) / column index (items) of the rating matrix.
u_map = {user_id: idx for idx, user_id in enumerate(users)}
i_map = {item_id: idx for idx, item_id in enumerate(items)}

In [5]:
# Dense matrix with one row per user and one column per item; any cell that
# never receives a rating stays 0.0.
train_data_matrix = np.zeros((len(users), len(items)))

In [6]:
# Pair every feature row (as a plain list) with its target value.
train = list(zip(X_train_full.values.tolist(), y_train_full))

In [7]:
# Write each observed rating into its (user row, item column) cell.
# x[0] is looked up as a user id and x[1] as an item id; if a pair repeats,
# the last value seen wins.
for x, y in train:
    row_idx, col_idx = u_map[x[0]], i_map[x[1]]
    train_data_matrix[row_idx, col_idx] = y

In [8]:
# Item-item cosine *distance* (not similarity, despite the name): the matrix is
# transposed so each row handed to pairwise_distances is one item's column of
# ratings. It is converted with `1 - item_similarity` where predict() is called.
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [9]:
def predict(ratings, similarity, type='user'):
    """Score every (user, item) cell by a similarity-weighted average.

    Parameters
    ----------
    ratings : 2-D numpy array
        One row per user, one column per item.
    similarity : 2-D numpy array
        Square weight matrix: user x user when ``type='user'``,
        item x item when ``type='item'``.
    type : {'user', 'item'}
        Which neighbourhood the weights describe.

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'`` (previously this
        surfaced as an UnboundLocalError on ``pred``).

    Notes
    -----
    A row (user mode) or column (item mode) of ``similarity`` whose absolute
    values sum to zero produces a division by zero, as before.
    NOTE(review): ``ratings.mean(axis=1)`` averages over every cell, so if
    unrated cells are stored as 0 they pull the user mean toward 0.
    """
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-user mean into a column so it broadcasts
        # across that user's row.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        # Row u of `similarity` holds the weights used for user u, so the
        # normaliser is the row sum, shaped as a column.
        weight_totals = np.array([np.abs(similarity).sum(axis=1)]).T
        return mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # ratings.dot(similarity)[u, j] sums ratings[u, i] * similarity[i, j]
        # over i, i.e. it uses column j of `similarity`. The normaliser must
        # therefore be the column sum (axis=0); the former axis=1 was only
        # correct for symmetric matrices.
        weight_totals = np.array([np.abs(similarity).sum(axis=0)])
        return ratings.dot(similarity) / weight_totals
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

In [28]:
# item_similarity holds cosine distances, so 1 - distance recovers the cosine
# similarity that predict() uses as weights.
item_prediction = predict(train_data_matrix, 1-item_similarity, type='item')

In [27]:
# Inspect the result. NOTE(review): the saved output shows scores around 1e-3,
# far below the 4.18703 fallback rating used later. Unrated cells are stored
# as 0 in train_data_matrix and enter the weighted average, which presumably
# drags every score toward 0 — confirm before submitting these values.
item_prediction

array([[ 0.0034922 ,  0.00352571,  0.0035263 , ...,  0.00352031,
         0.00351899,  0.00351932],
       [ 0.01185713,  0.01189338,  0.01182512, ...,  0.01187518,
         0.01187074,  0.01184807],
       [ 0.00070676,  0.00070514,  0.00070526, ...,  0.00070406,
         0.0007038 ,  0.00070386],
       ..., 
       [ 0.00117793,  0.00117524,  0.00117543, ...,  0.00117344,
         0.001173  ,  0.00117311],
       [ 0.00080099,  0.00079916,  0.0007993 , ...,  0.00079794,
         0.00079764,  0.00079771],
       [ 0.00070676,  0.00070514,  0.00070526, ...,  0.00070406,
         0.0007038 ,  0.00070386]])

In [18]:
from collections import defaultdict
def get_user_rating(users, path="/Users/ntemiyasathit/Documents/CSE258/258_kaggle/data/train.json.gz"):
    """Collect the training ratings of the given users and their averages.

    Parameters
    ----------
    users : iterable
        User ids to keep; records of any other user are ignored.
    path : str, optional
        Gzipped training file handed to ``utils.read_gz``. Defaults to the
        location that used to be hard-coded, so existing calls are unchanged;
        pass another path to run on a different machine.

    Returns
    -------
    (user_average, user_ratings)
        ``user_average`` maps each user id that has at least one record to the
        mean of its ratings; ``user_ratings`` maps the same ids to the list of
        ratings in file order. Users without any record appear in neither.
    """
    wanted = set(users)
    user_ratings = defaultdict(list)

    # Each record is read only for its 'userID' and 'rating' fields.
    for record in utils.read_gz(path):
        user = record['userID']
        if user in wanted:
            user_ratings[user].append(record['rating'])

    # float() keeps this a true division even if the kernel is Python 2 and
    # the ratings are ints.
    user_average = {
        user: float(sum(values)) / len(values)
        for user, values in user_ratings.items()
    }

    return user_average, user_ratings

In [19]:
# Only the per-user averages are kept; the raw rating lists are discarded.
user_average, _ = get_user_rating(users)

In [21]:
import random

# One prediction per (user, item) test pair, in test order.
pred = []
for u, i in X_test.values.tolist():
    if u not in u_map or i not in i_map:
        # User or item missing from the index maps: fall back to the user's
        # average from user_average, else to the constant 4.18703
        # (presumably the global mean rating — TODO confirm).
        pred.append(user_average.get(u, 4.18703))
        continue
    index_u, index_i = u_map[u], i_map[i]
    estimated = item_prediction[index_u][index_i]
    pred.append(estimated)
        

In [22]:
# Inspect the predictions. NOTE(review): this echoes the entire list; a slice
# such as pred[:10] would keep the saved output short.
pred

[0.0059631789372602264,
 0.0022605861182337533,
 0.0002128499632175893,
 0.010072479225851671,
 0.017450332446914744,
 0.0075764639393574315,
 0.00086947394092067238,
 0.0013187474180821142,
 0.0013604005226833199,
 0.019967580297756075,
 0.0041458368798938184,
 0.0050067404030673477,
 0.001149697310845961,
 0.0058118521835510478,
 0.0013529765554354024,
 0.014383165710817261,
 0.0038514656778941658,
 0.0010800275387669214,
 0.044050107249809949,
 0.0078209717298427395,
 0.014933128787838791,
 0.016891229031689398,
 0.0009407506543458998,
 0.0046756388445887186,
 0.00052478444693060183,
 0.012577720931246159,
 0.013448556563650465,
 0.00064079068621690407,
 0.047216795130678543,
 0.031003317058247746,
 0.00046917564004686399,
 0.020143751664900299,
 0.0028663980782270481,
 0.0012621139064427733,
 0.011663284813879382,
 0.0028847002205135436,
 0.0010674885267730866,
 0.0010330893929033831,
 0.0055128033096874512,
 0.00032589960687528521,
 0.0015248764127296806,
 0.001421528784387162,
 0

In [63]:
# Hand the predictions to the project's submission helper (the cell echoes True).
# NOTE(review): the label says "user based", but in the visible cells `pred`
# is filled from item_prediction — confirm which model this run submitted.
utils.create_submission(pred, "user based cosine sim",X_test)

True

In [57]:
# User-user cosine similarity (each row of train_data_matrix is one user).
# NOTE(review): per the scikit-learn docs, dense_output=False only yields a
# sparse result when the inputs are sparse; train_data_matrix is a dense
# ndarray, so this presumably returns a dense array — confirm.
user_similarity = cosine_similarity(train_data_matrix, dense_output=False)

In [58]:
# Sanity check: one row and one column per user.
user_similarity.shape

(18793, 18793)

In [59]:
# Similarity-weighted sum of all users' ratings for every (user, item) cell.
# NOTE(review): unlike predict(..., type='user') this is neither mean-centred
# nor divided by the summed similarities, so the scores are not on the rating scale.
user_prediction = user_similarity.dot(train_data_matrix)

In [60]:
# Inspect the raw user-based scores.
user_prediction

array([[ 0.        ,  0.02985407,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.04667497,  0.10117215,  0.        , ...,  0.        ,
         0.        ,  0.03571429],
       ..., 
       [ 0.        ,  0.01461763,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.07003635,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [4]:
cd ..

/Users/ntemiyasathit/Documents/CSE258/258_kaggle


In [5]:
from custom_estimators.multi_labels import MultiLabelBinarizer

In [6]:
# Instantiate the project's MultiLabelBinarizer with its default arguments.
mlb = MultiLabelBinarizer()

In [7]:
# Toy label sets that together cover the classes 'a', 'b' and 'c'.
y_train = [['a'],['a', 'b'], ['a', 'b', 'c']]
# Fit on the toy labels; the cell echoes the estimator returned by fit().
mlb.fit(y_train)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [10]:
# 'd' was never seen during fit; the saved cell output shows it is dropped
# from the encoding without raising an error.
y_test = [['a'],['a','d'],['b']]
mlb.transform(y_test)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]])