In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the training ratings; each row is one (userId, movieId, rating) record.
ratings_path = 'data/training_ratings.csv'
ratings = pd.read_csv(ratings_path)

In [3]:
# Peek at the first five rows to confirm the expected columns were loaded.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1129,2.0
2,1,1172,4.0
3,1,1263,2.0
4,1,1287,2.0


In [4]:
# Load the tag-genome scores (movieId, tagId, relevance) used below as
# per-movie feature vectors.
genome_path = 'data/190mb/genome-scores.csv'
genome = pd.read_csv(genome_path)

In [5]:
# Reshape the long ratings table into a wide matrix: one row per user,
# one column per movie, cells holding the rating (NaN where unrated).
user_ratings = ratings.pivot(
    values='rating',
    index='userId',
    columns='movieId',
)

In [6]:
# Turn the userId index into an ordinary column so rows can be filtered by it.
user_ratings = user_ratings.reset_index(drop=False)

In [8]:
# (number of users, number of columns) of the wide ratings table.
(len(user_ratings.index), len(user_ratings.columns))

(671, 7964)

In [18]:
# Percentage of empty cells in the wide ratings table. Every column of
# `user_ratings` is counted, so a userId column (if the index was already
# reset) is part of the denominator.
missing_cells = user_ratings.isnull().values.sum()
missing_cells / user_ratings.size * 100

98.679396329683271

In [7]:
# Show the first two users' rows of the wide table.
user_ratings.head(2)

movieId,userId,1,2,3,4,5,6,7,8,9,...,160590,160656,160718,161155,161594,161830,161918,162376,162672,163949
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,,,,,,,


In [14]:
def get_matrix_features_for_movies(list_of_movie_ids, genome_scores=None):
    """Build a movie x tag relevance matrix for the requested movies.

    Parameters
    ----------
    list_of_movie_ids : iterable
        Movie ids to look up; duplicates are ignored.
    genome_scores : pandas.DataFrame, optional
        Long-format frame with ``movieId``, ``tagId`` and ``relevance``
        columns. When omitted, the notebook-level ``genome`` frame is used,
        which keeps existing single-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``movieId`` with one column per ``tagId`` and the
        ``relevance`` as values. Requested movies that have no rows in the
        scores frame are simply absent from the result.
    """
    if genome_scores is None:
        genome_scores = genome
    wanted_ids = list(set(list_of_movie_ids))
    # Vectorised membership test; avoids calling a Python lambda once per
    # row of the (large) genome table.
    selected = genome_scores[genome_scores['movieId'].isin(wanted_ids)]
    return selected.pivot(index='movieId', columns='tagId', values='relevance')

In [20]:
%time  mv_id = get_matrix_features_for_movies([548, 4692, 6734, 7499, 7847])

CPU times: user 2.87 s, sys: 214 ms, total: 3.09 s
Wall time: 3.08 s


In [75]:
# Which of the requested movie ids actually came back with genome scores?
# (The original `set []` was not valid Python; the operand is the id list
# that was passed to get_matrix_features_for_movies for `mv_id`.)
requested_movie_ids = {548, 4692, 6734, 7499, 7847}
set(mv_id.index) & requested_movie_ids

{548, 4692, 6734, 7847}

In [45]:
# Isolate the single row of ratings for user 1, without the id column,
# so only movieId columns remain.
is_target_user = user_ratings['userId'] == 1
user = user_ratings.loc[is_target_user].drop(labels='userId', axis='columns')

In [55]:
from collections import OrderedDict

In [56]:
# Ordered mapping movieId -> rating for the movies this user actually rated
# (null cells are skipped).
movie_ratings_per_user_dict = OrderedDict(
    (movie_id, rating)
    for movie_id, rating in user.iloc[0].items()
    if not pd.isnull(rating)
)

In [62]:
# Ids of the rated movies, in the same order as the dict.
movie_ids = list(movie_ratings_per_user_dict)

In [76]:
# Tag-relevance feature vectors for the movies the user has rated.
mv_features = get_matrix_features_for_movies(list_of_movie_ids=movie_ids)

In [104]:
# Expose movieId as a regular column so it can serve as the merge key.
mv_features = mv_features.reset_index(drop=False)

In [286]:
# Two-column frame (movieId, rating) built from the user's rating dict:
# dict keys become rows, then the index is moved into a column and both
# columns are given their final names.
rated_by_movie = pd.DataFrame.from_dict(movie_ratings_per_user_dict, orient='index')
ratings_df = rated_by_movie.reset_index()
ratings_df.columns = ['movieId', 'rating']

In [287]:
# Training table: tag-relevance features joined with the user's rating for
# each movie. Only movies present in both frames are kept, and the join key
# is dropped afterwards so only features and the rating remain.
data = (
    mv_features
    .merge(ratings_df, on='movieId', how='inner')
    .drop(labels='movieId', axis='columns')
)

In [288]:
# Preview the first five training rows (tag features plus rating).
data.head(n=5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1120,1121,1122,1123,1124,1125,1126,1127,1128,rating
0,0.041,0.05125,0.0205,0.033,0.09225,0.11125,0.027,0.113,0.07525,0.01725,...,0.02025,0.013,0.037,0.183,0.06975,0.017,0.01225,0.08025,0.01675,2.5
1,0.03675,0.03725,0.0235,0.0575,0.11,0.115,0.0345,0.206,0.72675,0.0295,...,0.023,0.0245,0.08725,0.1705,0.05425,0.028,0.01475,0.119,0.02425,2.0
2,0.02425,0.02225,0.11475,0.15125,0.1965,0.185,0.214,0.29425,0.185,0.09025,...,0.04475,0.0965,0.096,0.13725,0.07225,0.105,0.134,0.07475,0.021,4.0
3,0.02125,0.0175,0.03075,0.08625,0.13325,0.1125,0.1585,0.3975,0.14275,0.05975,...,0.215,0.056,0.0745,0.18325,0.0395,0.028,0.14475,0.07975,0.0205,2.0
4,0.03825,0.03825,0.233,0.29875,0.40175,0.3405,0.1355,0.2815,0.13325,0.33175,...,0.07575,0.0895,0.04375,0.128,0.0535,0.1005,0.081,0.065,0.01825,2.0


In [289]:
# Treat each distinct rating value as a class label by storing it as a string.
data['target'] = data['rating'].map(str)

In [290]:
from sklearn.naive_bayes import GaussianNB

In [291]:
# Gaussian naive Bayes classifier with no explicit class priors.
gnb = GaussianNB(priors=None)

In [292]:
# Fit on the tag-relevance columns only; both the numeric rating and its
# string label are excluded from the feature matrix.
feature_matrix = data.drop(['rating', 'target'], axis=1).values
class_labels = data['target'].values
gnb.fit(feature_matrix, class_labels)

GaussianNB(priors=None)

In [293]:
# Long-format view of the user's row, keeping only movies with no rating.
unrated = user.melt().loc[lambda frame: frame['value'].isnull()]

In [294]:
# Ids of the movies we need predictions for.
movie_ids_for_preds = unrated['movieId'].tolist()

In [295]:
# Tag-relevance feature vectors for the movies the user has not rated.
mv_features_preds = get_matrix_features_for_movies(list_of_movie_ids=movie_ids_for_preds)

In [296]:
# Expose movieId as a regular column so predictions can be joined back on it.
mv_features_preds = mv_features_preds.reset_index(drop=False)

In [297]:
# Display the feature table for the movies this user has not rated yet.
mv_features_preds

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128
0,1,0.02500,0.02500,0.05775,0.09675,0.14675,0.21700,0.06700,0.26275,0.26200,...,0.03950,0.01800,0.04575,0.03275,0.12500,0.04150,0.01925,0.03625,0.07775,0.02300
1,2,0.03975,0.04375,0.03775,0.04800,0.11025,0.07250,0.04775,0.10975,0.09925,...,0.04175,0.01925,0.01725,0.02425,0.12550,0.02250,0.01550,0.01475,0.09025,0.01875
2,3,0.04350,0.05475,0.02800,0.07700,0.05400,0.06850,0.05600,0.18500,0.04925,...,0.04150,0.02675,0.02775,0.03425,0.15550,0.03675,0.01700,0.01950,0.09700,0.01850
3,4,0.03725,0.03950,0.03675,0.03100,0.06825,0.04050,0.02325,0.08700,0.05125,...,0.05750,0.03375,0.02275,0.03975,0.18525,0.05925,0.01500,0.01525,0.06450,0.01300
4,5,0.04200,0.05275,0.05925,0.03675,0.07525,0.12525,0.02850,0.08500,0.02950,...,0.04250,0.02825,0.02150,0.02600,0.14275,0.02075,0.01650,0.01675,0.10750,0.01825
5,6,0.02825,0.02550,0.01850,0.04550,0.09575,0.05500,0.04400,0.24200,0.12850,...,0.04900,0.01825,0.02075,0.06000,0.29975,0.15525,0.03525,0.01950,0.06650,0.01900
6,7,0.04575,0.05275,0.16675,0.08275,0.11450,0.15625,0.05025,0.11175,0.03950,...,0.03750,0.02825,0.01200,0.03575,0.13000,0.04875,0.01975,0.01050,0.10925,0.01850
7,8,0.03075,0.03550,0.04675,0.02175,0.05600,0.03650,0.01675,0.07325,0.02950,...,0.03700,0.01925,0.01625,0.02325,0.20975,0.02825,0.01675,0.01125,0.07000,0.01500
8,9,0.03500,0.04050,0.01825,0.01800,0.03650,0.01750,0.01300,0.04225,0.01675,...,0.02225,0.01075,0.01175,0.01525,0.14100,0.02225,0.01100,0.00700,0.07275,0.01550
9,10,0.99975,0.99975,0.01950,0.03675,0.06675,0.05450,0.04550,0.12950,0.08550,...,0.46750,0.02325,0.02150,0.03125,0.18400,0.03750,0.01775,0.01775,0.07300,0.01825


In [298]:
# Predict a rating class for every unrated movie.
# Only tag-relevance columns may reach the model: besides the movieId key,
# the output columns this notebook adds ('target', 'pseudo_ratings') are
# dropped too, so re-running the cell does not feed earlier predictions back
# in as features. errors='ignore' covers the first run, when they are absent.
non_feature_columns = ['movieId', 'target', 'pseudo_ratings']
prediction_features = mv_features_preds.drop(non_feature_columns, axis=1, errors='ignore').values
mv_features_preds['target'] = gnb.predict(prediction_features)

In [205]:
def apply_pseudo_ratings(x):
    """Map a binary class label to a pseudo rating.

    Parameters
    ----------
    x : scalar
        Class label; compared with ``==`` against 1 and 0.

    Returns
    -------
    int or float
        3 when ``x == 1``, 1 when ``x == 0``, and NaN for anything else.
    """
    if x == 1:
        return 3
    if x == 0:
        return 1
    # np.nan is the canonical spelling; the upper-case alias is not
    # available in current NumPy releases.
    return np.nan

In [309]:
# The predicted class labels are rating values stored as strings; convert
# them back to floats so they can stand in as pseudo ratings.
mv_features_preds['pseudo_ratings'] = mv_features_preds['target'].astype(float)

In [310]:
# Display the features together with the predicted 'target' label and the
# numeric 'pseudo_ratings' derived from it.
mv_features_preds

tagId,movieId,1,2,3,4,5,6,7,8,9,...,1121,1122,1123,1124,1125,1126,1127,1128,target,pseudo_ratings
0,1,0.02500,0.02500,0.05775,0.09675,0.14675,0.21700,0.06700,0.26275,0.26200,...,0.04575,0.03275,0.12500,0.04150,0.01925,0.03625,0.07775,0.02300,2.0,2.0
1,2,0.03975,0.04375,0.03775,0.04800,0.11025,0.07250,0.04775,0.10975,0.09925,...,0.01725,0.02425,0.12550,0.02250,0.01550,0.01475,0.09025,0.01875,2.0,2.0
2,3,0.04350,0.05475,0.02800,0.07700,0.05400,0.06850,0.05600,0.18500,0.04925,...,0.02775,0.03425,0.15550,0.03675,0.01700,0.01950,0.09700,0.01850,2.0,2.0
3,4,0.03725,0.03950,0.03675,0.03100,0.06825,0.04050,0.02325,0.08700,0.05125,...,0.02275,0.03975,0.18525,0.05925,0.01500,0.01525,0.06450,0.01300,2.5,2.5
4,5,0.04200,0.05275,0.05925,0.03675,0.07525,0.12525,0.02850,0.08500,0.02950,...,0.02150,0.02600,0.14275,0.02075,0.01650,0.01675,0.10750,0.01825,2.0,2.0
5,6,0.02825,0.02550,0.01850,0.04550,0.09575,0.05500,0.04400,0.24200,0.12850,...,0.02075,0.06000,0.29975,0.15525,0.03525,0.01950,0.06650,0.01900,2.0,2.0
6,7,0.04575,0.05275,0.16675,0.08275,0.11450,0.15625,0.05025,0.11175,0.03950,...,0.01200,0.03575,0.13000,0.04875,0.01975,0.01050,0.10925,0.01850,4.0,4.0
7,8,0.03075,0.03550,0.04675,0.02175,0.05600,0.03650,0.01675,0.07325,0.02950,...,0.01625,0.02325,0.20975,0.02825,0.01675,0.01125,0.07000,0.01500,2.0,2.0
8,9,0.03500,0.04050,0.01825,0.01800,0.03650,0.01750,0.01300,0.04225,0.01675,...,0.01175,0.01525,0.14100,0.02225,0.01100,0.00700,0.07275,0.01550,2.0,2.0
9,10,0.99975,0.99975,0.01950,0.03675,0.06675,0.05450,0.04550,0.12950,0.08550,...,0.02150,0.03125,0.18400,0.03750,0.01775,0.01775,0.07300,0.01825,2.5,2.5


In [320]:
# Long-format copy of the user's row: one (movieId, ratings) pair per movie,
# with NaN for unrated movies.
original_ratings = pd.melt(user, value_name='ratings')

In [321]:
# Attach the predictions to every movie; the left join keeps movies that have
# no prediction row (their prediction columns are NaN).
new_ratings = original_ratings.merge(mv_features_preds, on='movieId', how='left')

In [322]:
# Keep only the columns needed to reconcile real and predicted ratings.
# .copy() makes the result an independent frame, so the column assignments
# performed on it in later cells do not raise SettingWithCopyWarning.
new_ratings = new_ratings[['movieId', 'ratings', 'pseudo_ratings']].copy()

In [323]:
def apply_new_ratings(x):
    """Return the real rating when present, otherwise the fallback rating.

    Parameters
    ----------
    x : pandas.Series or sequence
        Two values in order: the real rating first, the fallback second.

    Returns
    -------
    scalar
        The first value unless it is null, in which case the second value.
    """
    # Rows handed over by DataFrame.apply(axis=1) are Series labelled by
    # column name; integer keys on those are deprecated positional lookups,
    # so read by position explicitly. Plain sequences keep using [] indexing.
    if hasattr(x, 'iloc'):
        actual, fallback = x.iloc[0], x.iloc[1]
    else:
        actual, fallback = x[0], x[1]
    return fallback if pd.isnull(actual) else actual

In [324]:
# Prefer the user's real rating and fall back to the model's pseudo rating
# where no real rating exists. fillna does this column-wise instead of
# invoking a Python function once per row.
new_ratings['ratings'] = new_ratings['ratings'].fillna(new_ratings['pseudo_ratings'])

In [325]:
# Tag every row with the id of the user these ratings belong to (the same
# id, 1, that was used to select the `user` row).
new_ratings['userId'] = 1

In [326]:
# Final column layout: drop the helper prediction column and put userId first.
final_columns = ['userId', 'movieId', 'ratings']
new_ratings = new_ratings.loc[:, final_columns]

In [327]:
# Replace the index with a fresh 0..n-1 range, discarding the old one.
new_ratings = new_ratings.reset_index(drop=True)

In [334]:
# Display the completed ratings table for this user: real ratings where they
# exist, pseudo ratings elsewhere.
new_ratings

Unnamed: 0,userId,movieId,ratings
0,1,1,2.0
1,1,2,2.0
2,1,3,2.0
3,1,4,2.5
4,1,5,2.0
5,1,6,2.0
6,1,7,4.0
7,1,8,2.0
8,1,9,2.0
9,1,10,2.5
