In [1]:
import numpy as np
from scipy.sparse import rand as sprand
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Column names for the ratings file (read_table splits on tabs by default).
columns1 = ['user id', 'movie id', 'rating', 'Timestamp']
df_ratings = pd.read_table('u.data', names=columns1)

# Column names for the pipe-separated movie file: five descriptive fields,
# then one column per genre.
columns2 = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
df_movie = pd.read_csv('u.item', sep='|', names=columns2, encoding='latin-1')

In [3]:
# 80th percentile of the rating timestamps; used below as the chronological
# train/validation cut-off.
time_80 = np.quantile(df_ratings["Timestamp"].to_numpy(), 0.8)
time_80

889237269.0

In [4]:
# Chronological split: ratings before the cut-off train the model, the rest validate it.
# .copy() makes both frames independent of df_ratings so they can be edited in place later.
train = df_ratings.loc[df_ratings["Timestamp"] < time_80].copy()
val = df_ratings.loc[df_ratings["Timestamp"] >= time_80].copy()

In [5]:
val.head(4)

Unnamed: 0,user id,movie id,rating,Timestamp
1,186,302,3,891717742
7,253,465,5,891628467
13,210,40,3,891035994
19,234,1184,2,892079237


In [6]:
# np.unique already returns its result sorted, so no extra np.sort is needed.
train_user_ids = np.unique(train['user id'].values)

In [7]:
# Number of distinct users in the training split; sizes the user embedding table.
num_users = len(train_user_ids)

In [8]:
num_users

751

In [9]:
# Map each raw user id seen in training to a contiguous index 0..len-1.
id_generator = dict(zip(train_user_ids, range(len(train_user_ids))))

In [10]:
# Replace raw user ids with their contiguous index. Every training id is a key of
# id_generator by construction, so a KeyError here means this cell was run twice.
train['user id'] = train['user id'].map(lambda raw_id: id_generator[raw_id])

In [11]:
train.head()

Unnamed: 0,user id,movie id,rating,Timestamp
0,158,242,3,881250949
2,18,377,1,878887116
3,195,51,2,880606923
4,137,346,1,886397596
5,237,474,4,884182806


In [12]:
# Re-index validation users with the training mapping; users never seen in
# training have no key and are marked with -1.
val['user id'] = val['user id'].map(id_generator).fillna(-1).astype('int64')

val.head()

Unnamed: 0,user id,movie id,rating,Timestamp
1,151,302,3,891717742
7,-1,465,5,891628467
13,170,40,3,891035994
19,-1,1184,2,892079237
21,-1,486,4,892738452


In [13]:
# Keep only validation rows whose user was seen in training (index >= 0).
val = val.loc[val['user id'].ge(0)].copy()

val.head()

Unnamed: 0,user id,movie id,rating,Timestamp
1,151,302,3,891717742
13,170,40,3,891035994
72,76,1049,1,890251826
120,78,789,4,891720887
150,238,955,4,889502823


In [14]:
# np.unique already returns its result sorted, so no extra np.sort is needed.
train_movie_ids = np.unique(train['movie id'].values)
# Number of distinct movies in the training split; sizes the item embedding table.
num_items = len(train_movie_ids)
print(num_items)
train_movie_ids[:15]

1616


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [15]:
# Map each raw movie id seen in training to a contiguous index 0..len-1.
mid_generator = dict(zip(train_movie_ids, range(len(train_movie_ids))))

# Training ids are all keys of the mapping; validation ids that are not get -1.
train['movie id'] = train['movie id'].map(lambda raw_id: mid_generator[raw_id])
val['movie id'] = val['movie id'].map(mid_generator).fillna(-1).astype('int64')

# Drop validation rows whose movie was never seen in training.
val = val.loc[val['movie id'].ge(0)].copy()

val.head()

Unnamed: 0,user id,movie id,rating,Timestamp
1,151,301,3,891717742
13,170,39,3,891035994
72,76,1035,1,890251826
120,78,785,4,891720887
150,238,941,4,889502823


In [16]:
val.shape

(2864, 4)

In [17]:
len(val['movie id'])

2864

Embedding Layer

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [19]:
# Toy lookup table: 10 rows, each a 3-dimensional learnable vector.
embed = nn.Embedding(num_embeddings=10, embedding_dim=3)
embed.weight

Parameter containing:
tensor([[-0.4414,  1.1368,  0.5728],
        [ 1.1979, -0.0068, -1.0915],
        [-0.6839,  0.4857,  0.7642],
        [ 0.3069,  0.7194,  0.2158],
        [ 0.6424,  0.3186, -0.4902],
        [-0.2067, -0.8838, -1.0706],
        [-0.0415,  0.6883, -2.4752],
        [ 0.7335, -0.0426,  1.1024],
        [ 0.2571, -0.4601, -0.9135],
        [ 0.4171, -0.7008, -0.6478]], requires_grad=True)

Matrix Factorization Model

In [20]:
class MF(nn.Module):
    """Matrix-factorization rating model.

    Holds one embedding vector per user and per item; the predicted rating
    for a (user, item) pair is the dot product of the two vectors.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: length of every embedding vector (default 100).
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        # Overwrite the default initialisation with small non-negative values.
        # nn.init runs under no_grad, so no `.weight.data` access is needed.
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)

    def forward(self, u, v):
        """Return the predicted rating for each (user, item) index pair.

        Args:
            u: integer tensor of user indices.
            v: integer tensor of item indices, paired element-wise with `u`.

        Returns:
            Tensor with one dot product per pair (embedding axis summed out).
        """
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)

        return (user_vecs * item_vecs).sum(1)


        

In [21]:
# Training user indices as an int64 tensor, ready for embedding lookup.
users = torch.tensor(train['user id'].to_numpy(), dtype=torch.long)

users

tensor([158,  18, 195,  ..., 218,   9,   8])

In [22]:
# Training movie indices as an int64 tensor, ready for embedding lookup.
items = torch.tensor(train['movie id'].to_numpy(), dtype=torch.long)

items

tensor([ 241,  376,   50,  ..., 1076,  224,  202])

In [23]:
# Stand-alone walk-through of what MF.forward does, using freshly
# initialised (untrained) embedding tables.
emb_size = 20

user_emb = nn.Embedding(num_embeddings=num_users, embedding_dim=emb_size)
item_emb = nn.Embedding(num_embeddings=num_items, embedding_dim=emb_size)

# One embedding row per training rating, for the user and the movie respectively.
U = user_emb(users)
V = item_emb(items)

In [24]:
U*V

tensor([[ 4.1519e+00, -1.7000e+00,  3.0410e+00,  ..., -3.7739e-01,
          5.8129e-01,  1.1080e+00],
        [-7.0940e-01,  5.6403e-01,  1.7120e-01,  ...,  5.3439e-02,
         -8.4519e-01,  8.8426e-01],
        [-5.9948e-02, -2.8777e-04,  4.4180e-01,  ...,  1.7104e+00,
         -2.6565e-01, -1.1994e+00],
        ...,
        [ 1.3888e-01,  1.0667e-01,  4.1721e-01,  ..., -2.0275e-01,
          2.1180e-01,  3.4194e-01],
        [-4.4374e-01, -1.1165e-01, -9.9126e-02,  ..., -1.5159e+00,
         -1.6603e-02, -6.8127e-02],
        [ 4.0100e-02,  8.7507e-01, -8.9533e-01,  ..., -1.5741e+00,
          2.3268e-01, -1.0910e+00]], grad_fn=<MulBackward0>)

In [25]:
(U*V).sum(1)

tensor([ 5.8527, -5.7014,  1.8134,  ...,  8.4892, -5.7663, -7.2321],
       grad_fn=<SumBackward1>)

In [26]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train `model` with full-batch Adam on the global `train` frame.

    Each epoch is a single gradient step on the MSE over all training
    ratings, followed by a validation pass through `valid_loss`. One line
    with both losses is printed per epoch; the training loss shown is the
    one measured before that epoch's optimizer step.

    Args:
        model: module called as `model(users, items)`, returning one
            prediction per pair.
        epochs: number of full-batch steps to run.
        lr: Adam learning rate.
        wd: Adam weight decay.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training data does not change between epochs, so build the
    # tensors once instead of converting the DataFrame on every iteration.
    users = torch.tensor(train['user id'].values, dtype=torch.long)
    items = torch.tensor(train['movie id'].values, dtype=torch.long)
    ratings = torch.tensor(train['rating'].values, dtype=torch.float32)

    for _ in range(epochs):
        # valid_loss() leaves the model in eval mode, so switch back each epoch.
        model.train()

        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss(model)

        print("Train loss %.3f Val loss %.3f" % (loss.item(), testloss))


In [27]:
def valid_loss(model):
    """Return the MSE of `model` on the global `val` frame as a Python float.

    Puts the model in eval mode and leaves it there; callers that keep
    training must call `model.train()` again.
    """
    model.eval()
    users = torch.tensor(val['user id'].values, dtype=torch.long)
    items = torch.tensor(val['movie id'].values, dtype=torch.long)
    ratings = torch.tensor(val['rating'].values, dtype=torch.float32)

    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    return loss.item()
    

In [28]:
# Baseline model: 100-dimensional embeddings.
model = MF(num_users=num_users, num_items=num_items, emb_size=100)


In [29]:
# 20 full-batch Adam steps with light weight decay.
train_epocs(model=model, epochs=20, lr=0.1, wd=1e-5)

Train loss 13.209 Val loss 5.296
Train loss 5.096 Val loss 2.720
Train loss 2.762 Val loss 3.225
Train loss 3.259 Val loss 1.333
Train loss 1.032 Val loss 2.501
Train loss 2.044 Val loss 3.306
Train loss 2.870 Val loss 2.732
Train loss 2.316 Val loss 1.674
Train loss 1.259 Val loss 1.531
Train loss 1.184 Val loss 2.044
Train loss 1.868 Val loss 1.633
Train loss 1.532 Val loss 1.110
Train loss 0.967 Val loss 1.361
Train loss 1.155 Val loss 1.791
Train loss 1.538 Val loss 1.786
Train loss 1.504 Val loss 1.408
Train loss 1.121 Val loss 1.146
Train loss 0.901 Val loss 1.275
Train loss 1.108 Val loss 1.370
Train loss 1.246 Val loss 1.182


In [30]:
# Embedding-size sweep: 20 dimensions, same training settings as the baseline.
model = MF(num_users=num_users, num_items=num_items, emb_size=20)
train_epocs(model=model, epochs=20, lr=0.1, wd=1e-5)

Train loss 13.558 Val loss 11.829
Train loss 11.550 Val loss 8.379
Train loss 8.141 Val loss 4.265
Train loss 4.101 Val loss 1.457
Train loss 1.383 Val loss 2.250
Train loss 2.251 Val loss 3.969
Train loss 4.058 Val loss 3.372
Train loss 3.424 Val loss 1.979
Train loss 1.886 Val loss 1.292
Train loss 1.035 Val loss 1.514
Train loss 1.148 Val loss 2.129
Train loss 1.726 Val loss 2.626
Train loss 2.233 Val loss 2.767
Train loss 2.403 Val loss 2.537
Train loss 2.204 Val loss 2.056
Train loss 1.747 Val loss 1.528
Train loss 1.236 Val loss 1.187
Train loss 0.909 Val loss 1.178
Train loss 0.925 Val loss 1.426
Train loss 1.214 Val loss 1.641


In [31]:
# Embedding-size sweep: 30 dimensions, same training settings as the baseline.
model = MF(num_users=num_users, num_items=num_items, emb_size=30)
train_epocs(model=model, epochs=20, lr=0.1, wd=1e-5)

Train loss 13.516 Val loss 10.825
Train loss 10.579 Val loss 6.142
Train loss 5.970 Val loss 1.884
Train loss 1.815 Val loss 2.281
Train loss 2.272 Val loss 4.250
Train loss 4.327 Val loss 3.005
Train loss 3.021 Val loss 1.520
Train loss 1.344 Val loss 1.393
Train loss 1.045 Val loss 2.126
Train loss 1.704 Val loss 2.807
Train loss 2.387 Val loss 2.995
Train loss 2.607 Val loss 2.663
Train loss 2.310 Val loss 2.017
Train loss 1.691 Val loss 1.397
Train loss 1.092 Val loss 1.161
Train loss 0.880 Val loss 1.402
Train loss 1.166 Val loss 1.749
Train loss 1.582 Val loss 1.740
Train loss 1.630 Val loss 1.390
Train loss 1.294 Val loss 1.081


In [32]:
# Embedding-size sweep: 40 dimensions, same training settings as the baseline.
model = MF(num_users=num_users, num_items=num_items, emb_size=40)
train_epocs(model=model, epochs=20, lr=0.1, wd=1e-5)

Train loss 13.468 Val loss 9.864
Train loss 9.634 Val loss 4.296
Train loss 4.165 Val loss 1.317
Train loss 1.285 Val loss 4.383
Train loss 4.500 Val loss 3.512
Train loss 3.584 Val loss 1.595
Train loss 1.430 Val loss 1.448
Train loss 1.076 Val loss 2.373
Train loss 1.929 Val loss 3.105
Train loss 2.680 Val loss 3.160
Train loss 2.778 Val loss 2.604
Train loss 2.260 Val loss 1.790
Train loss 1.469 Val loss 1.232
Train loss 0.925 Val loss 1.315
Train loss 1.038 Val loss 1.791
Train loss 1.585 Val loss 1.921
Train loss 1.802 Val loss 1.520
Train loss 1.437 Val loss 1.104
Train loss 0.985 Val loss 1.073
Train loss 0.882 Val loss 1.344


In [33]:
# Embedding-size sweep: 50 dimensions, same training settings as the baseline.
model = MF(num_users=num_users, num_items=num_items, emb_size=50)
train_epocs(model=model, epochs=20, lr=0.1, wd=1e-5)

Train loss 13.424 Val loss 8.993
Train loss 8.753 Val loss 2.920
Train loss 2.797 Val loss 2.422
Train loss 2.406 Val loss 4.281
Train loss 4.359 Val loss 2.156
Train loss 2.078 Val loss 1.324
Train loss 1.004 Val loss 2.196
Train loss 1.764 Val loss 3.072
Train loss 2.643 Val loss 3.161
Train loss 2.771 Val loss 2.525
Train loss 2.169 Val loss 1.643
Train loss 1.311 Val loss 1.192
Train loss 0.889 Val loss 1.508
Train loss 1.272 Val loss 1.951
Train loss 1.811 Val loss 1.747
Train loss 1.659 Val loss 1.228
Train loss 1.113 Val loss 1.067
Train loss 0.881 Val loss 1.330
Train loss 1.077 Val loss 1.646
Train loss 1.356 Val loss 1.721


Hybrid Model

In [35]:
df_movie

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Load a precomputed score table from disk.
# NOTE(review): presumably rows are users and columns are movie ids holding
# collaborative-filtering predictions — confirm against whatever wrote this CSV.
df_collab = pd.read_csv('Coll_Filter_Recomend.csv')

In [39]:
df_collab

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,3.835005,3.104121,3.081005,3.808282,3.011640,3.945766,4.105261,4.169735,4.458052,4.250324,...,2.133717,3.530673,2.841187,1.869848,2.770235,1.637758,2.313801,2.012121,2.380869,2.589157
1,4.002513,3.352604,3.187884,3.719986,3.344365,4.003967,3.984703,4.191816,4.132493,3.932797,...,2.452807,3.523968,2.817648,1.869692,2.889614,1.434331,2.569935,1.916138,2.651328,2.646072
2,3.112619,2.482355,2.509738,3.174505,2.378318,3.239981,3.430389,3.436827,3.789069,3.614856,...,1.657656,2.918429,2.354460,1.544489,2.248948,1.419506,1.827063,1.693454,1.877757,2.119885
3,4.794578,3.983519,3.826722,4.529528,3.948206,4.829217,4.859668,5.067469,5.100228,4.856084,...,2.873094,4.267711,3.417788,2.263272,3.461912,1.797419,3.033812,2.348318,3.128028,3.185899
4,3.348346,2.618834,2.712444,3.531109,2.466635,3.537437,3.827012,3.770137,4.313430,4.118218,...,1.680120,3.212878,2.600137,1.698796,2.420007,1.652475,1.894673,1.905603,1.944005,2.306062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,5.142770,4.517693,4.044554,4.306155,4.671302,4.932340,4.562157,5.088329,4.342106,4.117333,...,3.571711,4.228599,3.345762,2.250145,3.709806,1.331546,3.590485,2.120037,3.716264,3.295418
939,3.781894,3.312687,2.976627,3.188174,3.418191,3.636790,3.380254,3.755379,3.237051,3.070313,...,2.607485,3.123223,2.472880,1.661626,2.728257,1.002421,2.627274,1.574567,2.718806,2.428132
940,4.315258,3.627606,3.433778,3.981246,3.628919,4.303643,4.261424,4.500868,4.395306,4.181982,...,2.670550,3.780736,3.020760,2.006334,3.115213,1.514649,2.788651,2.044624,2.877721,2.846341
941,4.454651,3.867603,3.514568,3.832843,3.964996,4.318488,4.072864,4.472149,3.971296,3.769689,...,3.002551,3.727805,2.957729,1.982125,3.214077,1.264458,3.047423,1.910647,3.151768,2.877173


In [46]:
# Column labels of the score table; iterated below to build one label per column.
cols = df_collab.columns

In [59]:
# One binary label per df_collab column: 1 when the column's mean score,
# rounded to two decimals, is at least 2.50, otherwise 0.
target = [
    1 if round(df_collab[col].mean(), 2) >= 2.50 else 0
    for col in cols
]
        

In [52]:
# Keep only the genre indicator columns (everything after the five descriptive
# fields). .copy() gives an independent frame, so adding columns to it later
# neither raises SettingWithCopyWarning nor risks touching df_movie.
df_hybrid = df_movie.iloc[:, 5:].copy()

In [54]:
# Load a precomputed array from disk; it is attached below as the 'Sentiment' column.
# NOTE(review): presumably one sentiment score per movie, in df_movie row order — confirm.
sent = np.load('movie scores.npy')

In [55]:
# Attach the loaded scores as a feature column; `sent` must have exactly one entry per row.
df_hybrid['Sentiment'] = sent

In [63]:
# Attach the binary labels as the last column; `target` must have exactly one entry per row.
df_hybrid['Target'] = target

In [64]:
df_hybrid

Unnamed: 0,unknown,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Sentiment,Target
0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.225,1
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.127,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.000,1
3,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0.360,1
4,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,1,0,0,0.089,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0.000,0
1678,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,-0.003,1
1679,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0.184,0
1680,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.000,1


In [65]:
# Features are every column except the label; 'Target' is the last column of df_hybrid.
X = df_hybrid.drop(columns='Target')
y = df_hybrid['Target']


In [68]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [69]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [104]:
# Linear-kernel SVM on the genre + sentiment features; fit() returns the
# estimator itself, so construction and training can be chained.
svm = SVC(kernel='linear').fit(X_train, y_train)

y_pred = svm.predict(X_test)


In [105]:
from sklearn.metrics import accuracy_score

In [106]:
accuracy_score(y_test,y_pred)


0.8130563798219584