In [1]:
import gzip
from collections import defaultdict
from sklearn import linear_model
import csv
import random
import matplotlib
import matplotlib.pyplot as plt
from functools import lru_cache
import numpy as np
import pandas as pd
import sys
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from tqdm import tqdm
import scipy

In [2]:
import matplotlib.pyplot as plt

SMALL_SIZE = 20
MEDIUM_SIZE = 20
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [4]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d

## Data Filtering

In [5]:
data_=[]
for d in readCSV("subscribers_count.csv.gz"):
    data_.append(d)

In [6]:
sub_count={}
for d in data_:
    sub_count[d['subreddit']]=int(d['subscribers'])
    
avSub=sum(sub_count.values())/len(sub_count)

In [7]:
avSub

63740.865577889446

In [8]:
data=[]
for d in readCSV("redditSubmissions.csv.gz"):
    data.append(d)

In [9]:
df=pd.DataFrame(data)

In [10]:
#Original Data
df

Unnamed: 0,#image_id,unixtime,rawtime,title,total_votes,reddit_id,number_of_upvotes,subreddit,number_of_downvotes,localtime,score,number_of_comments,username
0,0,1333172439,2012-03-31T12:40:39.590113-07:00,And here's a downvote.,63470,rmqjs,32657,funny,30813,1333197639,1844,622,Animates_Everything
1,0,1333178161,2012-03-31T14:16:01.093638-07:00,Expectation,35,rmun4,29,GifSound,6,1333203361,23,3,Gangsta_Raper
2,0,1333199913,2012-03-31T20:18:33.192906-07:00,Downvote,41,rna86,32,GifSound,9,1333225113,23,0,Gangsta_Raper
3,0,1333252330,2012-04-01T10:52:10-07:00,Every time I downvote something,10,ro7e4,6,GifSound,4,1333277530,2,0,Gangsta_Raper
4,0,1333272954,2012-04-01T16:35:54.393381-07:00,Downvote &quot;Dies Irae&quot;,65,rooof,57,GifSound,8,1333298154,49,0,Gangsta_Raper
...,...,...,...,...,...,...,...,...,...,...,...,...,...
132303,9998,1344759846,2012-08-12T15:24:06-07:00,OM NOM NOM,34,y41wv,25,funny,9,1344785046,16,0,vaggietales
132304,9998,1345270178,2012-08-18T13:09:38-07:00,Don't feed the animals...,19,yfw66,14,funny,5,1345295378,9,2,Deydria
132305,9998,1345953962,2012-08-26T04:06:02+00:00,WTF worthy.,49,yu838,26,WTF,23,1345953962,3,6,beatlesrock
132306,9998,1346625906,2012-09-02T22:45:06+00:00,"Just a camel eating a kids head, welcome to th...",123,z91ah,65,WTF,58,1346625906,7,12,v7o


In [11]:
data[0]

{'#image_id': '0',
 'unixtime': '1333172439',
 'rawtime': '2012-03-31T12:40:39.590113-07:00',
 'title': "And here's a downvote.",
 'total_votes': '63470',
 'reddit_id': 'rmqjs',
 'number_of_upvotes': '32657',
 'subreddit': 'funny',
 'number_of_downvotes': '30813',
 'localtime': '1333197639',
 'score': '1844',
 'number_of_comments': '622',
 'username': 'Animates_Everything'}

In [12]:
# Making Data Batter

# Removing some random useless entry
del data[53033]

# Value Typecasting
max_title_len=0
max_subscribers=0
max_unix_time=0
min_unix_time=float('inf')

for d in data:
    d['unixtime']=int(d['unixtime'])
    d['total_votes']=int(d['total_votes'])
    d['number_of_upvotes']=int(d['number_of_upvotes'])
    d['number_of_downvotes']=int(d['number_of_downvotes'])
    d['localtime']=int(d['localtime'])
    d['score']=int(d['score'])
    d['number_of_comments']=max(int(d['number_of_comments']),0)
    
    max_unix_time=max(max_unix_time, d['unixtime'])
    min_unix_time = min(min_unix_time, d['unixtime'])
    
# Adding some nice features
for d in data:
    if int(d['total_votes'])==0:
        d['score_norm']=0
    else:
        d['score_norm']=float(d['number_of_upvotes']/d['total_votes'])
        
    d['title_len']=len(d['title'])
    max_title_len=max(max_title_len,d['title_len'])

for d in data:
    dt_object=datetime.fromtimestamp(d['unixtime'])
    d['weekday']=int(dt_object.weekday())
    d['timeOfDay']=int(dt_object.time().hour)
    
    sub=d['subreddit']
    subscribe=avSub
    if sub in sub_count:
        subscribe = sub_count[sub]
    
    d['subscribers']=subscribe
    max_subscribers=max(max_subscribers,d['subscribers'])
    
    
def filter(row):
    return row['total_votes'] >= 10

data = [d for d in data if filter(d)]

In [13]:
df_=pd.DataFrame(data)

In [14]:
df_.describe()

Unnamed: 0,unixtime,total_votes,number_of_upvotes,number_of_downvotes,localtime,score,number_of_comments,score_norm,title_len,weekday,timeOfDay,subscribers
count,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0,115709.0
mean,1338951000.0,2152.534211,1209.516943,943.017267,1338970000.0,266.499676,44.593835,0.636773,35.373273,2.955933,11.70896,2017486.0
std,13331180.0,6339.113842,3374.706731,2971.773873,13325410.0,505.773311,151.833657,0.150304,24.465102,2.019494,7.201332,950610.6
min,1217219000.0,10.0,0.0,0.0,1217244000.0,-264.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,1331826000.0,22.0,13.0,8.0,1331852000.0,5.0,0.0,0.548334,18.0,1.0,5.0,1520137.0
50%,1343269000.0,66.0,45.0,19.0,1343294000.0,25.0,4.0,0.639021,30.0,3.0,12.0,2601566.0
75%,1348555000.0,582.0,426.0,155.0,1348562000.0,253.0,20.0,0.744538,46.0,5.0,18.0,2665613.0
max,1359095000.0,177103.0,90396.0,86707.0,1359095000.0,20570.0,8357.0,1.0,313.0,6.0,23.0,2665613.0


In [15]:
data[0]

{'#image_id': '0',
 'unixtime': 1333172439,
 'rawtime': '2012-03-31T12:40:39.590113-07:00',
 'title': "And here's a downvote.",
 'total_votes': 63470,
 'reddit_id': 'rmqjs',
 'number_of_upvotes': 32657,
 'subreddit': 'funny',
 'number_of_downvotes': 30813,
 'localtime': 1333197639,
 'score': 1844,
 'number_of_comments': 622,
 'username': 'Animates_Everything',
 'score_norm': 0.5145265479754214,
 'title_len': 22,
 'weekday': 4,
 'timeOfDay': 22,
 'subscribers': 2665613}

In [16]:
# Creating 1-Hot Encoder object and removing redundant first dimension

# Hot-encoding the day of week

wd=set()
td=set()

for d in data:
    wd.add(d['weekday'])
    td.add(d['timeOfDay'])

y=[[t] for t in wd]
z=[[t] for t in td]
    
enc1 = OneHotEncoder(drop='first')
print("One Hot Encoding for day of week (after removing first dimension)")
print(enc1.fit(y).transform(y).toarray())

# Hot-encoding the time of day
enc2 = OneHotEncoder(drop='first')
print("\nOne Hot Encoding for time of day (after removing first dimension)")
print(enc2.fit(z).transform(z).toarray())

One Hot Encoding for day of week (after removing first dimension)
[[0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]

One Hot Encoding for time of day (after removing first dimension)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

## Model Training

In [17]:
def MSE(y,ypred):
    y=np.array(y)
    ypred=np.array(ypred)
    return round(np.sum(np.square(y-ypred))/len(y-ypred),5)

def MAE(y, ypred):
    y=np.array(y)
    ypred=np.array(ypred)
    return round(np.sum(np.abs(y-ypred))/len(y-ypred),5)    

In [18]:
L=len(data)
data.sort(key=lambda x: x['unixtime'])

In [19]:
train=data[:int(0.8*L)]
val=data[int(0.8*L):int(0.9*L)]
test=data[int(0.9*L):]

In [20]:
count_s_all=defaultdict()
count_u_all=defaultdict()
count_i_all=defaultdict()
max_u=0
max_i=0
max_s=0

user2idx, subreddit2idx, img2idx = {}, {}, {}

for d in data:
    sub=d['subreddit']
    user=d['username']
    image=d['#image_id']    
    
    if sub not in count_s_all:
        count_s_all[sub]=0
    
    if user not in count_u_all:
        count_u_all[user]=0
        
    if image not in count_i_all:
        count_i_all[image]=0
    
    count_s_all[sub]+=1
    count_i_all[image]+=1
    count_u_all[user]+=1

    max_u=max(max_u,count_u_all[user])
    max_i=max(max_i,count_i_all[image])
    max_s=max(max_s,count_s_all[sub])
    
    if sub not in subreddit2idx: subreddit2idx[sub] = len(subreddit2idx)
    if user not in user2idx: user2idx[user] = len(user2idx)
    if image not in img2idx: img2idx[image] = len(img2idx)

In [21]:
len(user2idx), len(img2idx), len(subreddit2idx)

(58473, 16667, 833)

In [22]:
avScore=0
for d  in train:
    avScore+=d['score_norm']
    
avScore/=len(train)    

scorePerImageTime=defaultdict(list)
scorePerUserTime=defaultdict(list)

prev_im=defaultdict(list)
prev_us=defaultdict(list)

for d in train:
    image=d['#image_id']
    user=d['username']
    time=d['unixtime']
    
    if len(prev_im[image])>0:
        scorePerImageTime[(image,time)]=np.mean(prev_im[image])
        
    prev_im[image].append(d['score_norm'])
    
    if len(prev_us[user])>0:
        scorePerUserTime[(user,time)]=np.mean(prev_us[user])
        
    prev_us[user].append(d['score_norm'])   

In [23]:
avScore

0.6468872295806947

## Features

In [24]:
def Jaccard(i1, i2):
    u1=usersPerImage[i1]
    u2=usersPerImage[i2]
    
    inter=len(u1.intersection(u2))
    den=len(u1.union(u2))
    
    if den==0:
        return 0
    
    return inter/den

def SimilarityPred(user_, image):
    users=usersPerImage[image]
    
    candidateImages=set()
    for user in users:
        for im in imagesPerUser[user]:
            candidateImages.add(im)
    
    num=0
    den=0
    for im in candidateImages:
        if im==image:
            continue
        
        sim=Jaccard(image,im)
        num+=(sim*sum(ScoresPerImage[im]) / len(ScoresPerImage[im]))
        den+=sim
        
    if den==0:
        if image in ScoresPerImage:
            return sum(ScoresPerImage[image]) / len(ScoresPerImage[image])
        else:
            return avScore
    else:
        return num/den  

In [25]:
def feata(d):
    return np.array(enc1.transform([[d['weekday']]]).toarray()).squeeze()
    
def featb(d):
    return np.array(enc2.transform([[d['timeOfDay']]]).toarray()).squeeze()     

def featc(d):
    return np.array([d['title_len']/max_title_len])

def featd(d):
    return np.array([d['subscribers']/max_subscribers])
 
def feate(d, train=False):
    user=d['username']
    time=d['unixtime']
    pred=avScore
    
    if train:
        if (user,time) in scorePerUserTime:
            pred=scorePerUserTime[(user,time)]
    else:
        L=prev_us[user]
        if len(L)>0:
            pred=sum(L) / len(L)
    
    return np.array([pred])

def featf(d, train=False):
    image=d['#image_id']
    time=d['unixtime']
    pred=avScore
    
    if train:
        if (image,time) in scorePerImageTime:
            pred=scorePerImageTime[(image,time)]
    else:
        L=prev_im[image]
        if len(L)>0:
            pred=sum(L) / len(L)
    
    return np.array([pred]) 

def featg(d):
    image=d['#image_id']
    user=d['username']
    pred=SimilarityPred(user,image)
    return [pred]

def feath(d):
    return np.array([(d['unixtime'] - min_unix_time)/(max_unix_time - min_unix_time)])

def feati(d):
    sub=d['subreddit']
    user=d['username']
    image=d['#image_id']
    
    a=0
    if sub in count_s:
        a=count_s[sub]
    
    b=0
    if user in count_u:
        b=count_u[user]
        
    c=0
    if image in count_i:
        c=count_i[image]
    
    return np.array(([a/max_s, b/max_u, c/max_i])) 

In [78]:
def feature(di, a=True, b=True, c=True, d=True, e=True, f=True, g=True, h=True, i=True, train=False):
    feat=[1]
    if a: # weekday: [1, 6]
        feat1=feata(di)
        for x in feat1:
            feat.append(x)
      
    if b: # timeOfDay: [7, 29]
        feat2=featb(di)
        for x in feat2:
            feat.append(x)
    
    if c: # title_len: 30
        feat3=featc(di)
        for x in feat3:
            feat.append(x)
    
    if d: # subscribers: 31
        feat4=featd(di)
        for x in feat4:
            feat.append(x)
    
    if e: # Mean  normalized  score  ofpast posts with the same image: 32
        feat5=feate(di, train)
        for x in feat5:
            feat.append(x)
    
    if f: # Mean  normalized  score  ofpast posts with the same user: 33
        feat6=featf(di, train)
        for x in feat6:
            feat.append(x)
    
    if g: # Average  normalized  scoresof post weighted by Jaccard similarities: 34
        feat7=featg(di)
        for x in feat7:
            feat.append(x)        
    
    if h: # Unix time: 35
        feat8=feath(di)
        for x in feat8:
            feat.append(x)  
    
    if i: # Popularity of subreddit, image, user: [36, 37, 38]
        feat9=feati(di)
        for x in feat9:
            feat.append(x)  
            
    return feat

In [27]:
count_s=defaultdict()
count_u=defaultdict()
count_i=defaultdict()

usersPerImage=defaultdict(set)
imagesPerUser=defaultdict(set)
ScoresPerImage=defaultdict(list)

X_train=[]
Y_train=[]
    
for i in tqdm(range(len(train))):
    d=train[i]
    
    sub=d['subreddit']
    user=d['username']
    image=d['#image_id']
    score=d['score_norm']
    
    if sub not in count_s:
        count_s[sub]=0
    
    if user not in count_u:
        count_u[user]=0
        
    if image not in count_i:
        count_i[image]=0
    
    count_s[sub]+=1
    count_i[image]+=1
    count_u[user]+=1

    usersPerImage[image].add(user)
    imagesPerUser[user].add(image)
    if image not in ScoresPerImage:
        ScoresPerImage[image]=[]
        
    ScoresPerImage[image].append(score)
    
    X_train.append(feature(d,train=True))
    Y_train.append(score)

X_val=[]
Y_val=[]

for i in tqdm(range(len(val))):
    d=val[i]
    X_val.append(feature(d))
    score=d['score_norm']
    Y_val.append(score)

X_test=[]
Y_test=[]

for i in tqdm(range(len(test))):
    d=test[i]
    X_test.append(feature(d))
    score=d['score_norm']
    Y_test.append(score)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 92567/92567 [13:24<00:00, 115.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11571/11571 [02:12<00:00, 87.13it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11571/11571 [02:07<00:00, 90.52it/s]


In [31]:
X_train_arr = np.array(X_train)
X_val_arr = np.array(X_val)
X_test_arr = np.array(X_test)

In [45]:
X_test_arr.shape

(11571, 39)

In [46]:
def get_sparse_features(X_features, dataset): 
    X_sparse = scipy.sparse.lil_matrix((X_features.shape[0], 
                                              X_features.shape[1] + len(user2idx) + len(img2idx) + len(subreddit2idx)))
    # Copy original features into sparse matrix format
    for r in range(X_features.shape[0]):
        for c in range(X_features.shape[1]):
            X_sparse[r, c] = X_features[r, c]
            
    # Mark features corresponding to user, subreddit and image
    for i in range(X_features.shape[0]):
        user_idx = user2idx[dataset[i]['username']]
        img_idx = img2idx[dataset[i]['#image_id']]
        sub_idx = subreddit2idx[dataset[i]['subreddit']]
        X_sparse[i, X_features.shape[1] + user_idx] = 1
        X_sparse[i, X_features.shape[1] + len(user2idx) + img_idx] = 1
        X_sparse[i, X_features.shape[1] + len(user2idx) + len(img2idx) + sub_idx] = 1
    return X_sparse

In [47]:
X_train_sparse = get_sparse_features(X_train_arr, train)
X_val_sparse = get_sparse_features(X_val_arr, val)
X_test_sparse = get_sparse_features(X_test_arr, test)

In [48]:
y_train = np.array(Y_train)
y_val = np.array(Y_val)
y_test = np.array(Y_test)

In [49]:
X_train_sparse

<92567x76012 sparse matrix of type '<class 'numpy.float64'>'
	with 1370812 stored elements in List of Lists format>

In [50]:
def unison_shuffled_copies(a, b):
    assert a.shape[0] == len(b)
    p = np.random.permutation(len(b))
    return a[p], b[p]

In [45]:
import optuna
from fastFM import als

# 1. Define an objective function to be maximized.
def objective(trial):

    # 2. Suggest values for the hyperparameters using a trial object.
    stdev = trial.suggest_uniform('stdev', 0.001, 0.1)
    rank = trial.suggest_int('rank', 1, 100, log=False)
    reg_w = trial.suggest_loguniform('reg_w', 1.0, 1000.0)
    reg_V = trial.suggest_loguniform('reg_V', 10.0, 10000.0)
    
    fm = als.FMRegression(n_iter=100, init_stdev=stdev, rank=rank, l2_reg_w=reg_w, l2_reg_V=reg_V)

    X_train_shuffled, y_train_shuffled = unison_shuffled_copies(X_train_sparse, y_train)
    fm.fit(X_train_shuffled, y_train_shuffled)

    y_pred=fm.predict(X_val_sparse)

    return MSE(y_val, y_pred)

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=300)
study.best_params

[32m[I 2021-11-30 07:40:27,895][0m A new study created in memory with name: no-name-5cf4fbca-d9ae-4b35-92d1-88b919be9e6b[0m
[32m[I 2021-11-30 07:44:26,733][0m Trial 0 finished with value: 0.03325 and parameters: {'stdev': 0.005857307009050074, 'rank': 72, 'reg_w': 0.02437792841192978, 'reg_V': 68.99591031347106}. Best is trial 0 with value: 0.03325.[0m
[32m[I 2021-11-30 07:48:03,752][0m Trial 1 finished with value: 0.0273 and parameters: {'stdev': 0.136564559580317, 'rank': 63, 'reg_w': 375.0412546720544, 'reg_V': 2782.778468284878}. Best is trial 1 with value: 0.0273.[0m
[32m[I 2021-11-30 07:48:13,517][0m Trial 2 finished with value: 0.02967 and parameters: {'stdev': 0.24619588867311015, 'rank': 1, 'reg_w': 0.23831735637657056, 'reg_V': 4763.005545110056}. Best is trial 1 with value: 0.0273.[0m
[32m[I 2021-11-30 07:48:34,750][0m Trial 3 finished with value: 0.02651 and parameters: {'stdev': 0.0043334345329522905, 'rank': 3, 'reg_w': 38.34942019823307, 'reg_V': 5217.46267

[32m[I 2021-11-30 08:03:29,474][0m Trial 35 finished with value: 0.0267 and parameters: {'stdev': 0.02671686211695035, 'rank': 1, 'reg_w': 460.9707308945031, 'reg_V': 5024.115806356182}. Best is trial 3 with value: 0.02651.[0m
[32m[I 2021-11-30 08:12:49,580][0m Trial 36 finished with value: 0.02656 and parameters: {'stdev': 0.006136084116866815, 'rank': 98, 'reg_w': 11.932904798902827, 'reg_V': 5570.643721092148}. Best is trial 3 with value: 0.02651.[0m
[32m[I 2021-11-30 08:13:03,757][0m Trial 37 finished with value: 0.02652 and parameters: {'stdev': 0.003666040524972794, 'rank': 3, 'reg_w': 56.933671400411164, 'reg_V': 2979.5247482666323}. Best is trial 3 with value: 0.02651.[0m
[32m[I 2021-11-30 08:13:58,962][0m Trial 38 finished with value: 0.02784 and parameters: {'stdev': 0.0017906649209374515, 'rank': 9, 'reg_w': 1.4923051629360267, 'reg_V': 3127.9501615277745}. Best is trial 3 with value: 0.02651.[0m
[32m[I 2021-11-30 08:14:10,039][0m Trial 39 finished with value: 

[32m[I 2021-11-30 09:17:48,357][0m Trial 71 finished with value: 0.02651 and parameters: {'stdev': 0.0036740121680558634, 'rank': 6, 'reg_w': 37.08584211284407, 'reg_V': 1764.947689265751}. Best is trial 61 with value: 0.02649.[0m
[32m[I 2021-11-30 09:18:11,668][0m Trial 72 finished with value: 0.02652 and parameters: {'stdev': 0.003921088564450098, 'rank': 6, 'reg_w': 61.9454703830185, 'reg_V': 1203.6349127611431}. Best is trial 61 with value: 0.02649.[0m
[32m[I 2021-11-30 09:18:58,069][0m Trial 73 finished with value: 0.02652 and parameters: {'stdev': 0.024811615769254034, 'rank': 13, 'reg_w': 44.30247930440523, 'reg_V': 3414.92347194677}. Best is trial 61 with value: 0.02649.[0m
[32m[I 2021-11-30 09:21:10,428][0m Trial 74 finished with value: 0.02655 and parameters: {'stdev': 0.02277162763874815, 'rank': 40, 'reg_w': 29.55629477885245, 'reg_V': 338.76774690753365}. Best is trial 61 with value: 0.02649.[0m
[32m[I 2021-11-30 09:21:46,838][0m Trial 75 finished with value:

[32m[I 2021-11-30 10:45:16,990][0m Trial 106 finished with value: 0.02651 and parameters: {'stdev': 0.002139974881182097, 'rank': 36, 'reg_w': 22.44030915892349, 'reg_V': 7867.350326824247}. Best is trial 91 with value: 0.02648.[0m
[32m[I 2021-11-30 10:51:09,768][0m Trial 107 finished with value: 0.02651 and parameters: {'stdev': 0.01096393397958127, 'rank': 63, 'reg_w': 22.3273465087392, 'reg_V': 6220.560817377708}. Best is trial 91 with value: 0.02648.[0m
[32m[I 2021-11-30 10:57:48,180][0m Trial 108 finished with value: 0.02654 and parameters: {'stdev': 0.011769973923542412, 'rank': 74, 'reg_w': 14.123269007595162, 'reg_V': 9977.213852641202}. Best is trial 91 with value: 0.02648.[0m
[32m[I 2021-11-30 11:03:05,955][0m Trial 109 finished with value: 0.02651 and parameters: {'stdev': 0.0012779959301408522, 'rank': 59, 'reg_w': 39.87944974829997, 'reg_V': 9348.78140693334}. Best is trial 91 with value: 0.02648.[0m
[32m[I 2021-11-30 11:07:21,979][0m Trial 110 finished with 

KeyboardInterrupt: 

In [1]:
# fm = als.FMRegression(n_iter=10, init_stdev=0.01, rank=1, l2_reg_w=10, l2_reg_V=20)

# X_train_shuffled, y_train_shuffled = unison_shuffled_copies(X_train_sparse, y_train)
# fm.fit(X_train_shuffled, y_train_shuffled)

# y_pred=fm.predict(X_val_sparse)
# print("stdev: {}, rank: {}, reg_w: {}, reg_V: {} --> Validation MSE: {}".format(
#     stdev, rank, reg_w, reg_V, MSE(y_val, y_pred)))

NameError: name 'als' is not defined

In [51]:
from fastFM import als

# fm = als.FMRegression(n_iter=100, 
#                       init_stdev=0.004982963842463915, 
#                       rank=1, 
#                       l2_reg_w=15.623715459053118, 
#                       l2_reg_V=835.2527560035309)

fm = als.FMRegression(n_iter=100, 
                      init_stdev=0.0027417380920231227, 
                      rank=98, 
                      l2_reg_w=18.42916958829367, 
                      l2_reg_V=479.9985482525126)

X_train_shuffled, y_train_shuffled = unison_shuffled_copies(X_train_sparse, y_train)
fm.fit(X_train_shuffled, y_train_shuffled)

y_pred=fm.predict(X_train_sparse)
print("Training MSE: ", MSE(y_train, y_pred))
print("Training MAE: ", MAE(y_train, y_pred))

y_pred=fm.predict(X_val_sparse)
print("Validation MSE: ", MSE(y_val, y_pred))
print("Validation MAE: ", MAE(y_val, y_pred))

y_pred=fm.predict(X_test_sparse)
print("Testing MSE: ", MSE(y_test, y_pred))
print("Testing MAE: ", MAE(y_test, y_pred))

Training MSE:  0.01244
Training MAE:  0.08704
Validation MSE:  0.02648
Validation MAE:  0.12935
Testing MSE:  0.02476
Testing MAE:  0.1248


In [82]:
print(X_train_arr.shape)
print(fm.w_[:39])
important_features = sorted([(i, fm.w_[i]) for i in range(39)], key=lambda x: abs(x[1]), reverse=True)
important_features[-5:]

(92567, 39)
[-4.99646332e-03  1.60387577e-03  2.92944546e-03  3.56491508e-03
  1.39041058e-03 -3.09090004e-03 -2.15156217e-03  3.60898662e-05
  6.08097302e-04 -1.88390920e-03  2.32697727e-04  1.50691677e-04
  9.74411785e-04  6.71992132e-03  7.96970975e-04  9.60383077e-04
  8.11142138e-04 -5.77747866e-03 -1.13083898e-02 -1.87736715e-02
 -2.25401134e-02 -2.36564942e-02 -2.83617561e-02 -2.34454990e-02
 -1.96479427e-02 -1.78771027e-02 -1.63141553e-02 -1.37313444e-02
 -1.22131748e-02 -8.12350617e-03  5.85401368e-02 -9.96811396e-02
  1.05978491e-01  1.64713000e-01  1.31239616e-01 -1.63452077e-01
  1.22422193e-02 -1.56820193e-01 -1.37040113e-01]


[(14, 0.0007969709751047217),
 (8, 0.0006080973023161509),
 (10, 0.00023269772748550505),
 (11, 0.00015069167704144995),
 (7, 3.60898662457705e-05)]

In [91]:
print(fm.V_.shape)
num_features = fm.V_.shape[1]
dot_products = []
seen = set()
for i in tqdm(range(39)):
    for j in range(39):
        if j == i or (i, j) in seen or (j, i) in seen:
            continue
        feat1, feat2 = fm.V_[:,i], fm.V_[:,j]
        dot_products.append((i, j, np.dot(feat1, feat2)))
        seen.add((i,j))

(98, 76012)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:00<00:00, 5850.01it/s]


In [94]:
important_interactions = sorted(dot_products, key=lambda x: abs(x[2]), reverse=True)
important_interactions[:5]

[(34, 35, 0.005046995991926385),
 (33, 34, 0.003601347121056183),
 (33, 35, 0.0034915125738092592),
 (31, 34, 0.002821858430155046),
 (31, 35, 0.002735796872451023)]

## Just user, image, subreddit

In [38]:
def get_sparse_representation(dataset): 
    X_sparse = scipy.sparse.lil_matrix((len(dataset), 
                                              len(user2idx) + len(img2idx) + len(subreddit2idx)))
            
    # Mark features corresponding to user, subreddit and image
    for i in range(len(dataset)):
        user_idx = user2idx[dataset[i]['username']]
        img_idx = img2idx[dataset[i]['#image_id']]
        sub_idx = subreddit2idx[dataset[i]['subreddit']]
        X_sparse[i, user_idx] = 1
        X_sparse[i, len(user2idx) + img_idx] = 1
        X_sparse[i, len(user2idx) + len(img2idx) + sub_idx] = 1
    return X_sparse

In [40]:
X_train_sparse = get_sparse_representation(train)
X_val_sparse = get_sparse_representation(val)
X_test_sparse = get_sparse_representation(test)

X_train_sparse

<92567x75973 sparse matrix of type '<class 'numpy.float64'>'
	with 277701 stored elements in List of Lists format>

In [42]:
import optuna
from fastFM import als

# 1. Define an objective function to be maximized.
def objective(trial):

    # 2. Suggest values for the hyperparameters using a trial object.
    stdev = trial.suggest_uniform('stdev', 0.001, 0.1)
    rank = trial.suggest_int('rank', 1, 100, log=False)
    reg_w = trial.suggest_loguniform('reg_w', 1.0, 1000.0)
    reg_V = trial.suggest_loguniform('reg_V', 10.0, 10000.0)
    
    fm = als.FMRegression(n_iter=100, init_stdev=stdev, rank=rank, l2_reg_w=reg_w, l2_reg_V=reg_V)

    X_train_shuffled, y_train_shuffled = unison_shuffled_copies(X_train_sparse, y_train)
    fm.fit(X_train_shuffled, y_train_shuffled)

    y_pred=fm.predict(X_val_sparse)

    return MSE(y_val, y_pred)

# 3. Create a study object and optimize the objective function.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=300)
study.best_params

[32m[I 2021-11-30 15:23:14,941][0m A new study created in memory with name: no-name-025b7ab6-51d3-4a29-9b04-e0875b2ce4e4[0m
[32m[I 2021-11-30 15:23:22,836][0m Trial 0 finished with value: 0.02906 and parameters: {'stdev': 0.03770255109001753, 'rank': 6, 'reg_w': 900.6132688939524, 'reg_V': 349.695066134584}. Best is trial 0 with value: 0.02906.[0m
[32m[I 2021-11-30 15:25:07,484][0m Trial 1 finished with value: 0.02791 and parameters: {'stdev': 0.004288814125290901, 'rank': 91, 'reg_w': 25.579996068732996, 'reg_V': 64.21689402782452}. Best is trial 1 with value: 0.02791.[0m
[32m[I 2021-11-30 15:27:07,824][0m Trial 2 finished with value: 0.02868 and parameters: {'stdev': 0.01722756505274379, 'rank': 69, 'reg_w': 336.62351628793436, 'reg_V': 706.6956379901259}. Best is trial 1 with value: 0.02791.[0m
[32m[I 2021-11-30 15:28:50,225][0m Trial 3 finished with value: 0.02809 and parameters: {'stdev': 0.08571921474243646, 'rank': 57, 'reg_w': 52.79461650879698, 'reg_V': 560.17316

[32m[I 2021-11-30 16:04:14,261][0m Trial 35 finished with value: 0.02784 and parameters: {'stdev': 0.04454454132717176, 'rank': 12, 'reg_w': 9.873200493343049, 'reg_V': 9957.282272995817}. Best is trial 6 with value: 0.02783.[0m
[32m[I 2021-11-30 16:04:49,907][0m Trial 36 finished with value: 0.02805 and parameters: {'stdev': 0.05606499356102778, 'rank': 21, 'reg_w': 45.602686570203204, 'reg_V': 896.3404410142319}. Best is trial 6 with value: 0.02783.[0m
[32m[I 2021-11-30 16:05:41,317][0m Trial 37 finished with value: 0.02783 and parameters: {'stdev': 0.06559402749287929, 'rank': 29, 'reg_w': 14.028092671916072, 'reg_V': 586.9152281180716}. Best is trial 6 with value: 0.02783.[0m
[32m[I 2021-11-30 16:05:51,809][0m Trial 38 finished with value: 0.02788 and parameters: {'stdev': 0.012506413066845447, 'rank': 5, 'reg_w': 21.393932156666146, 'reg_V': 486.5560121715237}. Best is trial 6 with value: 0.02783.[0m
[32m[I 2021-11-30 16:06:23,899][0m Trial 39 finished with value: 0.

[32m[I 2021-11-30 16:42:23,877][0m Trial 71 finished with value: 0.02784 and parameters: {'stdev': 0.02002897683641198, 'rank': 31, 'reg_w': 16.64456537096102, 'reg_V': 384.4617112892503}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 16:42:56,267][0m Trial 72 finished with value: 0.02783 and parameters: {'stdev': 0.06557516968419655, 'rank': 28, 'reg_w': 13.260226976919407, 'reg_V': 167.35532184191348}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 16:43:47,120][0m Trial 73 finished with value: 0.02782 and parameters: {'stdev': 0.06975248883027775, 'rank': 28, 'reg_w': 12.027634082887204, 'reg_V': 485.24309755316597}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 16:44:20,746][0m Trial 74 finished with value: 0.02791 and parameters: {'stdev': 0.07096090698579341, 'rank': 29, 'reg_w': 6.691406534054806, 'reg_V': 291.28949770497945}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 16:45:31,084][0m Trial 75 finished with val

[32m[I 2021-11-30 17:18:47,132][0m Trial 106 finished with value: 0.02795 and parameters: {'stdev': 0.09952415640949686, 'rank': 64, 'reg_w': 31.46028600720451, 'reg_V': 296.67672159917385}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 17:20:31,618][0m Trial 107 finished with value: 0.02786 and parameters: {'stdev': 0.09821257059746162, 'rank': 57, 'reg_w': 20.124158013163154, 'reg_V': 472.5256820806603}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 17:22:01,438][0m Trial 108 finished with value: 0.02801 and parameters: {'stdev': 0.09380223464042296, 'rank': 49, 'reg_w': 5.0201086641214605, 'reg_V': 422.80870331741124}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 17:23:00,323][0m Trial 109 finished with value: 0.02782 and parameters: {'stdev': 0.0894716683298576, 'rank': 52, 'reg_w': 12.612010027620002, 'reg_V': 327.85635544365346}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 17:24:18,948][0m Trial 110 finished wit

[32m[I 2021-11-30 17:58:21,239][0m Trial 141 finished with value: 0.02782 and parameters: {'stdev': 0.0727206143574668, 'rank': 58, 'reg_w': 12.134110453002135, 'reg_V': 407.88214293863535}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 18:00:05,501][0m Trial 142 finished with value: 0.02782 and parameters: {'stdev': 0.07386878933711391, 'rank': 59, 'reg_w': 12.829475327895368, 'reg_V': 630.360693044685}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 18:01:14,637][0m Trial 143 finished with value: 0.02952 and parameters: {'stdev': 0.09158689220362153, 'rank': 43, 'reg_w': 1.0122188707168502, 'reg_V': 1099.3168940129744}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 18:02:17,067][0m Trial 144 finished with value: 0.02782 and parameters: {'stdev': 0.09168803409329997, 'rank': 54, 'reg_w': 13.218494432274513, 'reg_V': 183.2607925815599}. Best is trial 46 with value: 0.02782.[0m
[32m[I 2021-11-30 18:03:18,003][0m Trial 145 finished with

KeyboardInterrupt: 

In [43]:
fm = als.FMRegression(n_iter=100, 
                      init_stdev=0.07165817687156682, 
                      rank=40, 
                      l2_reg_w=12.182818481739726, 
                      l2_reg_V=367.72337718279016)

X_train_shuffled, y_train_shuffled = unison_shuffled_copies(X_train_sparse, y_train)
fm.fit(X_train_shuffled, y_train_shuffled)

y_pred=fm.predict(X_train_sparse)
print("Training MSE: ", MSE(y_train, y_pred))
print("Training MAE: ", MAE(y_train, y_pred))

y_pred=fm.predict(X_val_sparse)
print("Validation MSE: ", MSE(y_val, y_pred))
print("Validation MAE: ", MAE(y_val, y_pred))

y_pred=fm.predict(X_test_sparse)
print("Testing MSE: ", MSE(y_test, y_pred))
print("Testing MAE: ", MAE(y_test, y_pred))

Training MSE:  0.01235
Training MAE:  0.08655
Validation MSE:  0.02782
Validation MAE:  0.13164
Testing MSE:  0.02643
Testing MAE:  0.12807
