In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Selects which embeddings the notebook reads: True uses the 'embeddings/vae' directory,
# False uses 'embeddings/cdae' (see the path configuration in the next cell).
is_vae = True

In [3]:
# Root directory of the processed dataset.
# It can be overridden with the ML20M_DATA_DIR environment variable so the notebook
# runs on other machines; the default is the original author's local path.
DATA_DIR = os.environ.get(
    'ML20M_DATA_DIR',
    '/Users/tomas/Documents/FEUP/Tese/data/ml-20m/processed_70_10_20')

# Pick the embeddings directory and the metadataset file name for the chosen model,
# so that `file` is defined whichever branch is taken.
if is_vae:
    PARSE_DATA_DIR = os.path.join(DATA_DIR, 'embeddings/vae')
    file = 'metadataset_k_20.csv'
else:
    PARSE_DATA_DIR = os.path.join(DATA_DIR, 'embeddings/cdae')
    file = '200_fac_metadataset_k_20.csv'

In [4]:
# Name of the metadataset CSV to load from PARSE_DATA_DIR.
# NOTE(review): this assignment is unconditional, so it also replaces the
# '200_fac_metadataset_k_20.csv' name chosen in the cdae branch of the previous cell --
# confirm that this override is intended when is_vae is False.
file = 'metadataset_k_20.csv'

In [5]:
# Load the metadataset CSV into a DataFrame and preview its first rows.
metadataset_path = os.path.join(PARSE_DATA_DIR, file)
metadataset = pd.read_csv(metadataset_path)
metadataset.head()

Unnamed: 0,original_id,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,first_place
0,7648,-0.431028,0.273519,0.000467,-0.021366,0.411798,0.445795,1.206461,0.437548,-0.670044,...,1.261934,0.238986,0.150137,-0.024725,-0.349221,3.509393,-0.146421,1.508804,-0.859744,als_ndcg
1,10208,0.351687,-0.626017,0.034143,-0.080265,0.101129,-0.229498,0.261393,-1.189405,-0.451783,...,0.964082,0.825656,0.064932,0.042978,-1.043557,0.562128,0.23456,0.813003,0.511105,zeroes
2,13315,-0.706174,-0.03129,-0.005091,-0.097111,0.111871,0.484369,0.737223,-1.132764,1.119018,...,1.040254,-1.375516,-0.064951,0.030435,0.757224,0.444452,0.753969,-0.582505,0.761838,bpr_ndcg
3,16144,0.672244,-0.958536,-0.005133,-0.093083,0.118219,-0.32569,1.434977,0.006304,-0.780396,...,1.159395,-0.74661,0.042197,0.038217,-0.813577,1.753534,-0.063353,-0.829087,-1.12244,most_popular_ndcg
4,18064,-0.813108,0.897909,-0.105261,0.08041,0.099298,-1.109625,2.775797,0.139941,-0.745728,...,0.584199,0.619601,0.027197,0.051431,-1.668557,1.609979,-1.117613,1.222671,-2.145877,lmf_ndcg


In [6]:
# Class distribution of the target column: absolute count and fraction per label.
# Labels come from value_counts() itself (sorted by descending count), so every count is
# printed next to the label it actually belongs to instead of a hard-coded name list.
total = metadataset.shape[0]
class_counts = metadataset.first_place.value_counts()
names = list(class_counts.index)
for name, count in class_counts.items():
    print("count % ",count)
    print(name," % ",count/total)

count %  34215
als_ndcg  %  0.2636851961743875
count %  29088
most_popular_ndcg  %  0.22417287699314872
count %  25925
bpr_ndcg  %  0.19979654276840556
count %  21277
zeroes  %  0.16397573926647502
count %  19252
lmf_ndcg  %  0.14836964479758316


### Encode Target

In [21]:
# Encode the string labels of 'first_place' as integers.
# LabelEncoder numbers the classes in sorted order, which for the five labels
# seen in this dataset gives:
#   als_ndcg: 0
#   bpr_ndcg: 1
#   lmf_ndcg: 2
#   most_popular_ndcg: 3
#   zeroes: 4
target_pre = metadataset['first_place'].values 
label_encoder = LabelEncoder()
# `target` holds the integer codes; label_encoder.inverse_transform maps them back to names.
target = label_encoder.fit_transform(target_pre)

### Input Normalization

In [22]:
# When True the features are standardized with StandardScaler in the next cell;
# when False the raw feature columns are used as model inputs.
normalize = True

In [23]:
# Feature matrix: every column except the target and the user identifier.
features = metadataset.drop(columns=['first_place','original_id'])

if normalize:
    # Standardize each feature to zero mean and unit variance.
    # NOTE(review): the scaler is fitted on all rows before the cross-validation split,
    # so statistics of the test folds influence the scaling -- consider fitting per fold.
    scaler = StandardScaler()
    inputs_transform = scaler.fit_transform(features)
    # The scaler returns a plain array, so the columns are renumbered 0..n-1.
    inputs = pd.DataFrame(inputs_transform)
else:
    inputs = features

In [24]:
# Preview the first rows of the model inputs.
inputs.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.762653,0.502876,-0.440999,0.57874,6.48732,0.846224,1.059278,0.77762,-1.36287,0.13231,...,0.737741,2.757368,0.472769,2.932462,-0.474176,-0.526718,3.70292,-0.302123,1.995282,-1.62741
1,0.780034,-1.231799,0.411887,-0.783547,1.087678,-0.557226,0.164737,-2.081762,-0.935257,0.61295,...,-1.196858,2.117003,1.549487,1.140396,1.089726,-1.668867,0.355279,0.447397,1.05302,0.980292
2,-1.304951,-0.084922,-0.581762,-1.173176,1.274384,0.926392,0.615127,-1.982215,2.142214,-0.955726,...,-0.042859,2.280769,-2.490335,-1.59135,0.799997,1.293329,0.221618,1.469252,-0.836794,1.457249
3,1.411833,-1.873034,-0.582824,-1.080004,1.384711,-0.75714,1.275576,0.019706,-1.579068,-1.203121,...,-2.283741,2.536914,-1.3361,0.662212,0.979756,-1.29056,1.708534,-0.138699,-1.170718,-2.127123
4,-1.515711,1.706955,-3.118682,2.932744,1.055853,-2.386381,2.544711,0.254573,-1.511147,-1.952087,...,-1.104837,1.300275,1.171314,0.346739,1.284994,-2.696961,1.545478,-2.212789,1.607798,-4.07396


### Model

In [25]:
# 5-fold cross-validation splitter (default settings: consecutive folds, no shuffling).
# Printing it shows the configuration; the previous bare kf.get_n_splits() call was
# dropped because its return value was discarded.
kf = KFold(n_splits=5)
print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [26]:
# Hyper-parameters passed to the MLPClassifier trained in the cross-validation loop.
params = dict(
    activation='relu',
    alpha=0.05,
    hidden_layer_sizes=(50, 50, 50),
    learning_rate='constant',
    solver='sgd',
)

In [None]:
# 5-fold cross-validation of the MLP meta-learner.
# NOTE: base_level_eval is defined in a later cell of this notebook; that cell must be
# executed before this one, otherwise the loop fails with a NameError.

# Per-fold results.
reports = []                          # classification_report dicts
base_impact_with_zeroes = []          # mean base-level value, 'zeroes' predictions kept
base_impact_without_zeroes_most = []  # 'zeroes' replaced by most_popular_ndcg
base_impact_without_zeroes_best = []  # 'zeroes' replaced by the best value of the row
matrix = []                           # confusion matrices

# Loop-invariant setup, hoisted out of the fold loop.
class_names = label_encoder.classes_          # same values as np.unique of the raw labels
class_labels = np.arange(len(class_names))    # integer codes produced by the encoder
np.set_printoptions(suppress=True)

for i, (train_index, test_index) in enumerate(kf.split(inputs), start=1):
    print('iteration: ', i)
    # Data of the current fold.
    X_train, X_test = inputs.iloc[train_index], inputs.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Train a fresh model for this fold.
    print('fit')
    clf = MLPClassifier(random_state=0,
                        max_iter=300,
                        verbose=5,
                        **params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Passing `labels` explicitly keeps the report rows and the confusion matrix shape
    # identical across folds, even if a class is missing from one fold.
    report = classification_report(y_test,
                                   y_pred,
                                   labels=class_labels,
                                   target_names=class_names,
                                   output_dict=True)

    # Base-level impact of following the predicted algorithm for each test user.
    bl_zeroes, bl_no_zeroes_most, bl_no_zeroes_best = base_level_eval(
        metadataset.iloc[test_index]['original_id'].values,
        list(label_encoder.inverse_transform(y_pred)))

    base_impact_with_zeroes.append(bl_zeroes)
    base_impact_without_zeroes_most.append(bl_no_zeroes_most)
    base_impact_without_zeroes_best.append(bl_no_zeroes_best)

    matrix.append(confusion_matrix(y_test, y_pred, labels=class_labels))
    reports.append(report)

    print('end: ', i)


iteration:  1
fit
Iteration 1, loss = 1.63828318
Iteration 2, loss = 1.60920346
Iteration 3, loss = 1.59945346
Iteration 4, loss = 1.59269732
Iteration 5, loss = 1.58698897
Iteration 6, loss = 1.58234605
Iteration 7, loss = 1.57821292
Iteration 8, loss = 1.57464012
Iteration 9, loss = 1.57145871
Iteration 10, loss = 1.56851788
Iteration 11, loss = 1.56588084
Iteration 12, loss = 1.56337162
Iteration 13, loss = 1.56118501
Iteration 14, loss = 1.55903726
Iteration 15, loss = 1.55702379
Iteration 16, loss = 1.55508423
Iteration 17, loss = 1.55327655
Iteration 18, loss = 1.55152049
Iteration 19, loss = 1.54976155
Iteration 20, loss = 1.54818858
Iteration 21, loss = 1.54666260
Iteration 22, loss = 1.54519199
Iteration 23, loss = 1.54368514
Iteration 24, loss = 1.54233001
Iteration 25, loss = 1.54096628
Iteration 26, loss = 1.53964596
Iteration 27, loss = 1.53846388
Iteration 28, loss = 1.53717346
Iteration 29, loss = 1.53597488
Iteration 30, loss = 1.53481334
Iteration 31, loss = 1.53377662



starting base_level_eval
end:  1
iteration:  2
fit
Iteration 1, loss = 1.63845910
Iteration 2, loss = 1.60929621
Iteration 3, loss = 1.59936426
Iteration 4, loss = 1.59230906
Iteration 5, loss = 1.58643718
Iteration 6, loss = 1.58140231
Iteration 7, loss = 1.57708393
Iteration 8, loss = 1.57333025
Iteration 9, loss = 1.56998688
Iteration 10, loss = 1.56693795
Iteration 11, loss = 1.56423254
Iteration 12, loss = 1.56165496
Iteration 13, loss = 1.55937162
Iteration 14, loss = 1.55720850
Iteration 15, loss = 1.55515127
Iteration 16, loss = 1.55321293
Iteration 17, loss = 1.55131962
Iteration 18, loss = 1.54960670
Iteration 19, loss = 1.54788987
Iteration 20, loss = 1.54629600
Iteration 21, loss = 1.54473743
Iteration 22, loss = 1.54328159
Iteration 23, loss = 1.54182392
Iteration 24, loss = 1.54054238
Iteration 25, loss = 1.53915718
Iteration 26, loss = 1.53786583
Iteration 27, loss = 1.53661909
Iteration 28, loss = 1.53538977
Iteration 29, loss = 1.53419154
Iteration 30, loss = 1.5331354

In [None]:
# Average the per-fold classification reports and print them as a table.
# NOTE: report_average and print_report are defined in later cells of this notebook;
# those cells must have been executed before this one.
avg_reports = report_average(reports)
print_report(avg_reports)

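##### Helper functions (printing, report averaging, base-level evaluation) — run these cells before the cross-validation loop above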

In [17]:
def print_report(avg_reports):
    """Print an averaged classification report as a table, followed by the accuracy.

    Args:
        avg_reports: dict mapping each label to a dict with the keys 'precision',
            'recall' and 'f1-score', plus an 'accuracy' key holding a single number
            (for example the value returned by report_average).

    Raises:
        KeyError: if 'accuracy' is missing from avg_reports.
    """
    from prettytable import PrettyTable

    table = PrettyTable()
    table.field_names = ["Algorithm", "Precision", "Recall", "F1"]

    for label, scores in avg_reports.items():
        # Exact comparison: the former `label in 'accuracy'` was a substring test and
        # also matched labels such as 'a' or 'acc'.
        if label == 'accuracy':
            # Accuracy has no per-metric breakdown; show a separator row instead.
            table.add_row(['---', '---', '---', '---'])
            continue
        table.add_row([label,
                       scores['precision'],
                       scores['recall'],
                       scores['f1-score']])

    print(table)
    print('Accuracy: ', avg_reports['accuracy'])

In [18]:
def print_confusion(values, classes):
    """Print a confusion matrix as a table with one labelled row per class.

    Args:
        values: iterable of matrix rows (each row an iterable of counts).
        classes: dict mapping the row position (0, 1, ...) to the class name; its
            values, in order, are also used as the column headers.
    """
    from prettytable import PrettyTable
    table = PrettyTable()
    print(classes)

    # Header: a leading 'algorithm' column followed by one column per class.
    table.field_names = ['algorithm', *classes.values()]

    # Each matrix row is prefixed with the name of the class it belongs to.
    for position, counts in enumerate(values):
        table.add_row([classes[position], *counts])
    print(table)

In [19]:
def report_average(reports):
    """Average a list of classification-report dicts entry by entry.

    Args:
        reports: non-empty list of dicts sharing the same structure. Each key maps
            either to a dict of metrics or to a single number (as 'accuracy' does).
            The keys of the first report determine which entries are averaged.

    Returns:
        dict with the same structure as one report, each number replaced by its mean
        over all reports.

    Raises:
        IndexError: if reports is empty.
    """
    n_reports = len(reports)
    mean_dict = dict()
    for label, first_entry in reports[0].items():
        # Decide by the entry's shape rather than with `label in 'accuracy'`: that was a
        # substring test which mis-handled class labels such as 'a' or 'acc'.
        if not isinstance(first_entry, dict):
            mean_dict[label] = sum(r[label] for r in reports) / n_reports
            continue

        mean_dict[label] = {
            key: sum(r[label][key] for r in reports) / n_reports
            for key in first_entry
        }

    return mean_dict

In [20]:
def base_level_eval(users, predictions, results_algo=None):
    """Average the base-level value obtained by following the predicted algorithm per user.

    Args:
        users: list of user ids (values of the 'original_id' column).
        predictions: predicted column name for each user; it must be in the same order
            as `users`.
        results_algo: optional DataFrame with an 'original_id' column and one column per
            predictable label. When None it is read from 'results_metadataset.csv'
            inside DATA_DIR.

    Returns:
        Tuple of three means over the given users:
            - the value of the predicted column;
            - the same, with 'zeroes' predictions replaced by 'most_popular_ndcg';
            - the same, with 'zeroes' predictions replaced by the row maximum over all
              columns except 'original_id'.

    Raises:
        Exception: if a requested user has more than one row in results_algo.
        IndexError: if a requested user has no row in results_algo.
        KeyError: if a predicted column does not exist in results_algo.
    """
    print('starting base_level_eval')
    if results_algo is None:
        results_algo = pd.read_csv(os.path.join(DATA_DIR, 'results_metadataset.csv'))

    # Index the table by user id once, instead of scanning every row for every user.
    ids = results_algo['original_id']
    ambiguous_ids = set(ids[ids.duplicated()])
    by_user = results_algo.drop_duplicates(subset='original_id').set_index('original_id')
    best_by_user = None  # row-wise maximum, computed on the first 'zeroes' prediction

    base_impact = []
    base_impact_zeroes_most = []
    base_impact_zeroes_best = []
    for user_uid, pred in zip(users, predictions):
        # Validate before any value is used.
        if user_uid in ambiguous_ids:
            raise Exception("More than one case")
        if user_uid not in by_user.index:
            raise IndexError("no results row for user %r" % (user_uid,))

        val = by_user.at[user_uid, pred]
        base_impact.append(val)
        if pred == 'zeroes':
            if best_by_user is None:
                # Same as dropping 'original_id' and taking the maximum of each row;
                # the keyword form avoids the positional axis removed in pandas 2.0.
                best_by_user = by_user.max(axis=1)
            base_impact_zeroes_most.append(by_user.at[user_uid, 'most_popular_ndcg'])
            base_impact_zeroes_best.append(best_by_user.at[user_uid])
        else:
            base_impact_zeroes_most.append(val)
            base_impact_zeroes_best.append(val)

    return np.mean(base_impact), np.mean(base_impact_zeroes_most), np.mean(base_impact_zeroes_best)



### Grid search

In [29]:
# Hyper-parameter grid explored by the grid search (3 * 2 * 2 * 2 * 2 = 48 candidates).
grid_params = dict(
    hidden_layer_sizes=[(50,50,50), (50,100,50), (100,)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant','adaptive'],
)

In [30]:
# Base estimator for the grid search. The seed is fixed (the same value the
# cross-validation loop uses) so that the search is reproducible between runs.
mlp = MLPClassifier(max_iter=300, random_state=0)

In [31]:
# Search over grid_params with 5-fold cross-validation, using all available cores
# (n_jobs=-1) and verbose progress output.
# NOTE: this rebinds `clf`, which the cross-validation loop used for the per-fold
# MLPClassifier; from here on `clf` is the fitted GridSearchCV object.
clf = GridSearchCV(mlp, grid_params, n_jobs=-1, cv=5, verbose=5)
clf.fit(inputs,target)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 60.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 134.6min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=300,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_sta...
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alph

In [32]:
# Best hyper-parameter combination found by the grid search.
clf.best_params_

{'activation': 'tanh',
 'alpha': 0.05,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'adaptive',
 'solver': 'sgd'}