In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import svm
import pandas as pd
import numpy as np
import sys
import os

In [2]:
# Selects which embedding set is loaded: True -> 'embeddings/vae', False -> 'embeddings/cdae'.
is_vae = True

In [3]:
# Root directory of the processed dataset.
# The original location stays the default so existing runs are unchanged, but it
# can be overridden without editing the notebook by setting the
# METADATASET_DATA_DIR environment variable (the default is machine-specific).
DATA_DIR = os.environ.get(
    'METADATASET_DATA_DIR',
    '/Users/tomas/Documents/FEUP/Tese/data/ml-20m/processed_70_10_20',
)

# The embeddings live in a model-specific sub-directory selected by `is_vae`.
PARSE_DATA_DIR = os.path.join(
    DATA_DIR,
    'embeddings/vae' if is_vae else 'embeddings/cdae',
)

In [4]:
# File name of the metadataset CSV; it is joined with PARSE_DATA_DIR when the data is read.
file = 'metadataset_k_20.csv'

In [6]:
# Load the metadataset CSV with pandas.
metadataset_path = os.path.join(PARSE_DATA_DIR, file)
metadataset = pd.read_csv(metadataset_path)

# Keep only the rows whose winning algorithm is not the 'zeroes' placeholder.
has_winner = metadataset['first_place'] != 'zeroes'
metadataset = metadataset.loc[has_winner]
metadataset.head()

Unnamed: 0,original_id,1,2,3,4,5,6,7,8,9,...,192,193,194,195,196,197,198,199,200,first_place
0,7648,-0.431028,0.273519,0.000467,-0.021366,0.411798,0.445795,1.206461,0.437548,-0.670044,...,1.261934,0.238986,0.150137,-0.024725,-0.349221,3.509393,-0.146421,1.508804,-0.859744,als_ndcg
2,13315,-0.706174,-0.03129,-0.005091,-0.097111,0.111871,0.484369,0.737223,-1.132764,1.119018,...,1.040254,-1.375516,-0.064951,0.030435,0.757224,0.444452,0.753969,-0.582505,0.761838,bpr_ndcg
3,16144,0.672244,-0.958536,-0.005133,-0.093083,0.118219,-0.32569,1.434977,0.006304,-0.780396,...,1.159395,-0.74661,0.042197,0.038217,-0.813577,1.753534,-0.063353,-0.829087,-1.12244,most_popular_ndcg
4,18064,-0.813108,0.897909,-0.105261,0.08041,0.099298,-1.109625,2.775797,0.139941,-0.745728,...,0.584199,0.619601,0.027197,0.051431,-1.668557,1.609979,-1.117613,1.222671,-2.145877,lmf_ndcg
5,24397,0.891482,-1.064005,0.014642,-0.073796,0.273126,0.394037,1.217203,-0.783643,1.011596,...,0.687227,-0.272977,0.078848,-0.01248,-1.249612,1.758417,-0.38983,2.174819,-0.597049,lmf_ndcg


### Encode Target

In [7]:
# Encode the winning algorithm name in 'first_place' as an integer class id.
# LabelEncoder numbers the classes in sorted order of the label strings, so for
# the labels visible in the preview the mapping is:
#   als_ndcg: 0
#   bpr_ndcg: 1
#   lmf_ndcg: 2
#   most_popular_ndcg: 3
# NOTE(review): an earlier note also listed a 'zeros' class (4), but rows labelled
# 'zeroes' are filtered out when the data is loaded, so presumably it never
# appears here -- confirm with label_encoder.classes_.
target_pre = metadataset['first_place'].values 
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target_pre)

### Input Normalization

In [8]:
# When True the feature columns are standardized with StandardScaler; when False they are used as loaded.
normalize = True

In [9]:
# Columns that are not model features: the target, the user identifier and
# 'Unnamed: 0', the row index that was written into the CSV and comes back as a
# regular column on load (it is visible in the preview of the loaded frame).
# Leaving it in would feed the row number to the classifier as a feature.
non_feature_columns = ['first_place', 'original_id', 'Unnamed: 0']
features = metadataset.drop(
    columns=[col for col in non_feature_columns if col in metadataset.columns]
)

if normalize:
    # Standardize every feature to zero mean and unit variance.
    # NOTE(review): the scaler is fitted on all rows, including the ones that are
    # later used as test folds during cross-validation; consider fitting it on
    # each training fold only.
    scaler = StandardScaler()
    inputs_transform = scaler.fit_transform(features)
    inputs = pd.DataFrame(inputs_transform)
else:
    inputs = features

### Model

In [None]:
# 5-fold cross-validation splitter consumed by the training loop.
# NOTE(review): with these arguments the folds are contiguous, unshuffled slices
# and are not stratified by class -- confirm this is intended for an
# imbalanced target.
kf = KFold(n_splits=5)
print(kf)

In [None]:
# SVC hyper-parameters used by the cross-validation loop.
params = dict(C=0.1, gamma=1, kernel='rbf')

In [None]:
# When True, each training fold is oversampled with SMOTE before the classifier is fitted.
is_smote = False

In [None]:
# Class names in encoded order: np.unique returns the labels sorted, which is
# the same order LabelEncoder uses to number the targets.
class_names = np.unique(metadataset['first_place'].values)
# Passing explicit label ids keeps the report well-formed (and aligned with
# class_names) even if a test fold happens to contain no sample of some class.
class_labels = np.arange(len(class_names))

reports = []
for i, (train_index, test_index) in enumerate(kf.split(inputs), start=1):
    print('iteration: ', i)
    # Slice the current fold.
    X_train, X_test = inputs.iloc[train_index], inputs.iloc[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # All hyper-parameters come from `params`. The previous call passed a
    # hard-coded kernel='linear' together with a misspelled `kerner=` keyword,
    # which raised a TypeError before any model was trained.
    clf = svm.SVC(
        C=params['C'],
        gamma=params['gamma'],
        kernel=params['kernel'],
        verbose=True)

    if is_smote:
        # Oversample the training fold only; the test fold is left untouched.
        print('dataset shape %s' % Counter(y_train))
        sm = SMOTE(random_state=42)
        X_train_re, y_train_re = sm.fit_resample(X_train, y_train)
        print('Resampled dataset shape %s' % Counter(y_train_re))
        X_fit, y_fit = X_train_re, y_train_re
    else:
        X_fit, y_fit = X_train, y_train

    print('fit')
    clf.fit(X_fit, y_fit)

    print('predict')
    y_pred = clf.predict(X_test)

    # Only `metrics` is imported from sklearn, so the report function is
    # reached through that module.
    report = metrics.classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        output_dict=True)
    reports.append(report)
    print('end: ', i)

In [None]:
# Average the per-fold classification reports and print the result as a table.
# NOTE(review): report_average and print_report are defined in cells further
# down the notebook, so on a fresh kernel those cells must be run before this one.
avg_reports = report_average(reports)
print_report(avg_reports)

#### Print

In [None]:
def print_report(avg_reports):
    """Print an averaged classification report as a table.

    Parameters
    ----------
    avg_reports : dict
        Mapping as produced by ``report_average``: every key except
        ``'accuracy'`` maps to a dict holding ``'precision'``, ``'recall'``
        and ``'f1-score'``; ``'accuracy'`` maps to a single number.

    Raises
    ------
    KeyError
        If ``avg_reports`` has no ``'accuracy'`` entry or a row lacks one of
        the three score keys.
    """
    from prettytable import PrettyTable

    table = PrettyTable()
    table.field_names = ["Algorithm", "Precision", "Recall", "F1"]

    for label, scores in avg_reports.items():
        # Exact comparison: the former `label in 'accuracy'` was a substring
        # test, so any label that is a substring of "accuracy" (e.g. "acc")
        # was replaced by a separator row instead of being printed.
        if label == 'accuracy':
            # Accuracy is printed separately below; mark its position with a
            # separator row.
            table.add_row(['---', '---', '---', '---'])
            continue
        table.add_row([label,
                       scores['precision'],
                       scores['recall'],
                       scores['f1-score']])

    print(table)
    print('Accuracy: ', avg_reports['accuracy'])

In [None]:
def report_average(reports):
    """Average a list of classification-report dicts entry by entry.

    Parameters
    ----------
    reports : list of dict
        One report per fold, all with the same keys. The ``'accuracy'`` entry
        is a single number; every other entry is a dict of metric name to
        number.

    Returns
    -------
    dict
        Same structure as a single report, with every number replaced by its
        arithmetic mean over ``reports``.

    Raises
    ------
    ValueError
        If ``reports`` is empty.
    """
    if not reports:
        raise ValueError('report_average needs at least one report')

    n_reports = len(reports)
    first = reports[0]
    mean_dict = dict()
    for label in first.keys():
        # Exact comparison: the former `label in 'accuracy'` was a substring
        # test, so a class label such as "acc" was treated as the scalar
        # accuracy entry and its per-metric dict could not be summed.
        if label == 'accuracy':
            mean_dict[label] = sum(d[label] for d in reports) / n_reports
            continue

        mean_dict[label] = {
            key: sum(d[label][key] for d in reports) / n_reports
            for key in first[label].keys()
        }

    return mean_dict

### Grid Search

In [29]:
# Candidate SVC hyper-parameters explored by the grid search (RBF kernel only).
param_grid = dict(
    C=[0.1, 1, 10, 500],
    gamma=[1, 0.1, 0.01],
    kernel=['rbf'],
)

In [None]:
# Exhaustive search over param_grid with 2-fold cross-validation, using every
# available core. The captured log below shows this takes many hours.
grid = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=10,
)
grid.fit(inputs, target)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 532.3min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 978.6min


In [None]:
# Hyper-parameter combination that scored best in the grid search above.
grid.best_params_