In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [2]:
path = r"C:\Users\Shibbs\Desktop\Praxis\CAPP\cwd\datasets\final\df.csv"

In [10]:
def get_only_view_data(path, view_count_limit):
    df = pd.read_csv(path ,sep=',', index_col=False, dtype='unicode')
    only_view = df.loc[df['event_type'] == 'view']
    only_view_count = only_view.groupby(['user_id','product_id'])['event_type'].count().reset_index()
    only_view_count = only_view_count.rename(columns={"event_type": "view_count"})
    only_view_count = only_view_count[only_view_count['view_count']<view_count_limit]
    
    return only_view_count

In [4]:
def get_only_cart_data(path, cart_count_limit):
    df = pd.read_csv(path ,sep=',', index_col=False, dtype='unicode')
    only_cart = df.loc[df['event_type'] == 'cart']
    only_cart_count = only_cart.groupby(['user_id','product_id'])['event_type'].count().reset_index()
    only_cart_count = only_cart_count.rename(columns={"event_type": "cart_count"})
    only_cart_count = only_cart_count[only_cart_count['cart_count']<cart_count_limit]
    
    return only_cart_count

In [5]:
def get_only_purchase_data(path, purchase_count_limit):
    df = pd.read_csv(path ,sep=',', index_col=False, dtype='unicode')
    only_purchase = df.loc[df['event_type'] == 'cart']
    only_purchase_count = only_purchase.groupby(['user_id','product_id'])['event_type'].count().reset_index()
    only_purchase_count = only_purchase_count.rename(columns={"event_type": "purchase_count"})
    only_purchase_count = only_purchase_count[only_purchase_count['purchase_count']<purchase_count_limit]
    
    return only_purchase_count

In [6]:
def setup_training(df):
    reader = Reader(rating_scale=(df.view_count.min(),df.view_count.max()))
    data = Dataset.load_from_df(df, reader)
    dataset = data.build_full_trainset()
    print('Number of users: ',dataset.n_users,'\n')
    print('Number of items: ',dataset.n_items,'\n')
    print('Ready for Learning...')

In [7]:
def train_baselineonly(df, pg):
    


    # bsl_options = {'method': ['als','sgd'],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
    #               'reg_all': [0.4, 0.6]}
    
    bsl_algo = BaselineOnly()
    gs = GridSearchCV(BaselineOnly, pg, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

In [16]:
def train_svd(df, pg):
    reader = Reader(rating_scale=(df.view_count.min(),df.view_count.max()))
    data = Dataset.load_from_df(df, reader)
    dataset = data.build_full_trainset()
    print('Number of users: ',dataset.n_users,'\n')
    print('Number of items: ',dataset.n_items,'\n')
    print('Ready for Learning...')
    
    gs = GridSearchCV(SVD, pg, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # best RMSE score
    print(gs.best_score['mae'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # combination of parameters that gave the best MAE score
    print(gs.best_params['mae'])
    
    return 

In [11]:
df = get_only_view_data(path,50)

In [12]:
setup_training(df)

Number of users:  13284 

Number of items:  53450 

Ready for Learning...


In [20]:
train_baselineonly(df, {'bsl_options':
                                    {'method': ['als'],
                                     'n_epochs': [55], 
                                     'lr_all': [0.001],
                                     'reg_all': [0.4]
                                    }})

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
1.4991115832002453
{'bsl_options': {'method': 'als', 'n_epochs': 50, 'lr_all': 0.001, 'reg_all': 0.4}}


In [26]:
    reader = Reader(rating_scale=(df.view_count.min(),df.view_count.max()))
    data = Dataset.load_from_df(df, reader)
    dataset = data.build_full_trainset()
    print('Number of users: ',dataset.n_users,'\n')
    print('Number of items: ',dataset.n_items,'\n')
    print('Ready for Learning...')
    
    gs = GridSearchCV(SVD, {'n_epochs': [70], 
               'lr_all': [0.002],
               'reg_all': [0.4]}, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # best RMSE score
    print(gs.best_score['mae'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # combination of parameters that gave the best MAE score
    print(gs.best_params['mae'])

Number of users:  13284 

Number of items:  53450 

Ready for Learning...
1.5004584753459038
0.7047929293298957
{'n_epochs': 70, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 70, 'lr_all': 0.002, 'reg_all': 0.4}


In [20]:
train_svd(df, {'n_epochs': [70], 
               'lr_all': [0.002],
               'reg_all': [0.4]})

Number of users:  13284 

Number of items:  53450 

Ready for Learning...
1.4982060806547215
0.7048611258188942
{'n_epochs': 70, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 70, 'lr_all': 0.002, 'reg_all': 0.4}


In [None]:
def make_pred_baselineonly(uid,iid):
    return algo.predict(uid,iid)

In [22]:
def make_pred_svd(uid,iid):
    return svd.predict(uid,iid)

In [28]:
gs.predict('1515915625353230000','73593')

ValueError: refit is False, cannot use predict()

In [24]:
predictions = make_pred_svd('1515915625353230000','73593')

NameError: name 'svd' is not defined