<a href="https://colab.research.google.com/github/Colley-K/rock_climbing_recommendation_system/blob/master/create_top_ten_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a Top 10 recommendations dataframe
* Using the SVD++ algorithm, this notebook computes predictions on the sample6 data (a sample of the 6x-filtered data, not the full set) and turns them into a dataframe of the top ten predictions for each user
>* If I had more memory, I would like to run this on the entire dataframe
>* The max sample size I could do without google colab crashing was a 65% sample of the original 6x filtered df
* *The `build_anti_testset` function takes a lot of RAM, so I needed to run this step outside my main ML notebook*

In [1]:
#Colab stuffs:

#show which GPU (if any) is attached to this Colab runtime
!nvidia-smi

#mount google drive
from google.colab import drive
drive.mount('/content/drive')#click on the link it provides and copy and paste that code into the authorization area

#import os so the current working directory can be inspected:
import os 
os.getcwd() 

Thu Jul 11 19:14:54 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.67       Driver Version: 410.79       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8    27W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

'/content'

In [2]:
#upload the local helper modules this notebook imports (the saved output shows function.py and final_ml_predict.py)
from google.colab import files
uploaded = files.upload()

#second upload prompt
#NOTE(review): the import is repeated and `uploaded` is overwritten by this second call
from google.colab import files
uploaded = files.upload()

Saving function.py to function (1).py


Saving final_ml_predict.py to final_ml_predict (1).py


In [3]:
#Installs (shell command run by the notebook; the version is not pinned)
!pip install surprise

#import coldstart_thresholds as cst
#project helper modules (presumably the function.py / final_ml_predict.py files uploaded above)
import function as f
import final_ml_predict as fml
import numpy as np 
import pandas as pd 
from collections import defaultdict
import surprise as sp
from surprise.model_selection import cross_validate, KFold, train_test_split, GridSearchCV

#Setting defaults: show at most 10 columns when a dataframe is displayed
pd.set_option('display.max_columns', 10)



In [0]:
#load rock6x.csv from the mounted Google Drive (the path is relative to the /content working directory)
rock6x = pd.read_csv('drive/My Drive/Capstone/data/rock6x.csv', low_memory=False)

In [5]:
#(rows, columns) of the loaded dataframe
rock6x.shape

(334748, 10)

In [6]:
#create a sample out of rock6x dataframe; 0.65 is passed as the sample size
#(presumably a 65% fraction, not 40% as this comment used to say — TODO confirm against function.py)
samp6= f.df_samp_unique_vals(rock6x, .65, "user_id", "route")

User drop: 17878
Route drop: 7195
length of entire sample w/ unique users & routes: 4677
Num of unique routes: 4677
Num of unique users: 4677
Matrix size: 21874329
Shape of df: (47475, 10)
Density of matrix: 0.002170352288291906


In [0]:
def top_ten_df(df, n=10, random_state=None):
    '''
    Train SVD++ on every rating in df, predict all unseen (user, route) pairs, and tabulate the top-n predictions per user.

    inputs:
    df (Pandas DF) the dataframe that you would like to train on/NOTE: use f.df_samp_unique_vals() to get a smaller DF if you dont have enough memory to run full DF
    n (int, default 10) how many predictions to keep per user; forwarded to f.get_top_n
    random_state (int or None, default None) seed forwarded to sp.SVDpp; None keeps Surprise's default (unseeded) behaviour

    outputs:
    top_ten_df (DataFrame Pandas) returns a dataframe with one column per key of the f.get_top_n dictionary (presumably one per user)
    and one row per rank, holding whatever entries f.get_top_n returns for that key.
    Users with fewer than n predictions are padded with NaN instead of raising a ValueError.
    '''

    data = f.read_data_surprise(df)#use f.df_samp_unique_vals() to get a smaller DF if you dont have enough memory to run full DF

    # First train an SVD++ algorithm on the entire dataset (no hold-out split)
    trainset = data.build_full_trainset()
    algo = sp.SVDpp(n_epochs=18, lr_all=0.01, reg_all=0.175, random_state=random_state)
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()#HEAVY THIS TAKES THE MOST RAM
    predictions = algo.test(testset)
    # NOTE(review): anti-testset pairs have no observed rating (per the Surprise docs the
    # "true" value defaults to the trainset global mean), so this printed RMSE is not a
    # held-out accuracy figure. Kept so the cell output is unchanged.
    sp.accuracy.rmse(predictions)

    #create a dictionary of predictions
    top_n = f.get_top_n(predictions, n=n)

    #Turn the dictionary into a df. Each list is wrapped in an object Series so that lists of
    #different lengths align on rank and are padded with NaN; pd.DataFrame(top_n) would raise
    #"All arrays must be of the same length" in that case.
    recommendations = pd.DataFrame(
        {user: pd.Series(recs, dtype=object) for user, recs in top_n.items()}
    )

    return recommendations

In [0]:
#build the top-ten table from the sample (trains SVD++ and scores the whole anti-testset, so this is the memory-heavy cell)
top_n_predict65_wparams = top_ten_df(samp6)

In [0]:
#NOTE(review): predictions6f, algo and testset6f are not assigned in any cell visible in this notebook
#(the algo inside top_ten_df is local to that function), so on a fresh kernel this cell would
#presumably raise NameError — confirm where these names come from before relying on the RMSE below
sp.accuracy.rmse(predictions6f)
f.violin_accuracy(algo, testset6f)

RMSE: 0.6638


In [0]:
#Export to google drive as CSV; header=True writes the column labels as the first row
#(index = None is falsy, so presumably no index column is written — TODO confirm)
top_n_predict65_wparams.to_csv(r'/content/drive/My Drive/Capstone/data/top_n_predict65_wparams.csv', index = None, header=True)

In [10]:
#preview the first rows of the top-ten table
top_n_predict65_wparams.head()

Unnamed: 0,1,11,12,6,2,...,47000,66968,66301,63921,61713
0,"(abstrakt, 3)","(abstrakt, 2.88670383777043)","(abstrakt, 2.965450796247701)","(abstrakt, 3)","(abstrakt, 3)",...,"(abstrakt, 3)","(abstrakt, 2.9770649908094136)","(abstrakt, 2.815107198447452)","(abstrakt, 2.9932757950442443)","(abstrakt, 3)"
1,"(stromlinie, 3)","(vestpillaren, 2.7799805233998978)","(hegar, 2.809462278617374)","(magnetfinger, 3)","(vestpillaren, 2.907098380872724)",...,"(magnetfinger, 3)","(vestpillaren, 2.8794411568884124)","(magnetfinger, 2.7489119052501017)","(vestpillaren, 2.917344459767645)","(magnetfinger, 3)"
2,"(magnetfinger, 3)","(magnetfinger, 2.7457047518051616)","(thaiboxing, 2.767406704232296)","(misericorde, 3)","(magnetfinger, 2.9007281593044647)",...,"(titanofestivalen, 3)","(kachoong, 2.8337364580708475)","(vestpillaren, 2.7129596880725875)","(magnetfinger, 2.8833288714238656)","(vestpillaren, 3)"
3,"(duroxmanie, 3)","(thaiboxing, 2.727071911280836)","(vestpillaren, 2.7656782513690805)","(titanofestivalen, 3)","(villskudd, 2.8694463553324128)",...,"(villskudd, 3)","(magnetfinger, 2.8200310771490193)","(kachoong, 2.675970626138466)","(snapdragon, 2.8691523731053294)","(hegar, 3)"
4,"(orestes, 3)","(gluttony, 2.7208677397953855)","(quimera, 2.745291868794412)","(villskudd, 3)","(superlative, 2.8672495726148406)",...,"(vestpillaren, 3)","(prismaster, 2.8185161854614624)","(hegar, 2.673604979353643)","(prismaster, 2.861558261528032)","(prismaster, 2.9984506628295846)"
