In [1]:
from fastai.structured import *
from fastai.column_data import *
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import pickle

In [2]:
# lets load the data
df = pd.read_csv('course_ratings.csv')

In [3]:
#lets remove high rating rows
joined = df.drop(df[df['rating'] > 5].index)

In [4]:
# lets define catigorical vars
cat_vars = ['user','course','category','job','institution','state']
#coninuous vars
cont_vars = ['rating']

# df length
n = len(df)

# test percentage
testp = n*.2

#lets update features, categoricial should be category and continuous should be float32
for v in cat_vars: 
    joined[v] = joined[v].astype('category').cat.as_ordered()
    
for v in cont_vars:
    joined[v] = joined[v].fillna(0).astype('float32')

In [5]:
# we have to convert all category variables into contiguous ones and save the look up table
lookup_table = {}
for c in cat_vars:
    uq = joined[c].unique()
    col2idx =  {o:i for i,o in enumerate(sorted(uq))}
    lookup_table[c] = col2idx
    joined[c] = joined[c].apply(lambda x: col2idx[x])

In [6]:
#lets save the mapper for later use
with open('models/data/lookup_table', 'wb') as fp:
    pickle.dump(lookup_table, fp)

In [7]:
#lets check lookup table
with open ('models/data/lookup_table', 'rb') as fp:
    lut_load = pickle.load(fp)

#lut_load

In [8]:
# create a test/train set split
joined, joined_test = train_test_split(joined, test_size=0.25)
df = joined.drop('rating', axis=1)
y = joined['rating']
df_test = joined_test.drop('rating', axis=1)

# create a validation set of ids
train_ratio = 0.75
train_size = int(len(df) * train_ratio)
val_idx = list(range(train_size, len(df)))

with open('models/data/val_idx', 'wb') as fp:
    pickle.dump(val_idx, fp)
    
with open('models/data/ratings', 'wb') as fp:
    pickle.dump(y, fp)

In [9]:
#final df , everything is numeric and continguous
with open('models/data/final_df', 'wb') as fp:
    pickle.dump(df, fp)

In [10]:
df.head(10)

Unnamed: 0,user,course,category,job,institution,state
47905,3571,165,5,6,0,23
67014,43032,75,7,6,14,42
20054,17559,203,7,6,11,44
97174,42501,44,7,6,0,57
15260,3128,51,7,6,0,3
50672,21806,174,7,6,9,31
127168,22250,146,5,39,0,35
83647,15755,36,7,6,9,45
25068,33139,172,7,6,11,5
84546,24762,172,7,6,0,41


In [12]:
#lets build the initial model
md = ColumnarModelData.from_data_frame("models/", val_idx, df, y.astype(np.float32), cat_flds=cat_vars, bs=128, test_df=df_test)

In [13]:
#build embedding matrix, matrices should be 1 bigger then the the number of categorical options to leave room for unknown
cat_sz = [(c, len(joined[c].cat.categories)+1) for c in cat_vars]
emb_szs = [(c, min(50, (c+1)//2)) for _,c in cat_sz]
with open('models/data/emb_sizes', 'wb') as fp:
    pickle.dump(emb_szs, fp)

In [14]:
#build the learner, 
# .04 dropout on the embedding matrix
# 1000 and 500 nodes on 2 different layers
# .001 and .01 dropouts
# specifying the range of y (rating) to be 0-5
m = md.get_learner(emb_szs,0 ,0.4, 1, [200,100], [0.5,0.01],y_range=(0,5))

In [None]:
m.summary()

In [None]:
#lets find a learning rate
m.lr_find()

In [None]:
m.sched.plot(100)

In [None]:
lr = 1e-3

In [None]:
# Local
#m.fit(lr, 2, cycle_len=2)

In [None]:
# GPU
# lets fit the model, 50 epochs with restarts
m.fit(lr, 10, cycle_len=1, wds=1e-3)

In [None]:
m.save('mdl')

In [None]:
m.load('mdl')

In [None]:
x,y=m.predict_with_targs()

In [None]:
pred_test=m.predict(is_test=True)

In [None]:
joined_test['rating_pred'] = pred_test

In [None]:
joined_test[['rating','rating_pred']].head(10)

In [None]:
rms = sqrt(mean_squared_error(joined_test['rating'], joined_test['rating_pred']))

In [None]:
# we achieved a root mean squared error of .47
rms

In [None]:
test_record = df_test.iloc[0]

In [None]:
joined_test.iloc[0]

In [None]:
cat = test_record[cat_vars].values.astype(np.int64)[None]
contin = test_record.drop(cat_vars).values.astype(np.float32)[None]

#Prediction
model = m.model
model.eval()
prediction = to_np(model(V(cat),[]))

In [None]:
prediction[0][0]

In [None]:
#model is ready, need to create simple api and also test with new data since the original was pulled

In [None]:
# lets load up the test set

In [15]:
post_test = pd.read_csv('test_ratings.csv')

In [16]:
for v in cat_vars: 
    post_test[v] = post_test[v].astype('category').cat.as_ordered()
    
for v in cont_vars:
    post_test[v] = post_test[v].fillna(0).astype('float32')

In [38]:
course_extras = {}
user_extras = {}
unknown_user = df['user'].max() + 1
unknown_course = df['course'].max() + 1

def look_course_up(x):
    global unknown_course
    global course_extras
    
    try:
        val = lookup_table['course'][x]
    except KeyError:
        # user is not there, lets set to unknown
        # lets check extras
        try:
            val = course_extras[x]
        except KeyError:
            # so we really dont have it, set to unknown
            val = unknown_course
            course_extras[x] = unknown_course
            unknown_course = unknown_course + 1
    return val

def look_user_up(x):
    global unknown_user
    global user_extras
    try:
        val = lookup_table['user'][x]
    except KeyError:
        # user is not there, lets set to unknown
        # lets check extras
        try:
            val = user_extras[x]
        except KeyError:
            # so we really dont have it, set to unknown
            val = unknown_user
            user_extras[x] = unknown_user
            unknown_user = unknown_user + 1
    return val

In [36]:
unknown_user

52117

In [37]:
#unknown_course
#unknown_user
#luu = look_user_up('01e15a509a7a66df61edc488557456cc5e937c9c949c99517e3ec080938ec3e388b8522eccf7914f9760d6301b0f1d2a6927bee2b510d8716eb3f4bcad2e24c3')

In [18]:
test_vars = ['category','job','institution','state']

for c in test_vars:
    col2idx =  lookup_table[c]
    post_test[c] = post_test[c].apply(lambda x: col2idx[x])

In [43]:
post_test['user'] = post_test['user'].apply(look_user_up)
post_test['course'] = post_test['course'].apply(look_course_up)

In [44]:
post_test

Unnamed: 0,user,course,category,rating,job,institution,state
0,52674,266,8,4.05,29,0,41
1,52618,246,8,3.23,6,0,41
2,52539,246,8,3.77,6,0,41
3,52439,250,7,4.59,6,9,57
4,52328,261,9,3.73,6,14,39
5,52508,335,9,2.82,6,9,41
6,52508,332,7,3.73,6,9,41
7,52508,247,7,3.73,6,9,41
8,52659,349,7,4.77,6,2,41
9,52471,330,7,3.73,6,14,41


In [None]:
# make a function to apply to test df

In [None]:
def get_rating(df):
    cat = df[cat_vars].values.astype(np.int64)[None]
    prediction = to_np(model(V(cat),[]))
    return prediction[0][0]

In [None]:
post_test['prating'] = post_test.apply(get_rating, axis=1)

In [None]:
post_test.head(10)

In [None]:
rms_pt = sqrt(mean_squared_error(post_test['rating'], post_test['prating']))

In [None]:
rms_pt