In [None]:
# !pip install lightfm

In [None]:
!pip install pyforest

import pyforest
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from tqdm import tqdm_notebook
# from lightfm import LightFM




In [None]:
# enabling the GPU
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Folder that holds the competition CSV files read in the next cell.
# NOTE(review): this points inside a mounted Google Drive, but no drive.mount(...) call
# is visible in the notebook — presumably the drive is mounted manually; confirm.
PATH = '/content/drive/My Drive/JanataHack_recommendation_engine/'

In [None]:
# Load the two interaction logs and the challenge metadata from PATH.
train, test, challenge = (
    pd.read_csv(f'{PATH}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'challenge_data.csv')
)

In [None]:
# Preview the first rows of the training interactions.
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [None]:
# Row/column counts of the train and test interaction tables, one per line.
print(train.shape, test.shape, sep='\n')

(903916, 4)
(397320, 4)


In [None]:
# Number of missing values in each column of the challenge metadata.
challenge.isna().sum()

challenge_ID               0
programming_language       0
challenge_series_ID       12
total_submissions        352
publish_date               0
author_ID                 39
author_gender             97
author_org_ID            248
category_id             1841
dtype: int64

In [None]:
# Dtypes and non-null counts of the challenge metadata.
challenge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5606 entries, 0 to 5605
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   challenge_ID          5606 non-null   object 
 1   programming_language  5606 non-null   int64  
 2   challenge_series_ID   5594 non-null   object 
 3   total_submissions     5254 non-null   float64
 4   publish_date          5606 non-null   object 
 5   author_ID             5567 non-null   object 
 6   author_gender         5509 non-null   object 
 7   author_org_ID         5358 non-null   object 
 8   category_id           3765 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 394.3+ KB


## LIGHTFM MODEL 

## PROCESSING CHALLENGE DATASET

In [None]:
# Preview the first rows of the challenge metadata.
challenge.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [None]:
# Treat a missing submission count as zero submissions.
# The result is assigned back to the column: calling an inplace method on
# challenge['total_submissions'] works on a temporary Series and does not update
# the frame once pandas Copy-on-Write is active.
challenge['total_submissions'] = challenge['total_submissions'].fillna(0)


In [None]:
# Parse publish_date (day-first strings) and replace it with separate
# day / year / month columns, appended in that order.
challenge = (
    challenge
    .assign(
        publish_date=lambda frame: pd.to_datetime(frame['publish_date'], dayfirst=True),
        publishing_day=lambda frame: frame['publish_date'].dt.day,
        publishing_year=lambda frame: frame['publish_date'].dt.year,
        publishing_month=lambda frame: frame['publish_date'].dt.month,
    )
    .drop(columns=['publish_date'])
)


In [None]:
# Missing values per column after the imputation and date expansion.
challenge.isna().sum()

challenge_ID               0
programming_language       0
challenge_series_ID       12
total_submissions          0
author_ID                 39
author_gender             97
author_org_ID            248
category_id             1841
publishing_day             0
publishing_year            0
publishing_month           0
dtype: int64

In [None]:
# Distinct-value counts of the high-cardinality id columns, one per line.
print(
    *(challenge[column].nunique()
      for column in ('category_id', 'challenge_series_ID', 'author_ID', 'author_org_ID')),
    sep='\n',
)

194
435
3484
1717


In [None]:
# One-hot encode the categorical challenge attributes: every column listed below is
# replaced by one indicator column per distinct value.
# author_ID and author_org_ID are not listed here; they are integer-encoded in later cells.
challenge = pd.get_dummies(challenge, columns = ['programming_language', 'author_gender', 'challenge_series_ID',
                                                                                         'category_id', 'publishing_day', 
                                                                                         'publishing_year','publishing_month'])

In [None]:
def dic_mapping(df, col):
  """Map every distinct value of `df[col]` to a consecutive integer code.

  Codes start at 0 and follow the order in which the values first appear
  in the column.

  Args:
    df: DataFrame that contains the column.
    col: Name of the column to encode.

  Returns:
    dict of {original value: integer code}.
  """
  distinct_values = df[col].unique().tolist()
  return {value: code for code, value in enumerate(distinct_values)}

In [None]:
# Integer codes for the three identifier columns of the challenge metadata.
author_dict, author_org_dict, challenge_dict = (
    dic_mapping(challenge, id_column)
    for id_column in ('author_ID', 'author_org_ID', 'challenge_ID')
)

In [None]:
# Inverse lookup: integer code -> original challenge id.
challenge_dict_demap = dict(zip(challenge_dict.values(), challenge_dict.keys()))

In [None]:
# Replace the string identifiers with their integer codes.
# Each result is assigned back to its column: an inplace replace on challenge[col]
# acts on a temporary Series and does not update the frame once pandas
# Copy-on-Write is active.
challenge['author_ID'] = challenge['author_ID'].replace(author_dict)
challenge['author_org_ID'] = challenge['author_org_ID'].replace(author_org_dict)
challenge['challenge_ID'] = challenge['challenge_ID'].replace(challenge_dict)

In [None]:
# Make the encoded challenge id a true integer and order the rows by it.
challenge = (
    challenge
    .astype({'challenge_ID': int})
    .sort_values(by=['challenge_ID'])
)

In [None]:
# Convert the challenge feature table (everything except the id) to a CSR matrix.
# A float dtype is requested explicitly: when the indicator columns are boolean
# (the get_dummies default in recent pandas) `.values` on the mixed frame yields an
# object array, which scipy cannot turn into a sparse matrix.
challenge_csr = csr_matrix(challenge.drop('challenge_ID', axis=1).to_numpy(dtype=np.float64))

## PROCESSING USER-CHALLENGE PARTICIPATION DATASET

In [None]:
# Preview the user-challenge interaction log.
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


`user_sequence` is unique for every record <br>
`challenge_sequence` takes 13 distinct values (1–13), each occurring equally often (about 69k rows per value) <br>
69k unique users <br>
5348 unique challenges



In [None]:
# Stack the train and test interactions into one table.
train_test = pd.concat([train,test],axis=0)

In [None]:
# user_dict = dic_mapping(train_test, 'user_id')

In [None]:
# Replace challenge ids with the integer codes used for the challenge metadata.
# The result is assigned back: an inplace replace on train_test['challenge'] acts on a
# temporary Series and does not update the frame once pandas Copy-on-Write is active.
train_test['challenge'] = train_test['challenge'].replace(challenge_dict)

In [None]:
# Preview the combined interactions after encoding the challenge column.
train_test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,236
1,4576_2,4576,2,377
2,4576_3,4576,3,1439
3,4576_4,4576,4,185
4,4576_5,4576,5,455


In [None]:
# User x challenge matrix: each cell holds the challenge_sequence of that pair
# (pivot_table's default aggregation is applied if a pair repeats); 0 marks no interaction.
df = pd.pivot_table(
    train_test,
    index='user_id',
    columns='challenge',
    values='challenge_sequence',
).fillna(0)

In [None]:
# Row position of every user in the interaction matrix.
user_id = list(df.index)
user_dict = {uid: row_position for row_position, uid in enumerate(user_id)}


In [None]:
# Convert the dense user x challenge pivot to a SciPy CSR matrix for LightFM.
df_csr = csr_matrix(df.values)

In [None]:

# LightFM is not imported anywhere else in the notebook (the import in the first
# cells is commented out), so this cell raised NameError on a fresh kernel.
# Requires the `lightfm` package to be installed.
from lightfm import LightFM

# Hybrid factorisation model trained with the WARP ranking loss.
model = LightFM(loss='warp',
                random_state=2016,
                no_components=150,
                user_alpha=1e-3)

# NOTE(review): row j of challenge_csr is used as the feature vector of column j of
# df_csr. df only has columns for challenges that occur in train/test, while
# challenge_csr has one row per challenge in the metadata — confirm the two orderings
# actually line up.
model.fit(df_csr,item_features=challenge_csr,
                  epochs=50,
                  num_threads=16, verbose=False)

<lightfm.lightfm.LightFM at 0x7fb5c32a58d0>

# PREDICTIONS

In [None]:
# Presumably the number of challenges to recommend per user — confirm against the
# prediction loop that follows.
n_items=3

In [None]:
# Rank every challenge for each test user and keep the best ones the user has not
# interacted with yet.
top_n = 3  # one recommendation for each of the submitted slots 11, 12 and 13
item_labels = df.columns                       # encoded challenge id of every matrix column
all_item_positions = np.arange(df.shape[1])    # loop-invariant, so built once

prediction = {}
for test_user in tqdm_notebook(test['user_id'].unique()):
  # Challenges this user already has an interaction with (non-zero cells of their row).
  user_row = df.loc[test_user, :]
  known_items = set(user_row[user_row > 0].index)

  scores = pd.Series(model.predict(user_dict[test_user], all_item_positions,
                                   item_features=challenge_csr),
                     index=item_labels)
  ranked_items = scores.sort_values(ascending=False).index

  # Set membership keeps this filter linear in the number of challenges.
  unseen_items = [item for item in ranked_items if item not in known_items]
  prediction[test_user] = [challenge_dict_demap[item] for item in unseen_items[:top_n]]


This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`



HBox(children=(FloatProgress(value=0.0, max=39732.0), HTML(value='')))




In [None]:
# One column per user, one row per recommended challenge.
final = pd.DataFrame(prediction)

In [None]:
# Reshape to the submission layout: one row per (user, sequence slot) with a
# "<user>_<slot>" key for slots 11-13.
final = final.T.reset_index()
final.columns = ['user','11','12','13']
final = (
    final
    .melt(id_vars='user', value_vars=['11','12','13'])
    .assign(user=lambda frame: frame['user'].astype(str))
    .assign(user_sequence=lambda frame: frame['user'] + '_' + frame['variable'])
    .rename(columns={'value':'challenge'})
)

In [None]:
# Write the LightFM submission file (two columns, no index).
final[['user_sequence','challenge']].to_csv('lightfm.csv', index=False)

REFERENCE LINKS:

https://towardsdatascience.com/recommendation-system-in-python-lightfm-61c85010ce17


# KNN based model

In [None]:
# Rebuild the combined interaction table with the original (string) challenge ids.
train_test = pd.concat([train,test],axis=0)

In [None]:
# `train` is intentionally not deleted here: the SVD and Keras sections further down
# concatenate it with `test` again, so removing it made those cells fail on a full run.
# With the delete gone, the preview is also the cell's last expression and is displayed.
train_test.head()

In [None]:
# Challenge x user matrix (one row per challenge); cells hold challenge_sequence, 0 = no interaction.
df = pd.pivot_table(train_test, index='challenge', columns='user_id', values='challenge_sequence').fillna(0)

In [None]:
# Convert the challenge x user pivot to a SciPy CSR matrix for the neighbour search.
df_csr = csr_matrix(df.values)

In [None]:
from sklearn.neighbors import NearestNeighbors
# Brute-force cosine-distance neighbour search over challenge rows;
# n_neighbors=40 is only the default used when a query does not pass its own value.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=40, n_jobs=-1)
# Index every challenge row of the sparse matrix.
model_knn.fit(df_csr)


NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=40, p=2,
                 radius=1.0)

In [None]:
# Example query: the six nearest rows to the third challenge row (position 2) of df.
distances, indices = model_knn.kneighbors(df.iloc[2,:].values.reshape(1, -1), n_neighbors = 6)

In [None]:
challenge_similarity = {}
# The first neighbour only triggers the header line; the remaining ones are listed
# with their rank and cosine distance.
for rank, (row_position, distance) in enumerate(zip(indices.flatten(), distances.flatten())):
    if rank == 0:
        print('Recommendations for {0}:\n'.format(df.index[2]))
        continue
    print('{0}: {1}, with distance of {2}:'.format(rank, df.index[row_position], distance))

Recommendations for CI23480:

1: CI26621, with distance of 0.9217279230174384:
2: CI26249, with distance of 0.925458648880627:
3: CI25119, with distance of 0.9467957138177462:
4: CI23510, with distance of 0.9551198941386762:
5: CI26057, with distance of 0.9710276746680095:


Predicting the upcoming challenges based on recent challenges

In [None]:
# Take an explicit copy: a new column is assigned to this frame later, and assigning
# to a filtered view of `test` triggers pandas' SettingWithCopyWarning.
# NOTE(review): this section filters on challenge_sequence == 9 while the SVD and Keras
# sections use == 10 — confirm which row is meant to be the "most recent" challenge.
recent_challenge = test[test['challenge_sequence'] == 9].copy()

In [None]:
def popularChallenge(challenge_name,userid,test_df,user_challenge_df,
                     n_neighbors=15,top_n=None,knn_model=None):
  """Recommend challenges close to `challenge_name` that the user has not taken.

  Args:
    challenge_name: Index label in `user_challenge_df` of the query challenge.
    userid: User whose already-taken challenges are excluded.
    test_df: DataFrame with `user_id` and `challenge` columns listing taken challenges.
    user_challenge_df: Challenge x user matrix whose index holds the challenge labels.
    n_neighbors: Number of neighbours requested from the model (query included).
    top_n: If given, return at most this many recommendations; None returns all.
    knn_model: Object exposing `kneighbors`; defaults to the notebook-level `model_knn`.

  Returns:
    List of challenge labels ordered from nearest to farthest.
  """
  if knn_model is None:
    knn_model = model_knn

  # Everything the user already attempted, plus the query challenge itself.
  challenges_taken = set(test_df.loc[test_df['user_id'] == userid, 'challenge'].unique())
  challenges_taken.add(challenge_name)

  query_vector = user_challenge_df.loc[user_challenge_df.index == challenge_name, :].values.reshape(1, -1)
  distances, indices = knn_model.kneighbors(query_vector, n_neighbors=n_neighbors)

  # Position 0 is skipped, as in the original loop, because it is taken to be the
  # query itself. (The original wrote len(distances.flatten()+1): the +1 was applied
  # to the array, not to the length, so it never changed the range.)
  candidates = user_challenge_df.index[indices.flatten()[1:]]
  req_challenges = [candidate for candidate in candidates if candidate not in challenges_taken]
  return req_challenges if top_n is None else req_challenges[:top_n]
    


In [None]:
%%time
recent_challenge['predicted_challenges'] = recent_challenge.apply\
                                  (lambda x: popularChallenge(x['challenge'],x['user_id'],\
                                                              test,df),\
                                   axis=1)

CPU times: user 23min 59s, sys: 34min 9s, total: 58min 9s
Wall time: 1h 19min 16s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# user_id -> list of predicted challenges.
pred_challenge = recent_challenge.set_index('user_id')['predicted_challenges'].to_dict()

In [None]:
# The per-user candidate lists have different lengths (it depends on how many
# neighbours the user had already taken), which made pd.DataFrame raise ValueError.
# Keep the three nearest candidates so every column has exactly three rows.
# NOTE(review): assumes every user has at least three candidates left — confirm.
final = pd.DataFrame({user: challenges[:3] for user, challenges in pred_challenge.items()})

ValueError: ignored

In [None]:
# Reshape to the submission layout: one row per (user, sequence slot) with a
# "<user>_<slot>" key for slots 11-13.
final = final.T.reset_index()
final.columns = ['user','11','12','13']
final = (
    final
    .melt(id_vars='user', value_vars=['11','12','13'])
    .assign(user=lambda frame: frame['user'].astype(str))
    .assign(user_sequence=lambda frame: frame['user'] + '_' + frame['variable'])
    .rename(columns={'value':'challenge'})
)

In [None]:
# Write the KNN submission file (two columns, no index).
final[['user_sequence','challenge']].to_csv('knn.csv', index=False)

REFERENCE: https://github.com/aniketng21/Movie-Recommendation-System-Using-KNN-Algorithm/blob/master/Movie_Recommendation_System.ipynb

# MATRIX FACTORISATION USING SVD

In [None]:
# Rebuild the combined train + test interaction table for this section.
train_test = pd.concat([train,test],axis=0)

In [None]:
# Preview the combined interactions (challenge ids are still the original strings here).
train_test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [None]:
# Challenge x user matrix (one row per challenge); cells hold challenge_sequence, 0 = no interaction.
df = pd.pivot_table(train_test, index='challenge',columns='user_id', values='challenge_sequence').fillna(0)

In [None]:
# Free memory by dropping the combined table, which is rebuilt from `train` and `test`
# in the Keras section. `train` itself is kept because that later cell still needs it.
del train_test

In [None]:
# Keep the challenge labels in pivot row order before df is replaced by a sparse matrix.
challenge_unique = df.index
challenge_list = challenge_unique.tolist()

In [None]:
# From here on `df` is a SciPy CSR matrix, no longer a DataFrame.
df = csr_matrix(df)

In [None]:
import sklearn
from sklearn.decomposition import TruncatedSVD

# Project every challenge row onto 50 latent components.
# NOTE(review): the saved output of this cell reports 500 columns, which does not match
# n_components=50 — the output looks stale; confirm which setting produced the results.
SVD = TruncatedSVD(n_components=50, random_state=17)
matrix = SVD.fit_transform(df)
matrix.shape

(5502, 500)

In [None]:
import warnings
# Silences RuntimeWarning for the rest of the session — presumably the warnings
# np.corrcoef emits for constant rows; confirm.
warnings.filterwarnings("ignore",category =RuntimeWarning)
# Correlation between every pair of rows of `matrix` (np.corrcoef treats rows as variables).
corr = np.corrcoef(matrix)
corr.shape

(5502, 5502)

In [None]:
# Test rows with challenge_sequence == 10, re-indexed from 0 so they can be addressed by position.
recent_challenges = test[test['challenge_sequence']==10].reset_index(drop=True)

In [None]:
def get_challenge(corr_coffey_hands,challenge_list):
  """Return the challenges ranked 2nd to 4th by the given similarity scores.

  Args:
    corr_coffey_hands: One score per challenge, aligned with `challenge_list`.
    challenge_list: Challenge labels in the same order as the scores.

  Returns:
    Up to three labels in descending score order; the top-ranked entry is skipped.
  """
  ranking = (
      pd.DataFrame({'measure': corr_coffey_hands, 'challenge': challenge_list})
      .sort_values(by='measure', ascending=False)
      .reset_index(drop=True)
  )
  # Positions 1..3 correspond to the original label slice .loc[1:3].
  return ranking['challenge'].iloc[1:4].to_list()

  
# Row position of every challenge in `corr`, built once so the loop does a dict
# lookup instead of scanning challenge_list with list.index() on every iteration.
# challenge_list comes from a pivot-table index, so its labels are expected to be unique.
challenge_position = {label: position for position, label in enumerate(challenge_list)}

pred_dic = {}
for i in tqdm_notebook(range(recent_challenges.shape[0])):
  challenge_temp = recent_challenges.loc[i,'challenge']
  user_temp = recent_challenges.loc[i,'user_id']
  coffey_hands = challenge_position[challenge_temp]
  # Correlations of the user's latest challenge with every other challenge.
  corr_coffey_hands  = corr[coffey_hands]
  pred_challenge = get_challenge(corr_coffey_hands,challenge_list)
  pred_dic[user_temp] = pred_challenge

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=39732.0), HTML(value='')))




In [None]:
# One column per user, one row per recommended challenge.
final = pd.DataFrame(pred_dic)

In [None]:
# Reshape to the submission layout: one row per (user, sequence slot) with a
# "<user>_<slot>" key for slots 11-13.
final = final.T.reset_index()
final.columns = ['user','11','12','13']
final = (
    final
    .melt(id_vars='user', value_vars=['11','12','13'])
    .assign(user=lambda frame: frame['user'].astype(str))
    .assign(user_sequence=lambda frame: frame['user'] + '_' + frame['variable'])
    .rename(columns={'value':'challenge'})
)

In [None]:
# Write the SVD submission file (two columns, no index).
final[['user_sequence','challenge']].to_csv('svd.csv', index=False)

# KERAS

In [None]:
from sklearn import preprocessing
# Encoder that maps challenge ids to consecutive integers (fitted in a later cell).
le = preprocessing.LabelEncoder()

In [None]:
# Rebuild the combined train + test interaction table for the Keras section.
train_test = pd.concat([train,test], axis=0)

In [None]:
# Drop the `train` name; no later cell in this notebook reads it.
del train

In [None]:
# Fit the encoder on every challenge seen in train + test and replace the ids with integer codes.
train_test['challenge'] = le.fit_transform(train_test['challenge'])

In [None]:
# Preview the interactions after label-encoding the challenge column.
train_test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,233
1,4576_2,4576,2,374
2,4576_3,4576,3,1421
3,4576_4,4576,4,182
4,4576_5,4576,5,451


In [None]:
# Imported explicitly: the name is not imported anywhere in the notebook and
# presumably only resolved through pyforest's lazy imports.
from sklearn.model_selection import train_test_split

# Hold out 20% of the interactions; the fixed seed keeps the split reproducible.
train_new, test_new = train_test_split(train_test, test_size=0.2, random_state=42)

<IPython.core.display.Javascript object>

In [None]:
# Vocabulary sizes for the two embedding tables (missing values, if any, count as a value).
n_users = train_test['user_id'].nunique(dropna=False)
n_challenges = train_test['challenge'].nunique(dropna=False)

In [None]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense
from keras.models import Model

# Challenge branch: one integer id per sample -> 5-dimensional embedding -> flat vector.
# The layer names still say "Book"; the embedding layer is looked up by that name
# later in the notebook, so the names are part of the model's interface.
challenge_input = Input(shape=[1], name="challenge-Input")
challenge_embedding = Embedding(n_challenges+1, 5, name="Book-Embedding")(challenge_input)
challenge_vec = Flatten(name="Flatten-Books")(challenge_embedding)

# User branch, built the same way.
# NOTE(review): the table has n_users+1 rows, so every user_id fed to it must be below
# that bound; user_id is not re-encoded anywhere visible — TODO confirm.
user_input = Input(shape=[1], name="User-Input")
user_embedding = Embedding(n_users+1, 5, name="User-Embedding")(user_input)
user_vec = Flatten(name="Flatten-Users")(user_embedding)

# Dot product of the two embeddings, then a small dense head with a single linear output.
prod = Dot(name="Dot-Product", axes=1)([challenge_vec, user_vec])
x = Dense(128, activation='relu')(prod)
y = Dense(1)(x)

# Inputs are ordered (user, challenge); trained as a regression with Adam and MSE.
model = Model([user_input, challenge_input], y)
model.compile('adam', 'mean_squared_error')

Using TensorFlow backend.


In [None]:
# Fit the network to predict challenge_sequence from (user_id, challenge), then save it to disk.
history = model.fit([train_new.user_id, train_new.challenge], train_new.challenge_sequence, epochs=10, verbose=1)
model.save('regression_model.h5')


Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Layer-by-layer overview of the network.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
challenge-Input (InputLayer)    (None, 1)            0                                            
__________________________________________________________________________________________________
User-Input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
Book-Embedding (Embedding)      (None, 1, 5)         27515       challenge-Input[0][0]            
__________________________________________________________________________________________________
User-Embedding (Embedding)      (None, 1, 5)         546325      User-Input[0][0]                 
____________________________________________________________________________________________

In [None]:
# Reload the weights from the saved HDF5 file.
# NOTE(review): the absolute /content path assumes that was the working directory when
# the model was saved under the relative name 'regression_model.h5' — confirm.
model.load_weights('/content/regression_model.h5')

In [None]:
# Extract the learned challenge embedding matrix (the layer is named 'Book-Embedding').
book_em = model.get_layer('Book-Embedding')
book_em_weights = book_em.get_weights()[0]


In [None]:
# All weight arrays of the model, with their shapes printed for inspection.
weights = model.get_weights()
print("weights shapes",[w.shape for w in weights])

weights shapes [(5503, 5), (109265, 5), (1, 128), (128,), (128, 1), (1,)]


In [None]:

# Second weight array of the model — by the shapes printed in the previous cell's
# output this is the user embedding table.
user_em = weights[1]

In [None]:

EPSILON = 1e-07

def cosine_similarities(x,embeddings):
    """Cosine similarity between vector `x` and every row of `embeddings`.

    Args:
        x: 1-D vector.
        embeddings: 2-D array with one vector per row.

    Returns:
        1-D array with one similarity per row of `embeddings`.
    """
    dot_pdt = np.dot(embeddings, x)
    norms = np.linalg.norm(x) * np.linalg.norm(embeddings,axis = 1)
    # EPSILON keeps the division finite when one of the norms is zero.
    return dot_pdt / (norms + EPSILON)

# Computes top_n most similar items to an idx
def most_similar(idx, embeddings,taken,top_n=3,encoder=None):
  """Return the labels of the `top_n` rows most similar to row `idx`.

  Args:
    idx: Row of `embeddings` used as the query.
    embeddings: 2-D array with one embedding per row.
    taken: Iterable of row indices that must not be recommended.
    top_n: Number of labels to return.
    encoder: Object exposing `classes_` and `inverse_transform`; defaults to the
      notebook-level label encoder `le`.

  Returns:
    Whatever `encoder.inverse_transform` returns for the selected row indices.
  """
  if encoder is None:
    encoder = le
  # Rows at or beyond the number of known labels cannot be decoded (the embedding
  # table has one more row than there are labels); this replaces the hard-coded
  # index 5502 of the original.
  n_labels = len(encoder.classes_)
  excluded = set(np.asarray(taken).ravel().tolist())

  # cosine similarity between idx and the rest, most similar first
  similarity = cosine_similarities(embeddings[idx],embeddings)
  ranked = (-similarity).argsort()
  order = [int(row) for row in ranked if row < n_labels and int(row) not in excluded]
  return encoder.inverse_transform(order[:top_n])

In [None]:
# Encode the test challenges with the fitted label encoder.
# NOTE(review): this overwrites test['challenge'] in place, so re-running the cell would
# feed already-encoded values to le.transform, and cells that expect the original
# string ids must run before it.
test['challenge'] = le.transform(test['challenge'])

In [None]:
# For every test user, the distinct (encoded) challenges they already attempted.
unique_chall = test.groupby(['user_id']).agg({'challenge':'unique'}).reset_index().rename(columns={'challenge':'unique_Challenges'})

In [None]:
# Test rows with challenge_sequence == 10, re-indexed from 0.
recent_challenges = test[test['challenge_sequence']==10].reset_index(drop=True)

In [None]:
# Row/column count of the filtered rows.
recent_challenges.shape

(39732, 4)

In [None]:
# Attach each user's array of already-attempted challenges to their row.
recent_challenges = pd.merge(recent_challenges, unique_chall, on=['user_id'])

https://github.com/PGuti/DeepBeer/blob/master/DEEP_BEERS_BLOG_POST_PART_1.ipynb <br>

https://towardsdatascience.com/building-a-book-recommendation-system-using-keras-1fba34180699


In [None]:
from IPython.core import display as ICD

# For each user: the three challenges whose embeddings are closest to the user's
# latest challenge, skipping everything the user already attempted.
predictions = {}
user_rows = zip(recent_challenges['user_id'],
                recent_challenges['challenge'],
                recent_challenges['unique_Challenges'])
for req_user, lat_challenge, taken in tqdm_notebook(user_rows, total=recent_challenges.shape[0]):
  predictions[req_user] = most_similar(lat_challenge,book_em_weights,taken, top_n=3)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=39732.0), HTML(value='')))




In [None]:
# final = pd.DataFrame(predictions)
# final = final.T
# final.reset_index(inplace=True)
# final.columns = ['user','11','12','13']
# final = pd.melt(final, id_vars = 'user', value_vars=['11','12','13'])
# final['user'] = final['user'].astype(str)
# final['user_sequence'] = final['user'] + '_' +final['variable']
# final.rename(columns={'value':'challenge'}, inplace=True)
# final[['user_sequence','challenge']].to_csv('keras_embedding.csv', index=False)


In [None]:
# final = pd.concat(predictions)

In [None]:
# final.index = [str(i[0])+'_'+str(i[1]+11) for i in final.index]

In [None]:
# final = final.reset_index()

In [None]:
# final.columns = ['user_sequence','challenge']

In [None]:
# final.to_csv('embedding.csv', index=False)