# Build a recommender system based on a popularity model

In [1]:
import json
from pandas.io.json import json_normalize

import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

## Loading interaction data

In [2]:
# Read the pre-processed ratings table from disk into a DataFrame.
# NOTE(review): this cell used to be labelled "function for filter user game
# interaction" and pointed at the notebook linked below; no filtering is done here.
# https://github.com/bcc008/ucsd-dse-capstone-c4g4/blob/master/notebooks/hyperparameter_search.ipynb
ratings_df = pd.read_csv(filepath_or_buffer='../processed_data/ratings.csv')

In [4]:
# Display ratings_df with fully duplicated rows removed.
# NOTE(review): the result is not assigned to anything, so ratings_df itself
# is left unchanged by this cell.
ratings_df.drop_duplicates()

Unnamed: 0,user_id,item_id,rating
0,76561197970982479,10,1
1,76561197970982479,20,1
2,76561197970982479,30,1
3,76561197970982479,40,1
4,76561197970982479,50,1
...,...,...,...
5035430,76561198329548331,346330,1
5035431,76561198329548331,373330,1
5035432,76561198329548331,388490,1
5035433,76561198329548331,521570,1


In [6]:
# Keep only the (user_id, item_id) pairs: drop the rating column, then remove
# repeated pairs so that each user contributes at most one row per item.
# The earlier `ratings_df.drop_duplicates()` call only displays its result and
# never stores it, so duplicates have to be removed here for the
# "drop duplicates" step to actually take effect.
interactions_full_df = (
    ratings_df
    .drop(columns=['rating'])
    .drop_duplicates(subset=['user_id', 'item_id'])
)

In [8]:
# Show the interaction table derived from ratings_df (rating column removed).
interactions_full_df

Unnamed: 0,user_id,item_id
0,76561197970982479,10
1,76561197970982479,20
2,76561197970982479,30
3,76561197970982479,40
4,76561197970982479,50
...,...,...
5035430,76561198329548331,346330
5035431,76561198329548331,373330
5035432,76561198329548331,388490
5035433,76561198329548331,521570


In [9]:
# Tag every interaction row with a constant 1 so rows can be tallied per item.
interactions_full_df.loc[:, "count"] = 1

# Build popularity dataframe

In [10]:
# Preview the first rows, which now include the constant "count" column.
interactions_full_df.head()

Unnamed: 0,user_id,item_id,count
0,76561197970982479,10,1
1,76561197970982479,20,1
2,76561197970982479,30,1
3,76561197970982479,40,1
4,76561197970982479,50,1


In [11]:
# NOTE(review): this preview repeats the one in the previous cell and shows
# the same rows; it looks redundant and could probably be removed.
interactions_full_df.head()

Unnamed: 0,user_id,item_id,count
0,76561197970982479,10,1
1,76561197970982479,20,1
2,76561197970982479,30,1
3,76561197970982479,40,1
4,76561197970982479,50,1


In [12]:
# Rank items by popularity: tally the interaction rows recorded for each
# item_id, order the tallies from largest to smallest, and turn the result
# back into a two-column frame (item_id, count).
item_popularity_df = (
    interactions_full_df
    .groupby('item_id')['count']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
item_popularity_df.head(10)

Unnamed: 0,item_id,count
0,205790,48705
1,730,42894
2,4000,42406
3,304930,37883
4,223530,36282
5,550,36282
6,105600,28633
7,230410,25229
8,620,23951
9,240,23687


In [13]:
# Relabel the tally column so it reads as a popularity score.
item_popularity_df = item_popularity_df.rename({"count": "popularity"}, axis="columns")

In [19]:
# Persist the popularity table as CSV, leaving the DataFrame index out of the file.
item_popularity_df.to_csv(path_or_buf='../processed_data/popularity.csv', index=False)

## Recommend by popularity model

In [22]:
def rec_popularity(topK=10, popularity_df=None):
    """Return the ids of the ``topK`` most popular items.

    Parameters
    ----------
    topK : int, default 10
        Number of item ids to return. If the table has fewer rows than
        ``topK``, every row is returned.
    popularity_df : pandas.DataFrame, optional
        Table with an ``item_id`` column whose rows are already ordered from
        most to least popular. When omitted, the notebook-level
        ``item_popularity_df`` is used.

    Returns
    -------
    pandas.Series
        The first ``topK`` values of the ``item_id`` column.

    Raises
    ------
    ValueError
        If ``topK`` is negative.
    """
    if topK < 0:
        # head() with a negative n means "all rows except the last |n|",
        # which is never what a top-K request intends.
        raise ValueError("topK must be non-negative, got {}".format(topK))
    if popularity_df is None:
        # Fall back to the table built earlier in the notebook.
        popularity_df = item_popularity_df
    return popularity_df["item_id"].head(topK)

In [23]:
# Recommend with the default topK of 10.
rec_popularity()

0    205790
1       730
2      4000
3    304930
4    223530
5       550
6    105600
7    230410
8       620
9       240
Name: item_id, dtype: int64