In [1]:
import pandas as pd
import numpy as np
import json
import operator
import os
from scipy.sparse.linalg import svds

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics.pairwise import cosine_similarity

from urllib.parse import urljoin
import itertools
import random

import matplotlib.pyplot as plt
%matplotlib inline

### READ DATA

In [2]:

# Load both inputs as line-delimited JSON (one record per line).
businesses, result = (
    pd.read_json(json_path, lines=True)
    for json_path in ('business.json', 'result.json')
)


### JOIN DATAFRAMES ON BUSINESS IDs

In [3]:
# Join the two tables on business_id and discard the categories_x column.
res = pd.merge(businesses, result, on='business_id').drop('categories_x', axis=1)

# Reduce every category record in categories_y to its 'alias' value
# (an empty string when the record has no 'alias' key).
res['categories_y'] = res['categories_y'].apply(
    lambda entries: [entry.get('alias', "") for entry in entries]
)

### Get top categories

In [4]:
from collections import defaultdict

# Count how many times each category alias occurs across all rows of res.
categories = defaultdict(int)

for alias in itertools.chain.from_iterable(res['categories_y']):
    categories[alias] += 1

# Number of distinct category aliases.
len(categories)

1209

In [5]:
# (alias, count) pairs ordered from the most to the least frequent category.
sorted_x = sorted(categories.items(), key=operator.itemgetter(1), reverse=True)

# Category aliases retained for the rest of the analysis. The list is written
# out by hand rather than derived from sorted_x, so it is the single source of
# truth for which categories are kept.
required_cat = [
    "pizza",
    "hotdogs",
    "sandwiches",
    "tradamerican",
    "coffee",
    "italian",
    "burgers",
    "mexican",
    "breakfast_brunch",
    "chinese",
    "bars",
    "bakeries",
    "newamerican",
    "icecream",
    "cafes",
    "japanese",
    "chicken_wings",
    "sushi",
    "seafood",
    "desserts",
]

### RESTRICT TO TOP CATEGORIES

In [6]:
# True for rows whose category list shares at least one alias with required_cat.
mask = res['categories_y'].apply(
    lambda aliases: not set(aliases).isdisjoint(required_cat)
)

# Keep only those rows.
res = res[mask]


### RESTRICT TO BUSINESSES IN US

In [7]:
def in_USA(s):
    """Return True when ``s`` looks like a five-digit postal code.

    The check is purely syntactic: ``s`` must be a string made of exactly
    five ASCII digits. It is a heuristic for "located in the USA"; other
    countries also use five-digit postal codes.

    Unlike a plain ``int(s)`` probe, this rejects signed values ("-1234"),
    padded values (" 1234"), and digit-group underscores ("1_234"), all of
    which ``int`` accepts, and it returns False instead of raising for
    non-string input such as None or a number.

    Parameters
    ----------
    s : object
        Candidate postal code.

    Returns
    -------
    bool
        True if ``s`` is a str of exactly five characters, each in 0-9.
    """
    if not isinstance(s, str) or len(s) != 5:
        return False
    # Explicit membership test so that only ASCII 0-9 counts as a digit.
    return all(ch in "0123456789" for ch in s)

In [8]:
# Evaluate in_USA on every postal_code value to get a per-row boolean flag.
mask2 = res['postal_code'].map(in_USA)

In [9]:
# Keep only the rows flagged by mask2. The explicit copy makes `res` an
# independent DataFrame, so assigning to its columns afterwards does not act
# on a slice of the earlier frame (the situation pandas reports as
# SettingWithCopyWarning).
res = res[mask2].copy()

### Remove categories not in the required list (`required_cat`)

In [11]:
# Within each row, keep only the category aliases that appear in required_cat.
res['categories_y'] = res['categories_y'].apply(
    lambda aliases: [alias for alias in aliases if alias in required_cat]
)

In [13]:
# Read every line-delimited JSON file found directly inside `path` and stack
# the results into one DataFrame.
path = 'project_data'  # use your path

# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of `frame` reproducible. Only regular files are kept so that
# subdirectories (for example a Jupyter .ipynb_checkpoints folder) are never
# handed to read_json.
allFiles = sorted(
    entry for entry in os.listdir(path)
    if os.path.isfile(os.path.join(path, entry))
)

list_ = []
for file_ in allFiles:
    df = pd.read_json(os.path.join(path, file_), lines=True)
    list_.append(df)

# Concatenate once at the end; the per-file row labels are kept as they are.
frame = pd.concat(list_)

In [14]:
# Total number of rows loaded into frame.
frame.shape[0]

4736897

In [15]:
# Join the filtered businesses in `res` with the rows of `frame` on business_id.
merged = pd.merge(res, frame, on='business_id')

# Turn the per-row category lists into a long Series holding one entry per
# (row, category) pair, indexed by the row label it came from. Building it
# from flat arrays avoids constructing a pandas Series for every row, which
# is very slow and memory-hungry on a frame with millions of rows.
category_counts = np.fromiter(
    (len(cats) for cats in merged['categories_y']),
    dtype=np.intp,
    count=len(merged),
)
s = pd.Series(
    list(itertools.chain.from_iterable(merged['categories_y'])),
    index=np.repeat(merged.index.values, category_counts),
    name='categories',
    dtype=object,
)

# Joining on the index repeats each row once per category it has; a row whose
# category list is empty is kept with a missing value in 'categories'.
merged = merged.drop('categories_y', axis=1).join(s)


In [16]:
# Mean of stars_y for every (user_id, categories) pair; as_index=False keeps
# the two grouping keys as ordinary columns of the result.
# NOTE(review): stars_y is presumably the per-review star rating coming from
# `frame` - confirm against the merge suffixes.
ratings = (
    merged.loc[:, ['user_id', 'categories', 'stars_y']]
    .groupby(by=['user_id', 'categories'], as_index=False)
    .mean()
)

In [29]:
# Write the aggregated ratings to a semicolon-delimited file, omitting the
# DataFrame index.
ratings.to_csv("ratings.csv", index=False, sep=";")