In [None]:
import pandas as pd 
# Load the merged reviews/businesses/users export. `lines=True` tells pandas the
# file holds one JSON record per line rather than a single JSON document.
DATA_FILE = 'reviews_businesses_users_100k.json'
df = pd.read_json(DATA_FILE, lines=True)

# Quick sanity check: (rows, columns) of the raw frame.
df.shape

In [None]:
# Columns removed before modelling:
#   'attributes', 'categories', 'hours' -> solved the dict problem
#   'date'                              -> solved the timestamp problem
problems = ['attributes', 'categories', 'hours', 'date']

# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError. Reassigning instead of
# using inplace=True leaves the same `df` name pointing at the cleaned frame.
df = df.drop(columns=problems, errors='ignore')

# Split the join timestamp into numeric year / month / day parts. The parsed
# datetimes are kept in a local Series so no temporary column has to be added
# to, and then dropped from, the frame.
date_joined = pd.to_datetime(df['yelping_since'])
df['year_joined'] = date_joined.dt.year
df['month_joined'] = date_joined.dt.month
df['day_joined'] = date_joined.dt.day

df.shape

In [None]:
# Prepare 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer

target = "average_stars_class"

# The target column is built in the feature-engineering cell; fail early with a
# clear message if that cell has not been executed yet.
if target not in df.columns:
    raise KeyError(
        f"'{target}' is missing from df - run the feature-engineering cell "
        "that creates it before preparing the train/test split."
    )

# 90/10 split with a fixed seed so the partition is reproducible.
train, test = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)

# Every column derived directly from `average_stars` must stay out of the
# features, otherwise the models can read the answer. 'average_stars_bin' only
# exists once its binning cell has been run, hence errors='ignore'.
leaky_columns = [target, 'average_stars', 'average_stars_bin']
X_train = train.drop(columns=leaky_columns, errors='ignore')
y_train = train[target]
X_test = test.drop(columns=leaky_columns, errors='ignore')
y_test = test[target]

# Ordinal-encode the categorical columns, then fill missing values with the
# per-column median. The pipeline is fit on the training split only and merely
# applied to the test split.
transformers = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median')
)
X_train_T = transformers.fit_transform(X_train)
X_test_T = transformers.transform(X_test)

# Shown last so the notebook actually displays it (a bare expression in the
# middle of a cell produces no output).
train.shape, test.shape

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the transformed features. `fit` returns
# the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train_T, y_train)

# For regressors `score` reports R^2 on the held-out split.
linear_score = model.score(X_test_T, y_test)
print("Model Score: ", linear_score)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests bootstrap rows and subsample features, so without a seed the
# score changes on every run. random_state=42 matches the seed used for the
# train/test split and makes the result reproducible.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train_T, y_train)

# For regressors `score` reports R^2 on the held-out split.
print("Model Score: ", model.score(X_test_T, y_test))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded for reproducibility (same seed as the train/test split); an unseeded
# forest gives a slightly different accuracy on every run.
model = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
model.fit(X_train_T, y_train)

# For classifiers `score` reports mean accuracy on the held-out split.
print("Model Score: ", model.score(X_test_T, y_test))

In [None]:
import shap
import numpy as np 
import random 

# Positional index (within X_train) of the single observation to explain.
rows = 10000
row = X_train.iloc[[rows]]

# `model` is whichever estimator was fitted most recently in the cells above.
explainer = shap.TreeExplainer(model)
row_transformed = transformers.transform(row)
shap_values = explainer.shap_values(row_transformed)

# Single-output models (regressors) can be plotted as-is.
plot_base_value = explainer.expected_value
plot_shap_values = shap_values

# Multi-output models (e.g. a RandomForestClassifier) yield one expected value
# and one set of SHAP values per class, which force_plot cannot draw at once.
# Explain the class the model actually predicts for this row.
if np.size(plot_base_value) > 1:
    classes = getattr(model, 'classes_', None)
    if classes is None:
        class_idx = 0
    else:
        predicted = model.predict(row_transformed)[0]
        class_idx = list(classes).index(predicted)

    plot_base_value = np.asarray(plot_base_value)[class_idx]
    if isinstance(shap_values, list):
        # Older shap releases: a list with one (n_rows, n_features) array per class.
        plot_shap_values = shap_values[class_idx]
    else:
        # Newer shap releases: one array with the class on the last axis.
        plot_shap_values = np.asarray(shap_values)[..., class_idx]

shap.initjs()
shap.force_plot(
    base_value=plot_base_value,
    shap_values=plot_shap_values,
    features=row,  # untransformed values, so the plot shows readable feature values
    link='identity')

In [None]:
# Mean of `user_review_count` over the whole frame. The original note records
# that this baseline is correct for the SHAP plot.
df.user_review_count.mean()

In [None]:
# Peek at one training observation by position. Note that this rebinds `rows`,
# which the SHAP cell also sets.
rows = 1000
X_train.iloc[[rows], :]

In [None]:
# Cut `text_length` into ten quantile-based intervals and show the share of
# rows that falls into each one.
text_length_deciles = pd.qcut(df['text_length'], 10)
text_length_deciles.value_counts(normalize=True)

In [None]:
import plotly.express as px 

# average_stars against longitude, with an ordinary-least-squares trend line
# overlaid on the points.
px.scatter(
    data_frame=df,
    x='longitude',
    y='average_stars',
    trendline='ols',
)

In [None]:
# Engineering
import numpy as np 


def _friend_count(friends):
    """Return the number of friends held in one `friends` cell.

    Sized containers (list, tuple, ...) are counted with len(), as before.
    A string is treated as a comma-separated list of ids, so the result is the
    number of ids rather than the number of characters.
    NOTE(review): assumes a string cell encodes "no friends" as '' or 'None' -
    confirm against the data. Anything without a length (e.g. NaN) counts as 0.
    """
    if isinstance(friends, str):
        ids = (part.strip() for part in friends.split(','))
        return sum(1 for part in ids if part and part != 'None')
    try:
        return len(friends)
    except TypeError:
        return 0


def _safe_log(values):
    """Natural log of a Series, yielding NaN (not -inf) for non-positive values.

    Positive inputs give exactly np.log(values). A -inf would pass through the
    median imputer untouched and be rejected by the estimators, whereas NaN is
    filled by the imputer.
    """
    return np.log(values.where(values > 0))


# Labels 1..10 shared by every decile feature below.
deciles = range(1, 11)

# Review text length: raw, decile class and log.
df['text_length'] = df['text'].str.len()
df['text_class'] = pd.qcut(df['text_length'], q=10, labels=deciles)
df['text_length_log'] = _safe_log(df['text_length'])

df['review_count_class'] = pd.qcut(df['review_count'], q=10, labels=deciles)

# Modelling target used by the train/test preparation cell.
df['average_stars_class'] = pd.qcut(df['average_stars'], q=10, labels=deciles)

df['review_count_log'] = _safe_log(df['review_count'])

df['friends_count'] = df['friends'].apply(_friend_count)
df['friends_count_log'] = _safe_log(df['friends_count'])


In [None]:
# Show the column index of df as it currently stands in the kernel.
df.columns

In [None]:
import plotly.express as px 

# 3-D view of location (longitude, latitude) against the year the user joined;
# average_stars drives both the marker colour and the marker symbol.
px.scatter_3d(
    data_frame=df,
    x='longitude',
    y='latitude',
    z='year_joined',
    color='average_stars',
    symbol='average_stars',
)

In [None]:
# Ten equal-width bins over the range of average_stars, labelled 1..10.
# NOTE: the variable is called `quintiles` but holds ten labels, not five; the
# name is kept because other cells may refer to it.
quintiles = list(range(1, 11))
df['average_stars_bin'] = pd.cut(df['average_stars'], bins=10, labels=quintiles)

In [None]:
# Number of rows in each of ten equal-width average_stars intervals.
pd.cut(df['average_stars'], bins=10).value_counts()

In [None]:
%pylab inline
result = df.pivot_table(values='average_stars',
                        index='year_joined', columns='review_count_class')

figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k');             
import seaborn as sns 
ax = sns.heatmap(result);



In [None]:
# Share of rows per postal code, most frequent first.
df.postal_code.value_counts(normalize=True)

In [None]:
# Let pandas print up to 2500 rows before truncating. This is a global display
# setting and affects every cell executed afterwards.
pd.options.display.max_rows = 2500

In [None]:
# Frequency of each distinct value in the `Caters` column (value_counts leaves
# missing values out by default).
df.Caters.value_counts()

In [None]:
# Transposed preview: one line per column, with the first five records side by
# side. Transposing the entire frame would build and render one column per
# record, which is slow and unreadable; a small head() slice shows the same
# field-by-field layout.
df.head().T