In [1]:
import pandas as pd 
df = pd.read_json('reviews_businesses_users_100k.json', lines=True)
df.shape

(100000, 43)

In [2]:
# remove 'attributes', 'categories', 'hours' # solved dict problem 
# remove 'date' # solved the timestamp problem 
problems = ['attributes', 'categories', 'hours']
df.drop(columns=problems, inplace=True)

df['review_date'] = pd.to_datetime(df['date'])
df['review_year'] = df['review_date'].dt.year
df['review_month'] = df['review_date'].dt.month
df['review_day'] = df['review_date'].dt.day

df['date_joined'] = pd.to_datetime(df['yelping_since'])
df['year_joined'] = df['date_joined'].dt.year
df['month_joined'] = df['date_joined'].dt.month
df['day_joined'] = df['date_joined'].dt.day
df.drop(columns=['date', 'review_date', 'yelping_since', 'date_joined'], inplace=True)

df.shape

(100000, 44)

In [13]:
# unless specified we assume that these are codes for the review
# i.e. stars is the number of stars of the review 
# review_count is the review count for the business

df.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'review_count', 'is_open', 'stars_business',
       'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos', 'user_name', 'user_review_count', 'user_useful',
       'user_funny', 'user_cool', 'review_year', 'review_month', 'review_day',
       'year_joined', 'month_joined', 'day_joined'],
      dtype='object')

In [3]:
# Engineering
import numpy as np # for np.log function

quintiles = [1, 2, 3, 4, 5]
deciles = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# df['average_stars_class'] = pd.cut(df['average_stars'], 10, labels=quintiles)

df['review_length'] = df['text'].apply(len)
df['review_length_class'] = pd.qcut(df['review_length'], q=5, labels=quintiles)

# df['review_count_class'] = pd.qcut(df['review_count'], q=10, labels=deciles)
# df['average_stars_class'] = pd.qcut(df['average_stars'], q=10, labels=deciles)


df['friends_count'] = df['friends'].apply(len)

df['review_count_log'] = np.log(df['review_count'])
# df['review_length_log'] = np.log(df['review_length'])
df['friends_count_log'] = np.log(df['friends_count'])


#  # get average length of reviews for each user
# dum2 = df.groupby('user_id', as_index=False).review_length.mean()
# dum2['user_average_review_length'] = dum2['review_length']
# dum2.drop(columns='review_length', inplace=True)
# df = pd.merge(df, dum2, 'inner')

# # get average length of reviews for each business
# dum = df.groupby('business_id', as_index=False).review_length.mean() 
# dum['business_average_review_length'] = dum['review_length']
# dum.drop(columns='review_length', inplace=True)
# df = pd.merge(df, dum, 'inner')



In [4]:
# Prepare 
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size =0.1, shuffle=True, random_state=42)
train.shape, test.shape

target = "review_length"
X_train = train.drop(columns=[target, 'review_length_class'])
y_train = train[target]
X_test = test.drop(columns=[target, 'review_length_class'])
y_test = test[target]

from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer
transformers = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median')
)
X_train_T = transformers.fit_transform(X_train)
X_test_T = transformers.transform(X_test)

In [8]:
# Baseline
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

X_baseline = [df['review_length'].mean()] * len(y_train)
mse = mean_absolute_error(X_baseline, y_train)

R_2 = r2_score(X_baseline, y_train)
print("Baseline Mean Absolute Error: ", mse)
print("Train Model R-2 Score: ", R_2)

Baseline Mean Absolute Error:  391.67060816088883
Train Model R-2 Score:  0.0


In [10]:
# Linear Regression 
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_T, y_train)

from sklearn.metrics import mean_absolute_error
mse1 = mean_absolute_error(model.predict(X_train_T), y_train)
mse2 = mean_absolute_error(model.predict(X_test_T), y_test)

from sklearn.metrics import r2_score
R_2a = r2_score(model.predict(X_train_T), y_train)
R_2b = r2_score(model.predict(X_test_T), y_test)

print("Train Model Mean Absolute Error: ", mse1)
print("Test Model Mean Absolute Error: ", mse2)
print("Train Model R-2 Score: ", R_2a)
print("Test Model R-2 Score: ", R_2b)

Train Model Mean Absolute Error:  351.5812936031662
Test Model Mean Absolute Error:  361.12233657147544
Train Model R-2 Score:  -5.1031059860906405
Test Model R-2 Score:  -6.203371286395691


In [5]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, n_jobs=-1)
model.fit(X_train_T, y_train)

from sklearn.metrics import mean_absolute_error
mse1 = mean_absolute_error(model.predict(X_train_T), y_train)
mse2 = mean_absolute_error(model.predict(X_test_T), y_test)

from sklearn.metrics import r2_score
R_2a = r2_score(model.predict(X_train_T), y_train)
R_2b = r2_score(model.predict(X_test_T), y_test)

print("Train Model Mean Absolute Error: ", mse1)
print("Test Model Mean Absolute Error: ", mse2)
print("Train Model R-2 Score: ", R_2a)
print("Test Model R-2 Score: ", R_2b)

Train Model Mean Absolute Error:  144.49576444444443
Test Model Mean Absolute Error:  541.3450799999999
Train Model R-2 Score:  0.7249397684472755
Test Model R-2 Score:  -1.427231844958461


In [73]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=100, weights='uniform',
                                       algorithm='auto',
                                       leaf_size=20, p=2, metric='minkowski', 
                                       metric_params=None, n_jobs=-1)


model.fit(X_train_T, y_train)
print("Model Score: ", model.score(X_train_T, y_train))
print("Model Score: ", model.score(X_test_T, y_test))

Model Score:  0.021355555555555555
Model Score:  0.0014


In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, n_jobs=-1)
model.fit(X_train_T, y_train)
print("Model Score: ", model.score(X_train_T, y_train))
print("Model Score: ", model.score(X_test_T, y_test))

(487374.37624444446, 523257.5279)

In [98]:
import plotly.express as px 
px.scatter(df, x='average_stars', y='review_length', opacity=0.1, title='One Star Reviews are Longer?')

In [50]:
X_train_T.shape, y_train.shape

((90000, 47), (90000,))

In [120]:
!pip install shap



In [19]:
df['review_length'].mean() # baseline is correct for shap plot 

605.11978

In [16]:
import shap
import numpy as np 
import random 

rows = 11000
row = X_train.iloc[[rows]]

explainer = shap.TreeExplainer(model)
row_transformed = transformers.transform(row)
shap_values = explainer.shap_values(row_transformed)

shap.initjs()
shap.force_plot(
    base_value=explainer.expected_value, # [0]
    shap_values=shap_values, #[0]
    features=row, 
    link='identity') # ! identity 

In [None]:
# what proportion of one star reviews are from low review count users 

In [86]:
dum3 = df.groupby('business_id', as_index=False).user_review_count.median()
# interesting but not what we want 

In [96]:
mask1 = df['stars']==1
mask2 = df['stars']==2
mask3 = df['stars']==3
mask4 = df['stars']==4
mask5 = df['stars']==5
df[mask1].shape, df[mask2].shape, df[mask3].shape, df[mask4].shape, df[mask5].shape


((14994, 54), (8176, 54), (11046, 54), (22022, 54), (43762, 54))

In [100]:
print(df[mask1]['user_review_count'].mean(),
df[mask2]['user_review_count'].mean(),
df[mask3]['user_review_count'].mean(),
df[mask4]['user_review_count'].mean(),
df[mask5]['user_review_count'].mean())


42.270174736561295 131.63600782778866 244.655350353069 199.73476523476523 77.51245372697774


In [101]:
print(df[mask1]['user_review_count'].median(),
df[mask2]['user_review_count'].median(),
df[mask3]['user_review_count'].median(),
df[mask4]['user_review_count'].median(),
df[mask5]['user_review_count'].median())


10.0 28.0 75.0 60.0 17.0


In [190]:
maskb = df['business_id']=='VcpYP6btZ5Tb_qSLSVKHIw' 
maskc = df[maskb]['stars']==1
df[maskb][maskc]['user_review_count'].value_counts(normalize=True)

1    1.0
Name: user_review_count, dtype: float64

In [216]:
def prop_one_business_low(df, business_id, star): # ex. df, Zjddj, 1
    mask1 = df['business_id'] == business_id 
    mask2 = df[mask1]['stars'] == star
    result = df[mask1][mask2]['user_review_count'].value_counts()
    return result 

In [221]:
prop_one_business_low(df, 'LwQB9H3jZ9wTk24Lr-AnZQ', 5)

31     1
230    1
13     1
11     1
59     1
Name: user_review_count, dtype: int64

In [205]:
hmmm = df['review_count'] > 1000

In [225]:
import joblib
import sklearn
import category_encoders as ce
print(f'joblib=={joblib.__version__}')
print(f'scikit-learn=={sklearn.__version__}')
print(f'category_encoders=={ce.__version__}')

joblib==0.14.0
scikit-learn==0.21.3
category_encoders==2.1.0


In [222]:
df.shape

(100000, 44)

In [18]:
# Engineering

 # get average length of reviews for each user
dum2 = df.groupby('user_id', as_index=False).review_length.mean()
dum2['user_average_review_length'] = dum2['review_length']
dum2.drop(columns='review_length', inplace=True)
df = pd.merge(df, dum2, 'inner')

# # get average length of reviews for each business
# dum = df.groupby('business_id', as_index=False).review_length.mean() 
# dum['business_average_review_length'] = dum['review_length']
# dum.drop(columns='review_length', inplace=True)
# df = pd.merge(df, dum, 'inner')


import numpy as np # for np.log function

quintiles = [1, 2, 3, 4, 5]
deciles = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# df['average_stars_class'] = pd.cut(df['average_stars'], 10, labels=quintiles)

df['review_length'] = df['text'].apply(len)
df['review_length_class'] = pd.qcut(df['review_length'], q=5, labels=quintiles)

# df['review_count_class'] = pd.qcut(df['review_count'], q=10, labels=deciles)
# df['average_stars_class'] = pd.qcut(df['average_stars'], q=10, labels=deciles)


df['friends_count'] = df['friends'].apply(len)

df['review_count_log'] = np.log(df['review_count'])
# df['review_length_log'] = np.log(df['review_length'])
df['friends_count_log'] = np.log(df['friends_count'])


In [None]:
%pylab inline
result = df.pivot_table(values='average_stars',
                        index='year_joined', columns='review_count_class')

figure(num=None, figsize=(8, 6), dpi=100, facecolor='w', edgecolor='k');             
import seaborn as sns 
ax = sns.heatmap(result);



In [17]:
df['negative_people'] = (df['average_stars'] < 2.5) & (df['user_review_count'] > 2)
df['positive_people'] = (df['average_stars'] < 2.5) & (df['user_review_count'] > 2)

df['positive_people'].value_counts(), df['negative_people'].value_counts()

df['polarized_people'] = (df['average_stars'] < 2.5) & (df['user_review_count'] > 2) | (df['average_stars'] < 2.5) & (df['user_review_count'] > 2)
df['polarized_people'].value_counts()

df_polar = df[df['polarized_people']]
df_polar.shape

In [None]:
pip install haversine

In [None]:
# cluster functions 

import random
import numpy as np
import pandas as pd 
import scipy.spatial
from haversine import haversine
def distance(p1,p2):
  return haversine(p1[1:],p2[1:])
def cluster_centroids(data, clusters, k):
  results=[]
  for i in range(k):
    results.append( np.average(data[clusters == i],weights=np.squeeze(np.asarray(data[clusters == i][:,0:1])),axis=0))
  return results
def kmeans(data, k=None, centroids=None, steps=20):
  # Forgy initialization method: choose k data points randomly.
  centroids = data[np.random.choice(np.arange(len(data)), k, False)]
  for _ in range(max(steps, 1)):
    sqdists = scipy.spatial.distance.cdist(centroids, data, lambda u, v: distance(u,v)**2)
    # Index of the closest centroid to each data point.
    clusters = np.argmin(sqdists, axis=0)
    new_centroids = cluster_centroids(data, clusters, k)
    if np.array_equal(new_centroids, centroids):
      break
    centroids = new_centroids
  
  return clusters, centroids


#setup
df = df_new
data = df['average_stars_class']
vals = df[['review_count', 'latitude' ,'longitude']].values
k = 10
random.seed(42)
#run it
clusters,centroids=kmeans(vals,k)
#output
df['c']=[int(c) for c in clusters]
lats = [centroids[i][1] for i in range(k)]
df['clat'] = df['c'].map(lambda x: lats[x])
longs = [centroids[i][2] for i in range(k)]
df['clong'] = df['c'].map(lambda x: longs[x])


In [1]:
import joblib
import sklearn
import category_encoders as ce
print(f'joblib=={joblib.__version__}')
print(f'scikit-learn=={sklearn.__version__}')
print(f'category_encoders=={ce.__version__}')


joblib==0.14.0
scikit-learn==0.21.3
category_encoders==2.1.0
