In [1]:
import json
import pickle
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

In [2]:
# load data
# event hosts: event_id, host member id, number of rsvps
df_event_hosts = pd.read_pickle('df_event_hosts')
# group topics
df_group_topics = pd.read_pickle('df_group_topics')
# event info
df_events = pd.read_pickle('df_events')
df_events['time'] =  pd.to_datetime(df_events['time'], unit='ms').dt.tz_localize('GMT').dt.tz_convert('US/Eastern')
df_events['duration'] = df_events['duration']/3600000
df_events = df_events.rename(columns = {'id':'event_id', 'time':'event_time', 'rating_average': 'event_rating', 'rating_count':'event_rating_count'})
# rsvp info
df_rsvps = pd.read_pickle('df_rsvps')

# event host info
# remove entry if there is no event_host_member_id
df_event_hosts = df_event_hosts.loc[df_event_hosts.event_hosts.notnull(),:]
# column renaming
df_event_hosts = df_event_hosts.rename(columns = {'event_hosts': 'member_id', 'id': 'event_id'})
df_event_hosts.member_id = df_event_hosts.member_id.astype(int)
df_event_hosts.head()


Unnamed: 0,event_id,member_id,yes_rsvp_count
1,99516892,3856066,16
2,105695942,3856066,18
3,115014682,3856066,51
4,126233622,3856066,60
5,133544832,3856066,94


In [3]:
# map event_id and member_id
with open('group_url_topic_map.pkl', 'rb') as f:
    group_url_topic_map = pickle.load(f)
with open('group_id_topic_map.pkl', 'rb') as f:
    group_id_topic_map = pickle.load(f)

df_rsvps['topic'] = df_rsvps['group_urlname'].map(group_url_topic_map)
df_events['topic'] = df_events['group_urlname'].map(group_url_topic_map)
# map_dicts = df_events.loc[:,['id','group_urlname', 'topic']].set_index('id').to_dict()
#df_event_hosts['topic'] = df_event_hosts['event_id'].map(map_dicts['topic'])
df_event_hosts.drop('yes_rsvp_count', axis = 1, inplace = True)

In [4]:
#
event_rating_map = df_events.loc[:, ['event_id','event_rating', 'event_rating_count']].set_index('event_id').to_dict()

# separate 
df_event_hosts['event_rating'] = df_event_hosts.event_id.map(event_rating_map['event_rating'])
df_event_hosts['event_rating_count'] = df_event_hosts.event_id.map(event_rating_map['event_rating_count'])
member_host_count_map = df_event_hosts.loc[df_event_hosts['event_rating_count']>=3,:].member_id.value_counts().to_dict()
df_event_hosts['host_count']= df_event_hosts.member_id.map(member_host_count_map)

# venue info 
df_venues = pd.read_pickle('df_venues')
event_venue_map = df_venues.loc[df_venues.rating_count > 3, 'rating':'event_id'].set_index('event_id').to_dict()
df_events['venue_rating'] = df_events['event_id'].map(event_venue_map['rating'])
df_events['venue_rating_count'] = df_events['event_id'].map(event_venue_map['rating_count'])

events_cols = ['event_id', 'waitlist_count','yes_rsvp_count', 'event_time', 'duration','topic', 'venue_rating']
df_events_info = df_events.loc[:, events_cols].set_index('event_id')
df_events_info.head()

#df_event_hosts
df_event_hosts = df_event_hosts.join(df_events_info, on = 'event_id')


In [5]:
# select cases: 
mask = (df_event_hosts['host_count']>=5) & (df_event_hosts['event_rating_count']>=3)
df = df_event_hosts.loc[mask,:].sort_values(['member_id', 'event_time'], ascending=False).reset_index().copy()
df.drop('index', axis = 1, inplace = True)

# split 
test_index = list(df.member_id.diff()[df.member_id.diff() != 0].index.values)
train_index = [i + 1 for i in test_index] + [i + 2 for i in test_index]

#df_event_hosts.loc[most_recent_index,:].sort_values('member_id',ascending = False)
host_avg_rating_map = df.loc[~df.index.isin(test_index + train_index),:].groupby('member_id').apply(lambda x: np.average(x['event_rating'], weights=x['event_rating_count'])).to_dict()

#df['member_rating'] = df_second.member_id.map(host_avg_rating_map)
#df['venue_rating'] = df_second.event_id.map(event_venue_map['rating'])
df_test = df.loc[df.index.isin(test_index),:].copy()
df_train = df.loc[df.index.isin(train_index),:].copy()
df_train['member_rating'] = df_train.member_id.map(host_avg_rating_map)
df_test['member_rating'] = df_test.member_id.map(host_avg_rating_map)

train_n = df_train.shape[0]
df_all = pd.concat([df_train, df_test], axis = 0)
df_all.drop(['event_id','member_id','event_rating_count', 'event_time'], axis=1, inplace=True)
df_all = df_all[['event_rating', 'venue_rating', 'member_rating', 'host_count', 'waitlist_count', 'yes_rsvp_count', 'duration', 'topic']]


In [6]:
# transform skewness
from scipy.special import boxcox1p
# fill na
df_all.venue_rating.fillna(df.venue_rating.median(), inplace=True)
df_all.duration.fillna(df.duration.median(), inplace=True)

# skew
lam = 0.01
columns = df_all.dtypes[df_all.dtypes != 'object'].index
for col in columns:
    df_all[col] = boxcox1p(df_all[col], lam)

df_all = pd.get_dummies(df_all)

In [7]:
X_train = df_all.iloc[:train_n, 1:]
y_train = df_all.iloc[:train_n, 0]
X_test = df_all.iloc[train_n:, 1:]
y_test = df_all.iloc[train_n:, 0]

In [30]:
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [9]:
# five fold 
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    rmse= np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [32]:
# compare models
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
lasso_score = rmsle_cv(lasso)

ridge = make_pipeline(RobustScaler(), Ridge(alpha =0.0005, random_state=1))
ridge_score = rmsle_cv(ridge)

ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
Enet_score = rmsle_cv(ENet)

linear = LinearRegression()
linear_score = rmsle_cv(linear)

scores = [lasso_score, ridge_score, Enet_score, linear_score]
df_score = pd.DataFrame({'mean':np.mean(scores, axis = 1),
                        'std':np.std(scores, axis = 1) }, 
                       index = ['lasso','ridge','Enet','linear'])
df_score.sort_values('mean')

Unnamed: 0,mean,std
Enet,0.061226,0.007415
lasso,0.061232,0.007421
ridge,0.061261,0.00736
linear,0.061261,0.00736


In [28]:
# with information about venues and hosts
ENet.fit(X_train, y_train)
y_pred1 = ENet.predict(X_test)

# withou information about venues and hosts
ENet.fit(X_train.iloc[:,2:], y_train)
y_pred2 = ENet.predict(X_test.iloc[:,2:])

# Explained variance score: 1 is perfect prediction
r2_score(y_test, y_pred1)
r2_score(y_test, y_pred2)
increase = np.subtract(r2_score(y_test, y_pred1), r2_score(y_test, y_pred2))
print(f'Prediction increased by {increase*100:.2f}%')

Prediction increased by 7.71%
