# Yelp User Wrangling

In [None]:
import numpy as np
import pandas as pd
import datetime
import json

# When True, the `yelper_age` step runs on a 20-row copy of `user_df`
# instead of the full frame.
DRY_RUN = True

In [None]:
# Widen the notebook container to 95% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Load User Data

In [None]:
# Parse the user file line by line (each line is passed to json.loads), build
# `user_df` from the parsed records, and convert `yelping_since` to datetimes.
# A timestamp is printed before each stage.
print('[%s] Loading User Data...' % datetime.datetime.now().time())

data = pd.DataFrame()  # empty frame; not used by anything in this cell
source_data_file = '../source_data/user.json'

# The context manager closes the file handle once parsing is done, and the
# explicit encoding avoids depending on the platform's default codec.
with open(source_data_file, 'r', encoding='utf-8') as source_file:
    user_list = [json.loads(line) for line in source_file]

print('[%s] creating dataframe...' % datetime.datetime.now().time())
user_df = pd.DataFrame(user_list)

print('[%s] data type cleanup...' % datetime.datetime.now().time())
user_df['yelping_since'] = pd.to_datetime(user_df['yelping_since'])

print('[%s] Complete!' % datetime.datetime.now().time())

[01:31:29.479772] Loading User Data...
[01:37:22.029148] creating dataframe...


# Calculate Friend Count

In [None]:
# Number of entries in each user's `friends` value.
user_df['friend_count'] = user_df['friends'].map(len)

# Split out `Friends` dataframe

In [None]:
# Stand-alone frame holding each user's id and friends value, with the
# `friends` column renamed to `friends_list`.
# NOTE(review): `friend_count` is set to a constant 0 here rather than the
# length of the list -- confirm this placeholder is intended.
friends_df = (
    user_df[['user_id', 'friends']]
    .rename(columns={'friends': 'friends_list'})
    .assign(friend_count=0)
)
friends_df.head(3)

# Drop `friends` Column from `user_df`

In [None]:
# Remove the `friends` column from `user_df` in place.
user_df.drop(columns=['friends'], inplace=True)

# Calculate `yelper_age` column

In [None]:
# `yelper_age` = whole days between the latest `yelping_since` in the frame
# and each user's own `yelping_since`.
# The subtraction is done on the whole column at once so the maximum is
# computed a single time instead of once per row.
if DRY_RUN:
    # Dry run: work on a 20-row copy so `user_df` itself is left untouched.
    tmp = user_df[:20].copy()
    tmp['yelper_age'] = (tmp.yelping_since.max() - tmp.yelping_since).dt.days
    unique_ages = tmp['yelper_age'].unique()
else:
    user_df['yelper_age'] = (user_df.yelping_since.max() - user_df.yelping_since).dt.days
    unique_ages = user_df['yelper_age'].unique()

# Report the number of distinct ages followed by the values themselves.
print('{}\t{}'.format(len(unique_ages), unique_ages))

In [None]:
# Distinct `yelper_age` values. In a dry run the column is only added to the
# 20-row `tmp` copy, so read it from there; otherwise read it from `user_df`.
(tmp if DRY_RUN else user_df)['yelper_age'].unique()

# One Hot Encode `elite` years with sklearn

In [None]:
# Spread each user's `elite` list across the columns of a scratch frame, then
# take the smallest and largest value found anywhere in it.
tmp = pd.DataFrame(user_df['elite'].tolist(), index=user_df.index)

column_minima = tmp.min()
column_maxima = tmp.max()
elite_min_year = int(column_minima.min())
elite_max_year = int(column_maxima.max())

# Inclusive count of years from the earliest to the latest elite year.
num_elite_years = (elite_max_year - elite_min_year) + 1

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Replace the `elite` column with one indicator column per label found by the
# binarizer; the new columns are appended to `user_df` via `join`.
mlb = MultiLabelBinarizer()
elite_values = user_df.pop('elite')  # removes the column from user_df
elite_matrix = mlb.fit_transform(elite_values)
elite_onehot = pd.DataFrame(elite_matrix, columns=mlb.classes_, index=user_df.index)
user_df = user_df.join(elite_onehot)

In [None]:
# Prefix the one-hot columns appended by the binarizer with `elite_`.
# The number of trailing columns comes from `mlb.classes_` (one column per
# fitted label) rather than from the min/max year span, so a missing year or a
# non-year label cannot shift the boundary onto the wrong columns.
num_elite_cols = len(mlb.classes_)
split_at = len(user_df.columns) - num_elite_cols  # first one-hot column position

elite_cols = list(user_df.columns[split_at:])
orig_columns = list(user_df.columns[:split_at])
user_df.columns = orig_columns + ['elite_{}'.format(x) for x in elite_cols]

# Write to Files