# IMPORTS

In [None]:
import my_pickle as mp
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import my_pickle as mp
import my_split as ms
import matplotlib.pyplot as plt

%matplotlib inline
import datetime as dt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings('ignore')

# READ IN USER INFORMATION

In [None]:
# Load the raw user export (JSON) into DF with pandas. The path is an absolute,
# machine-specific location.
user_filename = "/Users/gandalf/Documents/coding/data_do_not_commit/raw_data_users.json"
DF = pd.read_json(user_filename)
print("... read in dataframe")

In [None]:
# Work on a copy so the freshly loaded DF stays untouched.
df = DF.copy()

# Keep only onboarded users: gather the index labels of every row whose
# `onboarded` value is not 1 and drop those rows.
not_onboarded = df.loc[df.onboarded != 1].index
df = df.drop(not_onboarded, axis=0)
print("... dropped users that aren't onboarded")

# convert to datetimes
def my_to_datetime(x):
    """Convert a ``{'$date': ...}`` wrapper dict into a pandas timestamp.

    Parameters
    ----------
    x : object
        Raw cell value; only dicts are converted.

    Returns
    -------
    Whatever ``pd.to_datetime(x['$date'])`` returns, or ``None`` when ``x``
    is not a dict, has no ``'$date'`` key, or the value cannot be parsed.
    """
    if not isinstance(x, dict):
        return None
    try:
        return pd.to_datetime(x['$date'])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing key or unparseable value: treat as "no date" instead of
        # swallowing every exception with a bare except.
        return None

def my_to_date(x):
    """Convert a ``{'$date': ...}`` wrapper dict into a ``datetime.date``.

    Parameters
    ----------
    x : object
        Raw cell value; only dicts are converted.

    Returns
    -------
    The calendar date of ``pd.to_datetime(x['$date'])``, or ``None`` when
    ``x`` is not a dict, has no ``'$date'`` key, or the value cannot be
    turned into a date.
    """
    if not isinstance(x, dict):
        return None
    try:
        return pd.to_datetime(x['$date']).date()
    except (KeyError, TypeError, ValueError, OverflowError, AttributeError):
        # AttributeError covers a None payload (to_datetime(None) is None);
        # ValueError covers unparseable strings and NaT, which has no date().
        return None
# Parse the wrapped date fields: full timestamps for the created/updated/active
# columns, plain calendar dates for availability and birthday.
for column in ('_created_at', '_updated_at', 'activeAt'):
    df[column] = df[column].apply(my_to_datetime)
for column in ('available', 'birthday'):
    df[column] = df[column].apply(my_to_date)
print("... changed to datetimes")

# Index the rows by user id.
df = df.set_index('_id')
print("... set user id as the index")

# A column of ones, handy for counting rows per group.
df['const'] = 1
print("... created a constant value")

# n is the number of users left in the sample.
n = df.shape[0]
print("There are {} users in the data frame.".format(n))

In [None]:
print(df.columns)

In [None]:
# Names of columns flagged for removal; the per-column cells below add to it.
drop_set = set()

# _acl

In [None]:
# What is it?
#   looks like read/write properties. everything seems set to True for everyone.
#
# What do I do with it?
#   delete -- flag the column in drop_set; no feature is derived from it

drop_set.add('_acl')

# _auth_data_facebook

In [None]:
# What is it?
#   information about Facebook authorization
#
# What do I do with it?
#   reduce it to a 0/1 has_facebook flag (1 when the value is a dict)
#   and flag the raw column for dropping

facebook_flags = df['_auth_data_facebook'].apply(lambda auth: int(isinstance(auth, dict)))
df['has_facebook'] = facebook_flags
print("{:.3}% of users have linked to facebook.".format(100 * facebook_flags.sum() / n))
drop_set.add('_auth_data_facebook')

# _created_at

In [None]:
# What is it?
#   date the account was created
#
# What do I do with it?
#   get date, year, month, ISO week, day-of-year/month/week, ordinal day
#   and hour-of-day information

df['created_date'] = df._created_at.apply(lambda x: x.date())

df['created_year'] = df._created_at.apply(lambda x: x.year)
df['created_day_of_year'] = df._created_at.apply(lambda x: int(x.strftime('%j')))

# isocalendar()[0] is the ISO *year*, not the month -- read the month directly.
df['created_month'] = df._created_at.apply(lambda x: x.month)
df['created_day_of_month'] = df._created_at.apply(lambda x: x.day)

# ISO week number and ISO weekday (Monday=1 ... Sunday=7).
df['created_week'] = df._created_at.apply(lambda x: x.isocalendar()[1])
df['created_day_of_week'] = df._created_at.apply(lambda x: x.isoweekday())

# Proleptic Gregorian ordinal of the signup day, plus the hour (0-23).
df['created_day'] = df._created_at.apply(lambda x: x.toordinal())
df['created_hour'] = df._created_at.apply(lambda x: x.hour)

# _hashed_password

In [None]:
# What is it?
#   some people have one, some don't
#
# What do I do with it?
#   reduce it to a 0/1 has_password flag (1 when the value is a string)
#   and flag the raw column for dropping

password_flags = df['_hashed_password'].apply(lambda raw: int(isinstance(raw, str)))
df['has_password'] = password_flags
print("{:.3}% of users have a password.".format(100 * password_flags.sum() / n))
drop_set.add('_hashed_password')

# _p_room

In [None]:
# What is it?
#   does the person have a room already?
#
# What do I do with it?
#   make it binary (1 when the value is a string), then flag the raw
#   `_p_room` column for dropping

df['has_room'] = df._p_room.apply(lambda x: 1 if isinstance(x, str) else 0)
print("{:.3}% of users already have a room".format(100*df.has_room.sum()/n))
# Flag this cell's own raw column (was a copy-paste of '_hashed_password').
drop_set.add('_p_room')

# _rperm

In [None]:
# What is it?
#   a list of an asterisk and the user's id
#
# What do I do with it?
#   delete it -- flag the column in drop_set

drop_set.add('_rperm')

# _updated_at

In [None]:
# What is it?
#   one of three dates in Jan 2018
#
# What do I do with it?
#   delete it

# Reduce each update timestamp to its calendar date, show the distinct dates,
# then flag both the raw column and the helper column for dropping.
updated_dates = df['_updated_at'].apply(lambda stamp: stamp.date())
df['_updated_at_date'] = updated_dates
print(df['_updated_at_date'].unique())

drop_set.update(('_updated_at', '_updated_at_date'))

# about

In [None]:
# What is it?
#   users' self-descriptions
#
# What do I do with it?
#   replace missing values with empty strings
#   len_about: number of characters in the text
#   has_about: 1 when the text is non-empty, else 0

df['about'] = df['about'].fillna("")
df['len_about'] = df['about'].apply(len)
df['has_about'] = df['len_about'].apply(lambda length: int(length > 0))
print("{:.3}% of users have an about section".format(100 * df['has_about'].sum() / n))

# activeAt

In [None]:
# What is it?
#   most recent activity
#
# What do I do with it?
#   delete it -- flag the column in drop_set
#   has data from after the break point --> data bleed

drop_set.add('activeAt')

# amenities

In [None]:
# What is it?
#   amenities people have asked for
#
# What do I do with it?
#   len_amenities: number of entries when the value is a list, else 0
#   has_amenities: 1 when at least one amenity is listed, else 0
#   perhaps later on tokenize each amenity

amenity_counts = df['amenities'].apply(
    lambda wanted: len(wanted) if isinstance(wanted, list) else 0)
df['len_amenities'] = amenity_counts
df['has_amenities'] = df['len_amenities'].apply(lambda count: int(count > 0))

# available

In [None]:
# What is it?
#   date person needs a room
#
# What do I do with it?
#   turn into a date (done when the frame was loaded)
#   create has_available
#   create td_creat_avail: whole days from signup date to availability date
#
#   Maybe in the future,
#   fix dates that appear misentered, or somehow code the dates as "past" or "future"

df['has_available'] = df.available.apply(lambda x: 1 if isinstance(x, dt.date) else 0)
# pd.isnull covers NaT, NaN and None through the public API, instead of an
# isinstance check against the private pd._libs.tslib.NaTType.
df['td_creat_avail'] = (df.available-df.created_date).apply(
    lambda x: None if pd.isnull(x) else x.days)

# birthday

In [None]:
# What is it?
#   their birthday, everyone put in a birthday
#
# What do I do with it?
#   age: 2018 minus the birth year (approximate)
#   has_birthdate: 0 when the stored date is January 1st, else 1
#     (presumably January 1st is a placeholder -- TODO confirm)
#   flag the raw column for dropping
#
#   in future: summer vs winter babies? (might be clutching at straws here)

birth_years = df['birthday'].apply(lambda born: born.year)
df['age'] = 2018 - birth_years
df['has_birthdate'] = df['birthday'].apply(lambda born: int(str(born)[5:] != '01-01'))

drop_set.add('birthday')

# blocked

In [None]:
# What is it?
#   who have they been blocked by?
#
# What do I do with it?
#   has_block: 1 when the value is a list, else 0
#   flag the raw column for dropping

block_flags = df['blocked'].apply(lambda entry: int(isinstance(entry, list)))
df['has_block'] = block_flags
print("{} users have been blocked".format(block_flags.sum()))
drop_set.add('blocked')

# college

In [None]:
# What is it?
#   where did they go to college
#
# What do I do with it?
#   make has_college (1 when the value is a string, else 0)

df['has_college'] = df.college.apply(lambda x: 1 if isinstance(x, str) else 0)
# The value is scaled by 100, so label it as a percentage like the other cells.
print("{:.3}% of users included a college".format(100*df.has_college.sum()/n))

# email

In [None]:
# What is it?
#   what is their email
#
# What do I do with it?
#   make has_email (1 when the value is a string, else 0)
#   drop email because security

df['has_email'] = df.email.apply(lambda x: 1 if isinstance(x, str) else 0)
# The value is scaled by 100, so label it as a percentage like the other cells.
print("{:.3}% of users included an email".format(100*df.has_email.sum()/n))
drop_set.add('email')

# emailVerified

In [None]:
# What is it?
#   just nan or 0
#
# What do I do with it?
#   delete it -- flag the column in drop_set

drop_set.add('emailVerified')

# facebookId

In [None]:
# What is it?
#   user's facebookId, duplicated info from _auth_data_facebook
#
# What do I do with it?
#   delete it -- flag the column in drop_set

drop_set.add('facebookId')

# firstName

In [None]:
# What is it?
#   user's name, major privacy issues
#
# What do I do with it?
#   delete it -- flag the column in drop_set

drop_set.add('firstName')

# foundRoommate

In [None]:
# What is it?
#   users who stopped using the app?
#   was told to ignore it
#
# What do I do with it?
#   delete it -- flag the column in drop_set

drop_set.add('foundRoommate')

# gender

In [None]:
# What is it?
#   male/female
#   no other options, no nans
#
# What do I do with it?
#   binary 1s and 0s (male -> 1, anything else -> 0)

# Accepting an existing 1 keeps the cell safe to re-run on encoded data.
df['gender'] = df.gender.apply(lambda x: 1 if x == 'male' or x == 1 else 0)
# Report from the column that was just written (there is no `gender2`).
print("{:.3}% of users are male".format(100*df.gender.sum()/n))

In [None]:
# Bar chart of how many users do / do not have an about section, with the
# y tick labels rewritten as fractions of all users.
temp = df.groupby('has_about')['const'].count()
ax = temp.plot.bar(width=.6)

total_n = len(df)
y_ticks = ax.get_yticks()
ylabels = [round(tick / total_n, 2) for tick in y_ticks]
ax.set_yticklabels(ylabels)

ax.set_ylabel("Relative Density")
ax.set_xlabel("")
ax.set_title("Has About", fontsize=16)

In [None]:
def get_hoods(lst):
    """Collect the ``'objectId'`` of every entry in ``lst`` into a set.

    Returns an empty set when ``lst`` is not a list (for example a missing
    value).
    """
    if not isinstance(lst, list):
        return set()
    return {entry['objectId'] for entry in lst}

def get_city(lst,metro_dict):
    """Return the metro area recorded for the first neighborhood id in ``lst``.

    Parameters
    ----------
    lst : sized collection of neighborhood ids
        Only the first element yielded by ``list(lst)`` is looked up; for a
        set that element is arbitrary.
    metro_dict : dict
        Maps a neighborhood id to a record holding a ``'metro'`` entry.

    Returns
    -------
    ``None`` when ``lst`` is empty, the record's ``'metro'`` value when the
    lookup succeeds, and ``'Unknown'`` when the id or the ``'metro'`` entry
    is missing.
    """
    if len(lst) == 0:
        return None
    try:
        return metro_dict[list(lst)[0]]['metro']
    except (KeyError, TypeError):
        # Unknown id, record without 'metro', or an unhashable/odd id:
        # report 'Unknown' without hiding unrelated errors behind a bare except.
        return 'Unknown'
    
    
    # NOTE(review): everything from here to the end of the cell is indented
    # under get_city and comes after every path of that function has already
    # returned, so none of it ever runs. It looks like a pasted pipeline body
    # -- dedent it into its own cell or remove it; confirm which is intended.

    # make a dictionary of all the neighborhoods
    metro_df = pd.read_json(metro_filename)
    metro_df = metro_df.drop(['_created_at','_updated_at','city','name'], axis=1)
    metro_dict = metro_df.set_index('_id').to_dict('index')

    df.neighborhoods = df.neighborhoods.apply(get_hoods)
    print("... got neighborhoods")
    df['metro'] = df.neighborhoods.apply(lambda x: get_city(x,metro_dict))
    print("... got metro areas")

    # create new features
    df = df.fillna({'about':''})
    df['age'] = 2018-df['birthday'].apply(lambda x: x.year)
    df['len_about'] = df.about.apply(lambda x: len(x))
    df['has_about'] = df.len_about > 0
    # counts of 'I', '.', '?' and '!' in the about text, plus each count
    # divided by the text length (NaN for empty text)
    df['I_count'] = df.about.apply(lambda x: x.count('I'))
    df['I_ratio'] = df.about.apply(lambda x: x.count('I')/len(x) if len(x) > 0 else np.nan)
    df['period_count'] = df.about.apply(lambda x: x.count('.'))
    df['period_ratio'] = df.about.apply(lambda x: x.count('.')/len(x) if len(x) > 0 else np.nan)
    df['question_count'] = df.about.apply(lambda x: x.count('?'))
    df['question_ratio'] = df.about.apply(lambda x: x.count('?')/len(x) if len(x) > 0 else np.nan)
    df['exclaim_count'] = df.about.apply(lambda x: x.count('!'))
    df['exclaim_ratio'] = df.about.apply(lambda x: x.count('!')/len(x) if len(x) > 0 else np.nan)
    df['sentence_count'] = df.period_count+df.question_count+df.exclaim_count
    df['sentence_ratio'] = df.period_ratio+df.question_ratio+df.exclaim_ratio
    # NOTE(review): the _p_room cell already stores has_room as 0/1, so this
    # isinstance(str) check would make every value False if it ever ran.
    df.has_room = df.has_room.apply(lambda x: isinstance(x,str))
    df['has_facebookId'] = df.facebookId.apply(lambda x: isinstance(x,str))
    df['has_linkedinId'] = df.linkedinId.apply(lambda x: isinstance(x,str))
    df['has_picture'] = df.picture.apply(lambda x: isinstance(x,str))
    # NOTE(review): no `created` column is built in the visible cells --
    # confirm it exists in the raw data before reviving this line.
    df['timeframe'] = df.available-df.created
    print("... added new features")

In [None]:
# Absolute, machine-specific paths to the raw JSON exports.
# user_filename repeats the value assigned in the user-loading cell.
user_filename = "/Users/gandalf/Documents/coding/data_do_not_commit/raw_data_users.json"
metro_filename = "/Users/gandalf/Documents/coding/data_do_not_commit/raw_data_neighborhoods.json"

In [None]:
# Load the neighborhood export, discard the columns that are not needed, and
# build a lookup keyed by neighborhood `_id` (one dict of fields per id).
unused_columns = ['_created_at', '_updated_at', 'city', 'name']
metro_df = pd.read_json(metro_filename).drop(unused_columns, axis=1)
metro_dict = metro_df.set_index('_id').to_dict(orient='index')


In [None]:
metro_df.head()

In [None]:
metro_dict

In [None]:
# Density of 2017 signups over the day of the year.
df2017 = df[df.created_year == 2017]
ax = df2017['created_day_of_year'].plot(kind='kde')

# Pin the tick positions before labelling them; otherwise the date labels are
# attached to whatever ticks matplotlib picked and drift once xlim changes.
tick_days = list(range(0, 400, 50))
xlabels = [str(dt.date(2017,1,1) + dt.timedelta(x)) for x in tick_days]
ax.set_xticks(tick_days)
ax.set_xticklabels(xlabels)

plt.xticks(rotation=45)

ax.set_xlim(0,365)
ax.set_ylim(0,.008)
ax.set_ylabel("Relative Density")
ax.set_xlabel("")
plt.title("User Signups - Date", fontsize=16)

In [None]:
# Signups per day of the month, with y tick labels shown as fractions of all
# users. size() counts rows per group without relying on a `created` column,
# which none of the visible cells create.
temp = df.groupby("created_day_of_month").size()
ax = temp.plot(kind='bar',width=.6)

y_ticks = ax.get_yticks()
total_n = len(df)
ylabels = [round(x/total_n,2) for x in y_ticks]
ax.set_yticklabels(ylabels)

ax.set_ylabel("Relative Density")
ax.set_xlabel("")
plt.title("User Signups - Day of Month", fontsize=16)

In [None]:
# Signups per ISO weekday (1=Monday ... 7=Sunday), with y tick labels shown as
# fractions of all users. size() avoids depending on a `created` column, and
# reindexing to 1..7 guarantees seven bars so the labels below always line up,
# even if some weekday has no signups.
temp = df.groupby("created_day_of_week").size().reindex(range(1, 8), fill_value=0)
ax = temp.plot(kind='bar',width=.6)
x_labels = ["Mon","Tues","Weds","Thurs","Fri","Sat","Sun"]
ax.set_xticklabels(x_labels)

y_ticks = ax.get_yticks()
total_n = len(df)
ylabels = [round(x/total_n,2) for x in y_ticks]
ax.set_yticklabels(ylabels)

ax.set_ylabel("Relative Density")
ax.set_xlabel("")
plt.title("User Signups - Day of Week", fontsize=16)

In [None]:
# Histogram of signup hour (0-23), with y tick labels shown as fractions of
# all users.
ax = df['created_hour'].plot(kind='hist',bins=24)

# Pin the ticks to the hours the labels describe; labelling the automatic
# ticks would attach "Midnight" etc. to the wrong positions.
hour_ticks = [0, 5, 10, 15, 20]
x_labels = ["Midnight", "5 AM", "10 AM", "3 PM","8 PM"]
ax.set_xticks(hour_ticks)
ax.set_xticklabels(x_labels)
ax.set_xlim(0,23)

y_ticks = ax.get_yticks()
total_n = len(df)

ylabels = [round(x/total_n,3) for x in y_ticks]
ax.set_yticklabels(ylabels)

ax.set_ylabel("Relative Density")
ax.set_xlabel("")
plt.title("User Signups - Time of Day", fontsize=16)

In [None]:
# Users who gave an availability date, with the signup-to-availability gap as
# a timedelta and as whole days. The timedelta column is computed here because
# no earlier cell creates `timedelta_created_available`; .copy() makes temp an
# independent frame so the new columns are not written onto a slice of df.
temp = df[df.has_available == 1].copy()
temp['timedelta_created_available'] = pd.to_timedelta(temp.available - temp.created_date)
temp['timedelta_created_available2'] = temp.timedelta_created_available.dt.days
temp[['created_date','available','timedelta_created_available','timedelta_created_available2']].tail()

In [None]:
# Distribution of the signup-to-availability gap, in one-day bins with edges
# running from 0 to 364.
plt.hist(temp['timedelta_created_available2'], bins=range(365))
plt.show()

In [None]:
# Users whose availability date falls before their signup date.
past_days = temp.timedelta_created_available2[temp.timedelta_created_available2 < 0]
print("{} users put an availability in the past.".format(len(past_days)))

# One-day bins with edges -365 ... 0. The previous edges stopped at -2, which
# left users who were exactly one day in the past out of the histogram.
plt.hist(past_days, bins=np.arange(-365, 1))
plt.show()

In [None]:
# Count the users whose availability is more than 365 days after signup, then
# plot the gap in one-day bins with edges running from 365 to 729.
far_future = temp['timedelta_created_available2'] > 365
print("{} users put an availability more than a year in the future.".format(far_future.sum()))

plt.hist(temp['timedelta_created_available2'], bins=np.arange(365, 730))
plt.show()