# IMPORTS

In [None]:
import pandas as pd
import numpy as np

import datetime as dt

import re

import json

# READ IN USER INFORMATION

In [None]:
# Directory that holds the raw JSON exports; file names are appended to it
# directly, so it has to end with a slash.
data_file_path = "/Users/gandalf/Documents/coding/do_not_commit/capstone/"

# Load the raw user records into the master frame DF.
DF = pd.read_json("{}{}".format(data_file_path, "raw_data_users.json"))
print("... read in dataframe")

In [None]:
# Work on a copy so the raw frame DF is left unmodified.
df = DF.copy()

# Keep only onboarded users: collect the row labels whose `onboarded`
# value is not 1 and drop exactly those rows.
df = df.drop(index=df.loc[df.onboarded != 1].index)
print("... dropped users that aren't onboarded")

# convert to datetimes
def my_to_datetime(x):
    """Convert a ``{'$date': ...}`` wrapper into a pandas timestamp.

    Parameters
    ----------
    x : object
        Raw cell value.  Only dicts are converted.

    Returns
    -------
    The result of ``pd.to_datetime(x['$date'])``, or ``None`` when ``x`` is
    not a dict, has no ``'$date'`` key, or the value cannot be parsed.
    """
    if not isinstance(x, dict):
        return None
    try:
        return pd.to_datetime(x['$date'])
    except Exception:
        # Best-effort parse: a malformed record becomes None.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

def my_to_date(x):
    """Convert a ``{'$date': ...}`` wrapper into a calendar date.

    Parameters
    ----------
    x : object
        Raw cell value.  Only dicts are converted.

    Returns
    -------
    ``pd.to_datetime(x['$date']).date()``, or ``None`` when ``x`` is not a
    dict, has no ``'$date'`` key, or the value cannot be parsed.
    """
    if not isinstance(x, dict):
        return None
    try:
        return pd.to_datetime(x['$date']).date()
    except Exception:
        # Best-effort parse: a malformed record becomes None.  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
# Unwrap the {'$date': ...} values: timestamps for three columns,
# plain calendar dates for the other two.
df['_created_at'] = df['_created_at'].apply(my_to_datetime)
df['_updated_at'] = df['_updated_at'].apply(my_to_datetime)
df['activeAt'] = df['activeAt'].apply(my_to_datetime)
df['available'] = df['available'].apply(my_to_date)
df['birthday'] = df['birthday'].apply(my_to_date)
print("... changed to datetimes")

# Index the frame by user id.
df = df.set_index(keys='_id')
print("... set user id as the index")

# A column of ones, for easy counting.
df['const'] = 1
print("... created a constant value")

# n = number of users in the sample; used as the denominator of the
# percentages printed further down.
n = df.shape[0]
print("There are {} users in the data frame.".format(n))

# INITIAL DATA INVESTIGATION

In [None]:
# Names of the columns to remove from df; filled in column by column below.
drop_set = set()

# List every column so each one can be reviewed in turn.
print(df.columns)

## _acl

In [None]:
# What is it?
#   Looks like read/write properties; everything seems to be set to True
#   for everyone.
#
# What do I do with it?
#   Delete it.

drop_set.add('_acl')

## _auth_data_facebook

In [None]:
# What is it?
#   Information about Facebook authorization.
#
# What do I do with it?
#   Turn it into a has_facebook binary, then drop the raw column.

df['has_facebook'] = df['_auth_data_facebook'].apply(
    lambda auth: int(isinstance(auth, dict)))
print("{:.3}% of users have linked to facebook.".format(
    100 * df['has_facebook'].sum() / n))
drop_set.add('_auth_data_facebook')


## _created_at

In [None]:
# What is it?
#   Date the account was created.
#
# What do I do with it?
#   Break it into calendar features (date, year, day of year, month, day of
#   month, ISO week, weekday, ordinal day, hour), then drop the raw column.

df['created_date'] = df._created_at.apply(lambda x: x.date())

df['created_year'] = df._created_at.apply(lambda x: x.year)
df['created_day_of_year'] = df._created_at.apply(lambda x: int(x.strftime('%j')))

# Calendar month (1-12).  isocalendar()[0] is the ISO *year*, not the month,
# so the month attribute is used here.
df['created_month'] = df._created_at.apply(lambda x: x.month)
df['created_day_of_month'] = df._created_at.apply(lambda x: x.day)

# ISO-8601 week number and weekday (Monday=1 ... Sunday=7).  Note that the
# ISO week belongs to the ISO year, which can differ from created_year in
# the first/last days of a calendar year.
df['created_week'] = df._created_at.apply(lambda x: x.isocalendar()[1])
df['created_day_of_week'] = df._created_at.apply(lambda x: x.isoweekday())

# Ordinal day number: a single running day counter.
df['created_day'] = df._created_at.apply(lambda x: x.toordinal())
df['created_hour_of_day'] = df._created_at.apply(lambda x: x.hour)

drop_set.add('_created_at')

## _hashed_password

In [None]:
# What is it?
#   Some people have one, some don't.
#
# What do I do with it?
#   Make it binary, then drop the hash itself.

df['has_password'] = df['_hashed_password'].apply(
    lambda pw: int(isinstance(pw, str)))
print("{:.3}% of users have a password.".format(
    100 * df['has_password'].sum() / n))
drop_set.add('_hashed_password')

## _p_room

In [None]:
# What is it?
#   Does the person have a room already?
#
# What do I do with it?
#   Make it binary (1 when the cell holds a string), then drop the raw column.

df['yes_room'] = df['_p_room'].apply(lambda room: int(isinstance(room, str)))
print("{:.3}% of users already have a room".format(
    100 * df['yes_room'].sum() / n))
drop_set.add('_p_room')

## _rperm

In [None]:
# What is it?
#   A list containing an asterisk and the user's id.
#
# What do I do with it?
#   Delete it.

drop_set.add('_rperm')

## _updated_at

In [None]:
# What is it?
#   One of three dates in Jan 2018.
#
# What do I do with it?
#   Delete it (after printing the distinct dates).

# Reduce each update timestamp to its calendar date and show the distinct values.
df['_updated_at_date'] = df['_updated_at'].apply(lambda stamp: stamp.date())
print(df['_updated_at_date'].unique())

# Neither the raw timestamp nor the helper date column is kept.
drop_set.update(('_updated_at', '_updated_at_date'))

## _wperm

In [None]:
# What is it?
#   The user id.
#
# What do I do with it?
#   Drop it.

drop_set.add('_wperm')

## about

In [None]:
# What is it?
#   Users' self-descriptions.
#
# What do I do with it?
#   - replace NaN with empty strings
#   - len_about: length of the text
#   - has_about: 1 when the text is non-empty

df['about'] = df['about'].fillna("")
df['len_about'] = df['about'].apply(len)
df['has_about'] = df['len_about'].apply(lambda n_chars: int(n_chars > 0))
print("{:.3}% of users have an about section".format(
    100 * df['has_about'].sum() / n))

## activeAt

In [None]:
# What is it?
#   Most recent activity.
#
# What do I do with it?
#   Delete it: it contains data from after the break point, which would
#   bleed future information into the features.

drop_set.add('activeAt')

## amenities

In [None]:
# What is it?
#   Amenities people have asked for.
#
# What do I do with it?
#   - turn each list into a set
#   - len_amenities: number of distinct amenities
#   - has_amenities: 1 when at least one amenity is listed
#   - perhaps later on: tokenize each amenity

# A value that is already a set is kept as is, so re-running this cell does
# not wipe the column; lists become sets and anything else an empty set.
df.amenities = df.amenities.apply(
    lambda x: x if isinstance(x, set)
    else (set(x) if isinstance(x, list) else set()))
df['len_amenities'] = df.amenities.apply(lambda x: len(x))
df['has_amenities'] = df.len_amenities.apply(lambda x: 1 if x > 0 else 0)

## available

In [None]:
# What is it?
#   Date the person needs a room.
#
# What do I do with it?
#   - has_available: 1 when a date is present
#   - td_creat_avail: days between account creation and the needed date
#
#   Maybe in the future: fix dates that appear mis-entered, or somehow code
#   the dates as "past" or "future".

# pd.NaT passes an isinstance(..., dt.date) check, so missing values are
# excluded explicitly.
df['has_available'] = df.available.apply(
    lambda x: 1 if isinstance(x, dt.date) and not pd.isnull(x) else 0)

# pd.isnull covers NaT, NaN and None through the public API, avoiding the
# private, version-dependent pd._libs.tslib.NaTType path.
df['td_creat_avail'] = (df.available-df.created_date).apply(
    lambda x: None if pd.isnull(x) else x.days)

## birthday

In [None]:
# What is it?
#   Their birthday; everyone put in a birthday.
#
# What do I do with it?
#   - age: approximate age, measured against the fixed year 2018
#   - has_birthdate: 0 when the month/day part is 01-01, else 1
#     (presumably 1 January is a placeholder -- TODO confirm)
#
#   In future: summer vs winter babies? (might be clutching at straws here)

df['age'] = 2018 - df['birthday'].apply(lambda born: born.year)
df['has_birthdate'] = df['birthday'].apply(
    lambda born: int(str(born)[5:] != '01-01'))

drop_set.add('birthday')

## blocked

In [None]:
# What is it?
#   Who have they been blocked by?
#
# What do I do with it?
#   Make a yes_block binary (1 when the cell holds a list), then drop the
#   raw column.

df['yes_block'] = df['blocked'].apply(lambda who: int(isinstance(who, list)))
print("{} users have been blocked".format(df['yes_block'].sum()))
drop_set.add('blocked')

## college

In [None]:
# What is it?
#   Where did they go to college?
#
# What do I do with it?
#   Make has_college (1 when the cell holds a string).

df['has_college'] = df.college.apply(lambda x: 1 if isinstance(x, str) else 0)
# The value printed is a percentage, so the message carries a % sign like
# the other summaries in this notebook.
print("{:.3}% of users included a college".format(100*df.has_college.sum()/n))

## email

In [None]:
# What is it?
#   Their email address.
#
# What do I do with it?
#   - make has_email (1 when the cell holds a string)
#   - get information about the email provider
#   - drop the email itself for security reasons
#

df['has_email'] = df.email.apply(lambda x: 1 if isinstance(x, str) else 0)
# The value printed is a percentage, so the message carries a % sign like
# the other summaries in this notebook.
print("{:.3}% of users included an email".format(100*df.has_email.sum()/n))

def get_domain(x):
    """Return the trailing ``@domain`` part of an email address.

    Parameters
    ----------
    x : object
        Raw cell value; anything that is not a string yields ``''``.

    Returns
    -------
    str
        The match including the leading ``@`` (e.g. ``'@example.com'``), or
        ``''`` when ``x`` is not a string or has no domain-like suffix.
    """
    if not isinstance(x, str):
        return ''
    # Raw string so that ``\.`` reaches the regex engine as an escaped dot
    # instead of being an invalid string escape.
    match = re.search(r'(@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)', x)
    return match.group() if match else ''
# Keep only the '@domain' part of each address ('' when there is none).
df['email_domain'] = df.email.apply(get_domain)

# The full address itself is not kept.
drop_set.add('email')

## emailVerified

In [None]:
# What is it?
#   Just NaN or 0.
#
# What do I do with it?
#   Delete it!

drop_set.add('emailVerified')

## facebookId

In [None]:
# What is it?
#   The user's Facebook id; duplicates information from _auth_data_facebook.
#
# What do I do with it?
#   Delete it!

drop_set.add('facebookId')

## firstName

In [None]:
# What is it?
#   The user's first name; major privacy issues.
#
# What do I do with it?
#   Delete it!

drop_set.add('firstName')

## foundRoommate

In [None]:
# What is it?
#   Users who stopped using the app?
#   Was told to ignore it.
#
# What do I do with it?
#   Delete it!

drop_set.add('foundRoommate')

## gender

In [None]:
# What is it?
#   male/female
#   No other options, no NaNs.
#
# What do I do with it?
#   Binary 1s and 0s (1 = male).


# `x == 1` keeps values that are already encoded, so the cell can be re-run.
df['gender'] = df.gender.apply(lambda x: 1 if x == 'male' or x == 1 else 0)
# The value printed is a percentage, so the message carries a % sign like
# the other summaries in this notebook.
print("{:.3}% of users are male".format(100*df.gender.sum()/n))

## groupChat

In [None]:
# What is it?
#   1 or NaN.
#   About 97% 1s.
#   Could be leakage if it tells whether a user has ever chatted.
#
# What do I do with it?
#   Drop it.

drop_set.add('groupChat')

## hobbies

In [None]:
# What is it?
#   Hobbies people have listed.
#
# What do I do with it?
#   - turn each list into a set
#   - len_hobbies: number of distinct hobbies
#   - has_hobbies: 1 when at least one hobby is listed
#   - perhaps later on: tokenize each hobby


# A value that is already a set is kept as is, so re-running this cell does
# not wipe the column; lists become sets and anything else an empty set.
df.hobbies = df.hobbies.apply(
    lambda x: x if isinstance(x, set)
    else (set(x) if isinstance(x, list) else set()))
df['len_hobbies'] = df.hobbies.apply(lambda x: len(x))
df['has_hobbies'] = df.len_hobbies.apply(lambda x: 1 if x >0 else 0)

## hometown, hometownCity, hometownCountry, hometownCounty, hometownState

In [None]:
# What is it?
#   e.g.: San Francisco, CA, USA
#         San Francisco
#         United States
#         San Francisco County
#         California
#   'hometown' is duplicated in the later columns.
#
# What do I do with it?
#   Turn it into has_hometown (1 when the cell holds a string).
#   Keep the raw columns for now:
#          same_hometown?
#          EEUU vs USA

df['has_hometown'] = df.hometown.apply(lambda x: 1 if isinstance(x, str) else 0)
# The value printed is a percentage, so the message carries a % sign like
# the other summaries in this notebook.
print("{:.3}% of users shared their hometown".format(100*df.has_hometown.sum()/n))

## inRelationship, isClean, isNight, isStudent

In [None]:
# What is it?
#   A bunch of binaries.
#
# What do I do with it?
#   Nothing needs doing, already done!  Only the share of 1s is reported.

# Each value printed is a percentage, so the messages carry a % sign like
# the other summaries in this notebook.
print("{:.3}% of users identify as in a relationship.".
      format(100*df.inRelationship.sum()/n))
print("{:.3}% of users identify as clean.".
      format(100*df.isClean.sum()/n))
print("{:.3}% of users identify as night owls.".
      format(100*df.isNight.sum()/n))
print("{:.3}% of users identify as students.".
      format(100*df.isStudent.sum()/n))

## lastName

In [None]:
# What is it?
#   The user's last name; major privacy issues.
#
# What do I do with it?
#   Drop it like it's hot!

drop_set.add('lastName')

## likes

In [None]:
# What is it?
#   Appears to be all NaNs.
#
# What do I do with it?
#   Drop it.

drop_set.add('likes')

## linkedinId

In [None]:
# What is it?
#   LinkedIn id.
#
# What do I do with it?
#   Turn it into a has_linkedin binary, then drop the raw column.

df['has_linkedin'] = df['linkedinId'].apply(
    lambda linkedin_id: int(isinstance(linkedin_id, str)))
print("{:.3}% of users have added a linkedin ID.".format(
    100 * df['has_linkedin'].sum() / n))
drop_set.add('linkedinId')

## location

In [None]:
# What is it?
#   GPS.
#
# What do I do with it?
#   Turn it into a binary (1 when the cell holds a list).
#   Also: is there information in the differing lengths?

df['has_location'] = df['location'].apply(
    lambda coords: int(isinstance(coords, list)))
print("{:.3}% of users have a GPS location.".format(
    100 * df['has_location'].sum() / n))

## maxCost

In [None]:
# What is it?
#   Max and min cost.
#
# What do I do with it?
#   Binary it!
#   It won't binary.
#   Create a rent range?

# NOTE(review): the 92% below is a hard-coded literal, not computed from df;
# it will not follow changes in the data.
print("92% of users included a rent range.")

# Width of the acceptable rent band (maxCost minus minCost).
df['rent_range'] = df.maxCost-df.minCost

## neighborhoods

In [None]:
# What is it?
#   Id for a neighborhood (will have to look up).
#
# What do I do with it?
#   - extract just the neighborhood id
#   - binary it!
#   - get city, metro, neighborhood info

# Make Metro Dictionary
# Built from data_file_path so the data directory is defined in one place
# only (same directory the user data is read from and the results saved to).
filename = data_file_path + "raw_data_neighborhoods.json"
metro_df = pd.read_json(filename).drop(['_created_at','_updated_at'], axis=1).set_index('_id')

# One lookup table per attribute, each keyed by neighborhood id.
metro_dict_city = metro_df.city.to_dict()
metro_dict_metro = metro_df.metro.to_dict()
metro_dict_name = metro_df.name.to_dict()

def get_from_dict(d, x):
    """Look up ``x`` in ``d``, falling back to ``'Other'``.

    Parameters
    ----------
    d : mapping
        Lookup table.
    x : object
        Key to look up.

    Returns
    -------
    ``d[x]`` when the lookup succeeds, otherwise the string ``'Other'``.
    """
    try:
        return d[x]
    except (LookupError, TypeError):
        # LookupError: key not present.  TypeError: unhashable key or a
        # container that cannot be subscripted.  Unrelated errors and
        # KeyboardInterrupt are no longer swallowed.
        return 'Other'

def get_hoods(lst):
    """Reduce a list of ``{'objectId': ...}`` records to a set of ids.

    A set is returned unchanged, so the cell can be run more than once;
    anything that is neither a set nor a list gives an empty set.
    """
    if isinstance(lst, set):
        # Already converted by an earlier run of the cell.
        return lst
    if not isinstance(lst, list):
        return set()
    return {entry['objectId'] for entry in lst}
      
# Set of neighborhood ids per user, plus its size and a presence flag.
df['neighborhoods'] = df['neighborhoods'].apply(get_hoods)
df['len_neighborhoods'] = df['neighborhoods'].apply(len)
df['has_neighborhoods'] = df['len_neighborhoods'].apply(lambda k: int(k > 0))

# Translate the ids into sets of city / metro / neighborhood names; ids
# missing from a lookup table come back as 'Other'.
df['neighborhoods_city'] = df['neighborhoods'].apply(
    lambda ids: set(get_from_dict(metro_dict_city, hood) for hood in ids))
df['neighborhoods_metro'] = df['neighborhoods'].apply(
    lambda ids: set(get_from_dict(metro_dict_metro, hood) for hood in ids))
df['neighborhoods_name'] = df['neighborhoods'].apply(
    lambda ids: set(get_from_dict(metro_dict_name, hood) for hood in ids))

print("{:.3}% of users included at least one prefered neighborhood.".format(
    100 * df['has_neighborhoods'].sum() / n))

# The raw id sets are dropped.
drop_set.add('neighborhoods')

## numRoommates

In [None]:
# What is it?
#   Number of roommates.
#
# What do I do with it?
#   Binary it: 1 when the value is >= 0, otherwise 0 (a NaN compares False
#   and therefore gives 0).

df['has_numRoommates'] = df['numRoommates'].apply(lambda count: int(count >= 0))
print("{:.3}% of users specified a number of roommates.".format(
    100 * df['has_numRoommates'].sum() / n))

## onboarded

In [None]:
# What is it?
#   Used to filter out users.
#
# What do I do with it?
#   Drop it.

drop_set.add('onboarded')

## petsOk

In [None]:
# What is it?
#   Are pets okay?
#
# What do I do with it?
#   Already perfect!  Only the share of 1s is reported.

print("{:.3}% of users are okay with pets.".
      format(100*df.petsOk.sum()/n))

## picture

In [None]:
# What is it?
#   Picture id.
#
# What do I do with it?
#   Make a has_picture binary, then drop the id.

df['has_picture'] = df['picture'].apply(lambda pic: int(isinstance(pic, str)))
print("{:.3}% of users added a picture.".format(
    100 * df['has_picture'].sum() / n))

drop_set.add('picture')

## positions

In [None]:
# What is it?
#   Not sure. Address? Job? From LinkedIn?
#
# What do I do with it?
#   Report how many users have it, then drop both the raw column and the
#   helper flag.

df['yes_positions'] = df['positions'].apply(
    lambda pos: int(isinstance(pos, list)))
print("{:.3}% of users have positions, whatever that is.".format(
    100 * df['yes_positions'].sum() / n))

drop_set.update(('positions', 'yes_positions'))

## recommended

In [None]:
# What is it? 
#   list of user ids?
#   99.996% are 100 terms long
#
# What do I do with it? 
#   back burner it
#   delete 4 now


def get_recomendations(lst):
    """Reduce a list of ``{'objectId': ...}`` records to a set of ids.

    A set is returned unchanged, so the cell can be run more than once;
    anything that is neither a set nor a list gives an empty set.
    """
    if isinstance(lst, set):
        # Already converted by an earlier run of the cell.
        return lst
    if not isinstance(lst, list):
        return set()
    return {entry['objectId'] for entry in lst}

# Use the helper defined in this cell, so the cell does not depend on the
# neighborhoods cell having been run first.
df['recommended'] = df.recommended.apply(get_recomendations)
df['len_recommended'] = df.recommended.apply(len)
df['yes_100recommended'] = df.len_recommended.apply(lambda x: 1 if x == 100 else 0)

print("{:.5}% of users have 100 recomended user ids.".
      format(100*df.yes_100recommended.sum()/n))

# Back-burnered for now: the raw column and both helper columns are dropped.
drop_set.add('recommended')
drop_set.add('len_recommended')
drop_set.add('yes_100recommended')

## smokingOk

In [None]:
# What is it?
#   Is smoking okay?
#
# What do I do with it?
#   Already perfect!  Only the share of 1s is reported.

print("{:.3}% of users are okay with smoking.".
      format(100*df.smokingOk.sum()/n))

## term

In [None]:
# What is it?
#   Length of the ideal lease?
#
# What do I do with it?
#   Binary it: 1 when the value is >= 0, otherwise 0 (a NaN compares False
#   and therefore gives 0).

df['has_term'] = df['term'].apply(lambda length: int(length >= 0))
print("{:.3}% of users specified a term length.".format(
    100 * df['has_term'].sum() / n))

## type

In [None]:
# What is it? 
#   type of room (most say private)
#
# What do I do with it? 
#   hold off on anything for now

## username

In [None]:
# What is it?
#   Something username-like:
#   some are random strings,
#   some are email addresses.
#
# What do I do with it?
#   Drop it.

drop_set.add('username')

## work

In [None]:
# What is it?
#   Employer.
#
# What do I do with it?
#   Binary it (NaN probably means unemployed?).

df['has_work'] = df['work'].apply(lambda employer: int(isinstance(employer, str)))
print("{:.3}% of users list a work.".format(100 * df['has_work'].sum() / n))

## engagement_proxy

In [None]:
# What is it?
#   A derived feature: the number of optional items the user answered,
#   i.e. the row-wise sum of every column whose name starts with 'has'.

has_list = [name for name in df.columns if name.startswith('has')]
df['engagement_proxy'] = df.loc[:, has_list].sum(axis='columns')

# REMOVE COLUMNS

In [None]:
# Remove every column collected in drop_set.
df = df.drop(columns=list(drop_set))
# Reset to an empty *set* (not a list) so the drop_set.add(...) calls in the
# cells above still work if they are run again.
drop_set = set()

# SAVE AS JSON

In [None]:
# Write the cleaned frame next to the raw data as JSON.
df.to_json(data_file_path+'user_df.json')
print("... saved as json")

# SAVE AS PICKLE

In [None]:
# Write the cleaned frame next to the raw data as a pickle.
df.to_pickle(data_file_path+'user_df.pkl')
print("... saved as pickle")