In [None]:
import pandas as pd
import numpy as np
from numpy.random import choice
import my_pickle as mp
import matplotlib.pyplot as plt

from importlib import reload

In [None]:
# Re-import my_pickle so edits made to the module on disk are picked up
# without restarting the kernel.
reload(mp)

# LOAD DATA FRAMES

In [None]:
# Load user data, then hand the date-like columns to mp.reinstate_date.
user_df = mp.reinstate_date(
    mp.unjson_it('data_user'),
    ['activeAt', 'available', 'birthday', 'created', 'updated'],
)
user_df.head(2)

In [None]:
# Load conversation data
convo_df = mp.unjson_it('data_convo')

# Sanity check: show the newest conversation timestamp before re-indexing.
print("Make sure we're not using ECT data")
print(convo_df['timestamp'].max())

# Key the frame by conversation id and list the remaining columns.
convo_df = convo_df.set_index('conversation_id')
convo_df.columns

In [None]:
# Import Message DF
message_df = mp.unjson_it('data_message')

print("Make sure we're not using ECT data")
# Check the frame that was just loaded. This cell previously printed
# convo_df's newest timestamp, which says nothing about the message data.
# NOTE(review): assumes message_df carries its own `timestamp` column; when it
# does not, fall back to the conversation timestamp the cell used to print.
if 'timestamp' in message_df.columns:
    print(message_df['timestamp'].max())
else:
    print(convo_df.timestamp.max())
message_df.head(2)

# GET METRO DATA

In [None]:
# Make Metro Dictionary
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
filename = "/Users/gandalf/Documents/data/raw_data_neighborhoods.json"
metro_df = pd.read_json(filename)
metro_df = metro_df.drop(['_created_at','_updated_at','city','name'], axis=1)

# {neighborhood _id: {remaining column: value}}. get_city() looks neighborhoods
# up in a module-level dict named `hood_dict`, so the mapping has to be bound
# under that name; otherwise every lookup fails and every user gets metro=None.
hood_dict = metro_df.set_index('_id').to_dict('index')

# Keep the original name pointing at the same dict for any later cell that
# still refers to it.
metro_df = hood_dict

# ASSIGN METRO AREAS TO USERS

In [None]:
def get_city(lst):
    """Return the metro area of the first neighborhood id in ``lst``.

    The first element of ``lst`` is looked up in the module-level ``hood_dict``
    and that record's ``'metro'`` value is returned.

    Returns ``None`` when ``lst`` is empty, when it has no length (for example
    ``None`` or a NaN float standing in for a missing value), or when the
    lookup fails.
    """
    try:
        if len(lst) == 0:
            return None
    except TypeError:
        # Missing values (None / NaN) have no len(); treat them as "no neighborhoods"
        # instead of letting the whole .apply() crash.
        return None

    try:
        return hood_dict[lst[0]]['metro']
    except Exception:
        # Best-effort lookup: unknown ids or records without a 'metro' key map to None.
        # `Exception` instead of a bare except so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return None
    
# cities_list = ['Chicago', 'Phoenix', 'San Francisco Bay Area', None, 'Los Angeles',
#        'Fort Lauderdale', 'San Diego', 'Washington',
#        'Minneapolis / St Paul', 'Detroit Metro', 'Dallas / Fort Worth',
#        'Hawaii', 'South Florida', 'Atlanta', 'Boston', 'Tampa Bay Area',
#        'New York City', 'Portland', 'Sacramento', 'Seattle-Tacoma',
#        'Ft Myers / Sw Florida', 'Philadelphia', 'Orange County CA',
#        'Oxnard-Thousand Oaks-Ventura', 'Not Listed']  
# city_dict = {}
# for value,key in enumerate(cities_list):
#     city_dict[key] = value
    
# Tag each user with the value get_city returns for their neighborhoods list.
user_df['metro'] = user_df['neighborhoods'].apply(get_city)

In [None]:
# Inspect the metro column that was just assigned.
user_df['metro']

# GET USER DATA FOR SENDER AND RECEIVER

In [None]:
# Mark every row of user_df; after the joins below a missing flag means that
# no user row matched that side of the conversation.
user_df['flag'] = True

# Rename map that turns each bare user column into "<column>_sender".
old_names = user_df.columns
new_names = {name: name + '_sender' for name in old_names}

master_df = (
    convo_df
    # First join: sender attributes arrive under their bare column names.
    .join(user_df, on='uid_sender')
    # Second join: the clashing receiver copies get the "_receiver" suffix.
    .join(user_df, on='uid_receiver', rsuffix="_receiver")
    # Finally give the bare (sender) columns their suffix and stringify the index.
    .rename(index=str, columns=new_names)
)
master_df.head()

# DROP CONVERSATIONS WHOSE RECEIVER IS NOT IN USER_DF

In [None]:
# Investigate
# Collapse anything that is not True (e.g. the NaN left by an unmatched join)
# to False so the column can be used as a boolean mask.
master_df['flag_receiver'] = master_df['flag_receiver'].apply(
    lambda flag: flag if flag == True else False
)

# Total rows, then rows whose receiver matched.
print(len(master_df))
print(master_df['flag_receiver'].sum())

# Rows whose receiver did not match.
len(master_df.loc[master_df['flag_receiver'].eq(False)])

In [None]:
# Keep only the rows whose receiver flag is set.
# .copy() makes the result an independent frame: later cells add columns to it,
# and doing that on a boolean-filtered selection triggers pandas'
# SettingWithCopyWarning.
master_df = master_df[master_df.flag_receiver].copy()
len(master_df)

# ADD FEATURES

In [None]:
def roommate_rules(roommates):
    """Bucket a roommate count into a small ordinal code.

    Greater than 4 -> 3, greater than 1 -> 2, greater than 0 -> 1,
    anything else (including NaN, which fails every comparison) -> 0.
    """
    # Thresholds are checked from the highest down; the first one exceeded wins.
    for threshold, bucket in ((4, 3), (1, 2), (0, 1)):
        if roommates > threshold:
            return bucket
    return 0

def get_rent_range(row):
    """Width of the rent range shared by sender and receiver.

    Reads ``maxCost_*`` / ``minCost_*`` for both sides of ``row``. When the
    product of the four values is not positive (a zero, a NaN, ...) the
    fallback value 175 is returned. Otherwise the result is the overlap of the
    two [min, max] intervals, floored at 0 when they do not intersect.
    """
    sender_max, receiver_max = row.maxCost_sender, row.maxCost_receiver
    sender_min, receiver_min = row.minCost_sender, row.minCost_receiver

    # Guard clause: any unusable bound sends us to the fallback width.
    if not (sender_max * receiver_max * sender_min * receiver_min > 0):
        return 175

    shared_top = min(sender_max, receiver_max)
    shared_bottom = max(sender_min, receiver_min)
    return max(0, shared_top - shared_bottom)
    
def get_urgency_receiver(row):
    """Return ``row.available_receiver - row.timestamp``, or ``None`` if that fails."""
    try:
        gap = row.available_receiver - row.timestamp
    except:
        # Deliberately catches everything (missing attribute, bad operand types, ...).
        gap = None
    return gap
    
def get_urgency_sender(row):
    """Return ``row.available_sender - row.timestamp``, or ``None`` if that fails."""
    try:
        gap = row.available_sender - row.timestamp
    except:
        # Deliberately catches everything (missing attribute, bad operand types, ...).
        gap = None
    return gap
    
def my_distance(row):
    """Euclidean distance between the first two components of the two locations.

    Uses ``row.location_sender`` and ``row.location_receiver``; returns ``None``
    when either cannot be indexed or subtracted (e.g. a missing value).
    """
    receiver_loc = row.location_receiver
    sender_loc = row.location_sender
    try:
        first_delta, second_delta = (
            sender_loc[axis] - receiver_loc[axis] for axis in (0, 1)
        )
        squared_length = first_delta ** 2 + second_delta ** 2
        return squared_length ** (.5)
    except:
        # Any failure (NaN location, wrong shape, ...) means "distance unknown".
        return None

def _as_set(value):
    """Return ``set(value)`` when ``value`` is a list, otherwise an empty set."""
    return set(value) if isinstance(value, list) else set()


def _add_overlap(df, field):
    """Turn ``<field>_receiver`` / ``<field>_sender`` into sets and add ``<field>_overlap``.

    ``<field>_overlap`` holds the number of elements the two sets share.
    """
    receiver_col = field + '_receiver'
    sender_col = field + '_sender'
    df[receiver_col] = df[receiver_col].apply(_as_set)
    df[sender_col] = df[sender_col].apply(_as_set)
    # Pair the two columns directly instead of a row-wise DataFrame.apply:
    # same counts, without materialising a Series for every row.
    df[field + '_overlap'] = [
        len(receiver & sender)
        for receiver, sender in zip(df[receiver_col], df[sender_col])
    ]


def feature_time(df):
    """Add sender/receiver comparison features to ``df`` in place and return it.

    Columns written:
      * ``age_dif``: absolute age difference.
      * ``same_*``: equality of the matching ``*_sender`` / ``*_receiver`` columns.
      * ``roommate_similarity``: absolute difference of the ``roommate_rules`` buckets.
      * ``hobbies`` / ``amenities`` / ``neighborhoods``: both sides converted to
        sets, plus ``*_overlap`` with the size of their intersection.
      * ``rent_overlap`` (``get_rent_range``), ``distance`` (``my_distance``).
      * ``urgency_receiver`` / ``urgency_sender``: ``available_*`` minus ``timestamp``.

    Finally the columns listed in ``col_to_TF`` are recoded through the ``TF``
    lookup (values absent from the lookup become NaN), and the number of
    columns that still contain nulls is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Joined conversation/user frame; it is mutated.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df['age_dif'] = (df['age_sender'] - df['age_receiver']).abs()

    # (new column, shared field stem). Order matters: it fixes the column order
    # of the resulting frame.
    same_pairs = [
        ('same_gender', 'gender'),
        ('same_relate', 'inRelationship'),
        ('same_clean', 'isClean'),
        ('same_night', 'isNight'),
        ('same_student', 'isStudent'),
        ('same_smoking', 'smokingOk'),
        ('same_type', 'type'),
        ('same_term', 'term'),
        ('same_work', 'work'),
        ('same_city', 'hometownCity'),
        ('same_state', 'hometownState'),
        ('same_country', 'hometownCountry'),
        ('same_college', 'college'),
        ('same_metro', 'metro'),
    ]
    for new_col, field in same_pairs:
        df[new_col] = df[field + '_sender'] == df[field + '_receiver']

    df['roommate_similarity'] = (
        df['numRoommates_sender'].apply(roommate_rules)
        - df['numRoommates_receiver'].apply(roommate_rules)
    ).abs()

    for field in ('hobbies', 'amenities', 'neighborhoods'):
        _add_overlap(df, field)

    df['rent_overlap'] = df.apply(get_rent_range, axis=1)

    df['urgency_receiver'] = df['available_receiver'] - df['timestamp']
    df['urgency_sender'] = df['available_sender'] - df['timestamp']

    df['distance'] = df.apply(my_distance, axis=1)

    # Recode booleans and the known categorical strings as integers.
    TF = {True: 1, False: 0, 'male':1, 'female':0, 'shared':0,'private':1,'nan':-1}

    col_to_TF = ['response',
                 'has_about_sender', 'has_about_receiver', 'gender_sender','gender_receiver',
                 'facebookId_sender', 'linkedinId_sender', 'picture_sender', 'has_room_sender',
                 'type_sender','type_receiver',
                 'facebookId_receiver', 'linkedinId_receiver', 'picture_receiver', 'has_room_receiver',
                 'same_work','same_city','same_state','same_country','same_metro','same_college',
                 'same_gender','same_relate','same_clean','same_night','same_student','same_smoking','same_term','same_type']
    for col in col_to_TF:
        df[col] = df[col].map(TF)

    print("columns with null values: {}".format(len(df.columns[df.isnull().any()])))
    return df
  
# feature_time adds its columns to the frame it is given and returns that same
# frame, so this rebinds master_df to the (now feature-enriched) object.
master_df = feature_time(master_df)
master_df.head()

# FILL NA 

In [None]:
# Fill missing values with the per-column mean.
# numeric_only=True restricts the mean to numeric/boolean columns: master_df
# also holds object columns (sets, strings) that have no mean, and without the
# flag pandas 2.x raises a TypeError instead of skipping them. Columns without
# a computed mean are left untouched by fillna.
no_more_n_a = master_df.fillna(master_df.mean(axis=0, numeric_only=True))
no_more_n_a.head()

# PICKLE

In [None]:
# Persist the feature frame and its NaN-filled counterpart.
# The first call used to pass `df`, a name that only exists as feature_time's
# parameter; at notebook level it raises NameError on a fresh kernel. The
# frame returned by feature_time is master_df.
# NOTE(review): master_df is the presumed intent for 'data_features'; confirm.
mp.json_it(master_df,'data_features')
mp.json_it(no_more_n_a,'data_master_full')