In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

import seaborn as sns
sns.set()

In [None]:
# directory
# NOTE(review): both paths are absolute and specific to one machine. The
# trailing "/" matters because file names are appended to data_file_path by
# plain string concatenation when loading and saving.
data_file_path = "/Users/gandalf/Documents/coding/do_not_commit/capstone/"
# NOTE(review): website_file_path is not referenced anywhere else in the
# visible part of this notebook.
website_file_path = '/Users/gandalf/Documents/coding/rczyrnik.github.io/capstone/'

In [None]:
# Load the user table and the conversation table from pickles stored under
# data_file_path.
# NOTE(review): unpickling can execute arbitrary code, so these files must
# only ever come from a trusted source.
user_df =  pd.read_pickle(data_file_path+'user_df.pkl')
convo_df =  pd.read_pickle(data_file_path+'convo_df.pkl')

# Preview the first two user rows.
user_df.head(2)

In [None]:
# Preview the first two conversation rows.
convo_df.head(2)

# MERGE USER DATA TO CONVERSATION DATA

In [None]:
# Add a flag so we can identify rows with missing user data: after the joins
# below, a conversation whose uid has no match in user_df's index ends up with
# NaN (instead of True) in the corresponding flag column.
user_df['flag'] = True

# Suffix the user columns *before* joining instead of renaming afterwards.
# Every sender column then ends in "_sender" and every receiver column in
# "_receiver", and a conversation column that happens to share a name with a
# user column keeps its own name rather than being relabelled as a sender
# column (which would also create duplicate column labels).
master_df = (
    convo_df
    .join(user_df.add_suffix('_sender'), on='first_message_uid')
    .join(user_df.add_suffix('_receiver'), on='second_message_uid')
    .rename(index=str)  # index labels are converted to strings
)

# DROP CONVERSATIONS WITH MISSING USER DATA

In [None]:
initial_len = len(master_df)

# A conversation whose sender/receiver had no match in the user table carries
# NaN (not True) in the corresponding flag column; eq(True) maps that to False
# and yields clean boolean columns.
has_receiver = master_df.flag_receiver.eq(True)
has_sender = master_df.flag_sender.eq(True)

# Store the cleaned flags, then keep only the rows where both users are known.
# Filtering once and taking an explicit copy avoids assigning a column on a
# freshly filtered frame, which can trigger pandas' SettingWithCopyWarning.
master_df = master_df.assign(flag_receiver=has_receiver, flag_sender=has_sender)
master_df = master_df[has_receiver & has_sender].copy()

# see how many conversations we lost
print("{} conversations (of {}) were removed because they were missing user data".format(initial_len-len(master_df), initial_len))
# 227+182 conversations (of 11165) were removed because they were missing user data

# ADD FEATURES

In [None]:
def roommate_rules(roommates):
    """Bucket a roommate count into an ordinal category.

    Returns 3 for more than 4 roommates, 2 for more than 1, 1 for more than 0,
    and 0 otherwise (this includes values such as NaN, for which every
    comparison is False).
    """
    # Thresholds are checked from largest to smallest; the first one exceeded wins.
    for threshold, bucket in ((4, 3), (1, 2), (0, 1)):
        if roommates > threshold:
            return bucket
    return 0

def get_rent_range(row, default=175):
    """Return the width of the rent range that both users can accept.

    Parameters
    ----------
    row : object exposing ``maxCost_sender``, ``maxCost_receiver``,
        ``minCost_sender`` and ``minCost_receiver`` (for example a DataFrame
        row passed by ``df.apply(..., axis=1)``).
    default : value returned when any of the four bounds is not a positive
        number (zero, negative or NaN). Defaults to 175.

    Returns
    -------
    ``min(max costs) - max(min costs)`` when that difference is positive,
    0 when the two ranges do not overlap, and ``default`` when a bound is
    unusable.
    """
    # NOTE(review): the reason for 175 as the fallback is not documented in
    # this notebook -- confirm with the author.
    max1 = row.maxCost_sender
    max2 = row.maxCost_receiver
    min1 = row.minCost_sender
    min2 = row.minCost_receiver

    # Test every bound on its own: checking the sign of the product of the four
    # values would also accept rows in which two bounds are negative.
    if not all(bound > 0 for bound in (max1, max2, min1, min2)):
        return default

    upper = min(max1, max2)
    lower = max(min1, min2)
    if upper - lower > 0:
        return upper - lower
    return 0

def my_distance(row):
    """Euclidean distance between the sender's and the receiver's location.

    Reads ``row.location_sender`` and ``row.location_receiver`` and uses
    elements 0 and 1 of each as the two coordinates.

    Returns the distance, or None when a location is missing or malformed
    (not subscriptable, too short, missing key, or non-numeric entries).
    """
    # NOTE(review): the distance is computed on the raw coordinate values; the
    # units and axis order of the location data are not visible here.
    receiver = row.location_receiver
    sender = row.location_sender
    try:
        delta_0 = sender[0] - receiver[0]
        delta_1 = sender[1] - receiver[1]
        return (delta_0**2 + delta_1**2)**(.5)
    except (TypeError, IndexError, KeyError):
        # Only the errors a missing/malformed location produces are treated as
        # "no distance"; anything else (e.g. KeyboardInterrupt) propagates.
        return None

def get_features(df):
    """Add pairwise sender/receiver features to ``df`` and return it.

    The columns are written onto ``df`` itself, so the caller's frame is
    modified in place; the same object is returned.

    Features added:
      * ``age_dif``           absolute difference of the two ages
      * ``same_*``            1 if sender and receiver hold the same value, else 0
      * ``overlap_roommate``  absolute difference of the roommate buckets
      * ``overlap_hobbies`` / ``overlap_amenities`` / ``overlap_neighborhoods``
                              size of the intersection of the two collections
                              (the values must provide ``.intersection``)
      * ``overlap_rent``      result of ``get_rent_range`` per row
      * ``urgency_*``         ``.days`` of (available date - first message date)
      * ``distance``          result of ``my_distance`` per row

    Prints how many columns still contain null values.
    """
    # age difference
    df['age_dif'] = (df['age_sender'] - df['age_receiver']).abs()

    # similarities: (new column, sender column, receiver column)
    same_value_features = (
        ('same_gender', 'gender_sender', 'gender_receiver'),
        ('same_relate', 'inRelationship_sender', 'inRelationship_receiver'),
        ('same_clean', 'isClean_sender', 'isClean_receiver'),
        ('same_night', 'isNight_sender', 'isNight_receiver'),
        ('same_student', 'isStudent_sender', 'isStudent_receiver'),
        ('same_smoking', 'smokingOk_sender', 'smokingOk_receiver'),
        ('same_type', 'type_sender', 'type_receiver'),
        ('same_term', 'term_sender', 'term_receiver'),
        ('same_work', 'work_sender', 'work_receiver'),
        ('same_city', 'hometownCity_sender', 'hometownCity_receiver'),
        ('same_state', 'hometownState_sender', 'hometownState_receiver'),
        ('same_country', 'hometownCountry_sender', 'hometownCountry_receiver'),
        ('same_college', 'college_sender', 'college_receiver'),
        ('same_metro', 'neighborhoods_metro_sender', 'neighborhoods_metro_receiver'),
    )
    for feature, sender_col, receiver_col in same_value_features:
        df[feature] = df[sender_col] == df[receiver_col]

    # overlaps
    sender_bucket = df['numRoommates_sender'].apply(roommate_rules)
    receiver_bucket = df['numRoommates_receiver'].apply(roommate_rules)
    df['overlap_roommate'] = (sender_bucket - receiver_bucket).abs()

    # (new column, receiver column, sender column)
    set_overlap_features = (
        ('overlap_hobbies', 'hobbies_receiver', 'hobbies_sender'),
        ('overlap_amenities', 'amenities_receiver', 'amenities_sender'),
        ('overlap_neighborhoods', 'neighborhoods_name_receiver', 'neighborhoods_name_sender'),
    )
    for feature, receiver_col, sender_col in set_overlap_features:
        df[feature] = df.apply(
            lambda row: len(row[receiver_col].intersection(row[sender_col])),
            axis=1,
        )
    df['overlap_rent'] = df.apply(get_rent_range, axis=1)

    # urgencies: whole days from the first message to each user's available date
    for feature, available_col in (('urgency_receiver', 'available_receiver'),
                                   ('urgency_sender', 'available_sender')):
        waiting_time = df[available_col] - df['first_message_date']
        df[feature] = waiting_time.apply(lambda delta: delta.days)

    # distance between sender and receiver
    df['distance'] = df.apply(my_distance, axis=1)

    # rename T/F as 1/0
    as_int = {True: 1, False: 0}
    for feature, _, _ in same_value_features:
        df[feature] = df[feature].map(as_int)

    null_columns = df.columns[df.isnull().any()]
    print("columns with null values: {}".format(len(null_columns)))
    return df

In [None]:
# Add the pairwise features. get_features writes the new columns onto the
# frame it is given and returns that same frame.
master_df = get_features(master_df)

# CREATE X AND y DATAFRAMES

In [None]:
# Columns describing the first message itself.
# NOTE(review): 'const2' is not created anywhere in the visible notebook (the
# constant column added later is named 'const') -- confirm it exists upstream.
_message_cols = [
    'first_message_len',
    'first_message_day_of_year',
    'first_message_day_of_month',
    'first_message_day_of_week',
    'first_message_hour',
    'const2',
]

# Per-user profile columns. Each one is kept twice: once with a "_sender"
# suffix and once with a "_receiver" suffix, in this order.
_profile_cols = [
    'gender',
    'inRelationship',
    'isClean',
    'isNight',
    'isStudent',
    'maxCost',
    'minCost',
    'numRoommates',
    'petsOk',
    'smokingOk',
    'term',
    'has_facebook',
    'created_day_of_year',
    'created_day_of_month',
    'created_day_of_week',
    # 'created_hour',
    'has_password',
    'yes_room',
    'len_about',
    'has_about',
    'len_amenities',
    'has_amenities',
    'has_available',
    'td_creat_avail',
    'age',
    'has_birthdate',
    'yes_block',
    'has_college',
    'has_email',
    'len_hobbies',
    'has_hobbies',
    'has_hometown',
    'has_linkedin',
    'has_location',
    'rent_range',
    'len_neighborhoods',
    'has_neighborhoods',
    'has_numRoommates',
    'has_picture',
    'has_term',
    'has_work',
    'engagement_proxy',
]

# Pairwise features comparing sender and receiver.
_pair_cols = [
    'age_dif',
    'same_gender',
    'same_relate',
    'same_clean',
    'same_night',
    'same_student',
    'same_smoking',
    'same_type',
    'same_term',
    'same_work',
    'same_city',
    'same_state',
    'same_country',
    'same_college',
    'same_metro',
    'overlap_roommate',
    'overlap_hobbies',
    'overlap_amenities',
    'overlap_rent',
    'distance',
    'urgency_receiver',
    'urgency_sender',
]

# Final order: message columns, all sender columns, all receiver columns,
# then the pairwise features.
col_to_keep = (
    _message_cols
    + [name + '_sender' for name in _profile_cols]
    + [name + '_receiver' for name in _profile_cols]
    + _pair_cols
)

# ADD CONSTANT FOR FUNSIES

In [None]:
# Constant column (every row gets 1).
# NOTE(review): this creates 'const', whereas col_to_keep lists 'const2' --
# confirm which name is intended.
master_df['const'] = 1

# BREAK INTO X AND Y

In [None]:
# Feature matrix: only the columns listed in col_to_keep, in that order.
X = master_df.loc[:, col_to_keep]

# Targets, taken from the 'response' and 'len_convo' columns.
y1 = master_df['response']
y2 = master_df['len_convo']

# FILL NA VALUES

In [None]:
# Columns of X that contain at least one missing value, in column order.
# This is computed from isnull() rather than from describe() counts because
# describe() only summarises the numeric columns of a mixed-dtype frame and
# its set of statistics depends on the dtypes present.
col_with_na = X.columns[X.isnull().any()]

In [None]:
# Summary statistics of the columns that contain missing values.
X[col_with_na].describe()

In [None]:
# Column by column, replace missing values with that column's median.
X = X.apply(lambda x: x.fillna(x.median()),axis=0)

In [None]:
# Summary statistics of the columns recorded in col_with_na.
X[col_with_na].describe()

# PICKLE!

In [None]:
# Write the feature matrix, both targets and the full merged frame as pickles
# under data_file_path.
for _frame, _file_name in ((X, 'X.pkl'),
                           (y1, 'y1.pkl'),
                           (y2, 'y2.pkl'),
                           (master_df, 'master_df.pkl')):
    _frame.to_pickle(data_file_path + _file_name)
print("... saved as pickle")