In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set()

In [2]:
# Directory configuration.
# Both values are absolute, machine-specific paths ending in "/", because the
# cells below build file names by plain string concatenation
# (e.g. data_file_path+'user_df.pkl').
# NOTE(review): hardcoded home-directory paths make the notebook non-portable;
# consider a configurable base directory.
data_file_path = "/Users/gandalf/Documents/coding/do_not_commit/capstone/"
# NOTE(review): website_file_path is not referenced by any cell visible in this
# part of the notebook — confirm whether it is still needed.
website_file_path = '/Users/gandalf/Documents/coding/rczyrnik.github.io/capstone/'

In [3]:
# Load the user table and the conversation table from pickles stored under
# data_file_path.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
user_df =  pd.read_pickle(data_file_path+'user_df.pkl')
convo_df =  pd.read_pickle(data_file_path+'convo_df.pkl')

# Preview the first two user rows.
user_df.head(2)

Unnamed: 0_level_0,about,amenities,available,college,gender,hobbies,hometown,hometownCity,hometownCountry,hometownCounty,...,len_neighborhoods,has_neighborhoods,neighborhoods_city,neighborhoods_metro,neighborhoods_name,has_numRoommates,has_picture,has_term,has_work,engagement_proxy
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00yWCOsIvK,,{Laundry},,Academy of Art University,1,{Book Worm},,,,,...,0,0,{},{},{},1,0,1,1,9
01Kc0ZYNae,,"{A/C, Laundry, Dishwasher}",,University of Arizona,1,"{Parties, Movies, Foodie, Gaming}",,,,,...,0,0,{},{},{},1,1,1,1,10


In [4]:
# Preview the first two conversation rows.
convo_df.head(2)

Unnamed: 0,first_message_uid,second_message_uid,len_convo,first_message_read,first_message_timestamp,first_message_text,first_message_len_char,first_message_len_word,first_message_date,first_message_year,first_message_day_of_year,first_message_month,first_message_day_of_month,first_message_week,first_message_day_of_week,first_message_day,first_message_hour,const2,response
02sjTzcsaO02omVxM8Xm,02omVxM8Xm,02sjTzcsaO,1,False,2017-09-11 02:41:19.188,Wya?,4,1,2017-09-11,2017,254,9,11,37,1,736583,2,1,0
07lVtClAEQ01iDVeaXl1,01iDVeaXl1,07lVtClAEQ,1,False,2017-09-06 20:03:30.419,Hey I'm looking for some roommates too have yo...,62,11,2017-09-06,2017,249,9,6,36,3,736578,20,1,0


# MERGE USER DATA TO CONVERSATION DATA

In [5]:
# Add a flag column so rows with missing user data can be identified after the
# left joins below: a conversation whose uid has no match in user_df ends up
# with NaN in the corresponding flag_sender / flag_receiver column.
user_df['flag'] = True

# Suffix every user column *before* joining so each copy is unambiguous.
# Relying on join(..., rsuffix=...) is fragile here: pandas applies a suffix
# only to column names that overlap, so the sender copy would stay un-suffixed
# and need a rename afterwards, and that rename would also hit any convo_df
# column that happens to share a name with a user column.
sender_df = user_df.add_suffix('_sender')
receiver_df = user_df.add_suffix('_receiver')

# Left-join the sender and receiver profiles onto each conversation (the uid
# columns are matched against user_df's index), then cast index labels to str.
master_df = (
    convo_df
    .join(sender_df, on='first_message_uid')
    .join(receiver_df, on='second_message_uid')
    .rename(index=str)
)

# DROP CONVERSATIONS WITH MISSING USER DATA

In [6]:
initial_len = len(master_df)

# A conversation is kept only when both joins found a user row, i.e. both flag
# columns hold True. Unmatched rows hold NaN there, which compares unequal to
# True and therefore yields False in these masks.
has_receiver = master_df['flag_receiver'].eq(True)
has_sender = master_df['flag_sender'].eq(True)

# Filter once and take an explicit copy, so the column assignments below (and
# the feature columns added to master_df later) write to an independent frame
# rather than to a slice of the unfiltered one (SettingWithCopyWarning).
master_df = master_df[has_receiver & has_sender].copy()

# Every surviving row has both flags set; store them as plain booleans.
master_df['flag_receiver'] = True
master_df['flag_sender'] = True

# see how many conversations we lost
print("{} conversations (of {}) were removed because they were missing user data".format(initial_len-len(master_df), initial_len))
# NOTE(review): a figure previously hand-written here (227+182 of 11165) does
# not match the recorded cell output; the printed line above is authoritative.

361 conversations (of 10594) were removed because they were missing user data


# ADD FEATURES

In [7]:
def roommate_rules(roommates):
    """Bucket a roommate count into an ordinal code from 0 to 3.

    Returns 3 for counts above 4, 2 for counts above 1, 1 for counts above 0,
    and 0 otherwise. Any value for which every ``>`` comparison is false
    (zero, negatives, NaN) falls through to 0.
    """
    # (exclusive lower bound, code), checked from the largest bound down
    for lower_bound, code in ((4, 3), (1, 2), (0, 1)):
        if roommates > lower_bound:
            return code
    return 0

def get_rent_range(row):
    """Width of the rent band that both the sender and the receiver accept.

    Reads ``maxCost_sender``, ``maxCost_receiver``, ``minCost_sender`` and
    ``minCost_receiver`` from ``row``.

    Returns:
        * 175 when the product of the four values is not greater than zero
          (for example when any of them is 0 or NaN);
        * otherwise the smaller maximum minus the larger minimum, or 0 when
          that difference is not positive (the two ranges do not overlap).
    """
    sender_max, receiver_max = row.maxCost_sender, row.maxCost_receiver
    sender_min, receiver_min = row.minCost_sender, row.minCost_receiver

    # Validity test on the product of all four bounds.
    # NOTE(review): 175 is an unexplained fallback value — confirm its origin.
    usable = sender_max*receiver_max*sender_min*receiver_min > 0
    if not usable:
        return 175

    width = min(sender_max, receiver_max) - max(sender_min, receiver_min)
    return width if width > 0 else 0

def my_distance(row):
    """Straight-line distance between the sender's and receiver's locations.

    Reads ``row.location_sender`` and ``row.location_receiver`` and uses
    elements ``[0]`` and ``[1]`` of each. The result is in the same units as
    those coordinates.
    NOTE(review): if the coordinates are latitude/longitude degrees this is
    not a geographic distance — confirm what ``location`` holds.

    Returns:
        The Euclidean distance, or ``None`` when either location cannot be
        indexed or subtracted (e.g. a missing value such as NaN/None, or a
        sequence with fewer than two elements).
    """
    receiver_loc = row.location_receiver
    sender_loc = row.location_sender
    try:
        delta_0 = sender_loc[0] - receiver_loc[0]
        delta_1 = sender_loc[1] - receiver_loc[1]
        return (delta_0**2 + delta_1**2)**(.5)
    except (TypeError, LookupError):
        # Only "location is missing or malformed" failures mean "no distance".
        # A bare except here would also swallow KeyboardInterrupt/SystemExit
        # and hide unrelated bugs.
        return None

def get_features(df):
    """Add sender/receiver comparison features to ``df`` and return it.

    ``df`` is modified in place; the returned object is the same frame. It
    must already hold the ``*_sender`` / ``*_receiver`` profile columns and
    ``first_message_date``.

    Columns added:
        * ``age_dif`` — absolute difference of the two ages;
        * ``same_*`` — 1/0 indicators of equal sender and receiver values;
        * ``overlap_roommate`` — absolute difference of the two
          ``roommate_rules`` codes;
        * ``overlap_hobbies`` / ``overlap_amenities`` /
          ``overlap_neighborhoods`` — size of the intersection of the two
          collections (uses ``.intersection``, so the cells are presumably
          sets — confirm);
        * ``overlap_rent`` — result of ``get_rent_range`` per row;
        * ``urgency_receiver`` / ``urgency_sender`` — ``.days`` of
          ``available_*`` minus ``first_message_date``;
        * ``distance`` — result of ``my_distance`` per row.

    Side effect: prints how many columns of ``df`` contain null values.
    """
    # age difference
    df['age_dif'] = (df['age_sender'] - df['age_receiver']).abs()

    # similarities: (indicator column, profile field compared across the pair)
    same_fields = [
        ('same_gender', 'gender'),
        ('same_relate', 'inRelationship'),
        ('same_clean', 'isClean'),
        ('same_night', 'isNight'),
        ('same_student', 'isStudent'),
        ('same_smoking', 'smokingOk'),
        ('same_type', 'type'),
        ('same_term', 'term'),
        ('same_work', 'work'),
        ('same_city', 'hometownCity'),
        ('same_state', 'hometownState'),
        ('same_country', 'hometownCountry'),
        ('same_college', 'college'),
        ('same_metro', 'neighborhoods_metro'),
    ]
    for same_col, field in same_fields:
        df[same_col] = df[field + '_sender'] == df[field + '_receiver']

    # overlaps
    sender_code = df['numRoommates_sender'].apply(roommate_rules)
    receiver_code = df['numRoommates_receiver'].apply(roommate_rules)
    df['overlap_roommate'] = (sender_code - receiver_code).abs()

    shared_fields = [
        ('overlap_hobbies', 'hobbies'),
        ('overlap_amenities', 'amenities'),
        ('overlap_neighborhoods', 'neighborhoods_name'),
    ]
    for overlap_col, field in shared_fields:
        receiver_key = field + '_receiver'
        sender_key = field + '_sender'
        # keys are bound as defaults so each lambda uses this iteration's pair
        df[overlap_col] = df.apply(
            lambda row, r=receiver_key, s=sender_key: len(row[r].intersection(row[s])),
            axis=1,
        )
    df['overlap_rent'] = df.apply(get_rent_range, axis=1)

    # urgencies: whole days between the first message and each availability date
    for role in ('receiver', 'sender'):
        urgency_col = 'urgency_' + role
        df[urgency_col] = df['available_' + role] - df['first_message_date']
        df[urgency_col] = df[urgency_col].apply(lambda gap: gap.days)

    # distance between sender and receiver
    df['distance'] = df.apply(my_distance, axis=1)

    # rename T/F as 1/0
    binary = {True: 1, False: 0}
    for same_col, _ in same_fields:
        df[same_col] = df[same_col].map(binary)

    null_columns = df.columns[df.isnull().any()]
    print("columns with null values: {}".format(len(null_columns)))
    return df

In [8]:
# Build the pairwise features. get_features adds the new columns to the frame
# it is given and returns that same frame, so master_df is extended in place.
master_df = get_features(master_df)

columns with null values: 37


# CREATE X AND y DATAFRAMES

In [18]:
# Columns kept for the model matrix, in this order:
#   1. first-message columns,
#   2. per-user profile columns for the sender,
#   3. the same profile columns for the receiver,
#   4. pairwise features built by get_features.
_message_cols = [
    'first_message_len_word',
    'first_message_day_of_year',
    'first_message_day_of_month',
    'first_message_day_of_week',
    'first_message_hour',
    'const2',
]

# Profile fields used for both users; '_sender' / '_receiver' is appended below.
_user_fields = [
    'gender',
    'inRelationship',
    'isClean',
    'isNight',
    'isStudent',
    'maxCost',
    'minCost',
    'numRoommates',
    'petsOk',
    'smokingOk',
    'term',
    'has_facebook',
    'created_day_of_year',
    'created_day_of_month',
    'created_day_of_week',
    # 'created_hour',  (left out for both sender and receiver)
    'has_password',
    'yes_room',
    'len_about',
    'has_about',
    'len_amenities',
    'has_amenities',
    'has_available',
    'td_creat_avail',
    'age',
    'has_birthdate',
    'yes_block',
    'has_college',
    'has_email',
    'len_hobbies',
    'has_hobbies',
    'has_hometown',
    'has_linkedin',
    'has_location',
    'rent_range',
    'len_neighborhoods',
    'has_neighborhoods',
    'has_numRoommates',
    'has_picture',
    'has_term',
    'has_work',
    'engagement_proxy',
]

_pair_cols = [
    'age_dif',
    'same_gender',
    'same_relate',
    'same_clean',
    'same_night',
    'same_student',
    'same_smoking',
    'same_type',
    'same_term',
    'same_work',
    'same_city',
    'same_state',
    'same_country',
    'same_college',
    'same_metro',
    'overlap_roommate',
    'overlap_hobbies',
    'overlap_amenities',
    'overlap_rent',
    'distance',
    'urgency_receiver',
    'urgency_sender',
]

col_to_keep = (
    _message_cols
    + [field + '_sender' for field in _user_fields]
    + [field + '_receiver' for field in _user_fields]
    + _pair_cols
)

# ADD CONSTANT FOR FUNSIES

In [19]:
# Add an all-ones column named 'const' to master_df.
# NOTE(review): col_to_keep lists 'const2', not 'const', so this column does not
# reach X; it only ends up in the pickled master_df — confirm it is still wanted.
master_df['const'] = 1

# BREAK INTO X AND Y

In [20]:
# list(master_df.columns)

In [21]:
# Feature matrix: the model columns only, in col_to_keep order.
X = master_df[col_to_keep]
# Targets: y1 is the 'response' column, y2 is the 'len_convo' column.
y1, y2 = master_df['response'], master_df['len_convo']

# FILL NA VALUES

In [22]:
# Columns of X that contain at least one missing value, in X's column order.
# Asking pandas directly avoids comparing describe()'s 'count' row to len(X)
# through hand-assigned column labels, which only works while describe()
# returns exactly eight statistics.
# NOTE(review): unlike describe(), this also reports non-numeric columns with
# missing values, should X ever contain any.
col_with_na = X.columns[X.isnull().any()]

In [23]:
# Summary statistics of the columns with missing values, before they are filled.
X[col_with_na].describe()

Unnamed: 0,isStudent_sender,maxCost_sender,minCost_sender,numRoommates_sender,term_sender,td_creat_avail_sender,rent_range_sender,isStudent_receiver,maxCost_receiver,minCost_receiver,numRoommates_receiver,term_receiver,td_creat_avail_receiver,rent_range_receiver,distance,urgency_receiver,urgency_sender
count,9672.0,8406.0,8406.0,8406.0,8406.0,3982.0,8406.0,8339.0,7431.0,7431.0,7431.0,7431.0,2327.0,7431.0,8416.0,2327.0,3982.0
mean,0.528122,1195.015465,623.108494,2.789555,11.38068,-71.490959,571.906971,0.510973,1229.659534,661.088683,2.690755,11.207644,49.706059,568.570852,9.518807,31.336914,-80.315419
std,0.499234,665.788985,446.456559,2.282039,4.926755,1196.889876,513.951499,0.49991,537.240724,353.417519,2.54434,4.207162,296.09395,421.673463,30.562198,292.695182,1195.528476
min,0.0,100.0,50.0,-1.0,1.0,-16010.0,0.0,0.0,50.0,50.0,0.0,1.0,-8184.0,0.0,0.0,-8197.0,-16014.0
25%,0.0,800.0,350.0,1.0,10.0,11.0,300.0,0.0,850.0,400.0,1.0,12.0,17.0,300.0,0.07661,5.0,7.0
50%,1.0,1100.0,600.0,2.0,12.0,29.0,500.0,1.0,1100.0,650.0,2.0,12.0,32.0,500.0,0.263554,21.0,25.0
75%,1.0,1400.0,800.0,4.0,12.0,60.0,700.0,1.0,1500.0,800.0,4.0,12.0,71.0,700.0,2.547652,56.0,49.0
max,1.0,5000.0,5000.0,30.0,24.0,3681.0,4700.0,1.0,5000.0,5000.0,27.0,24.0,1492.0,4700.0,282.228958,1473.0,3681.0


In [24]:
# Replace each column's missing values with that column's own median.
# NOTE(review): the medians are computed over every row, i.e. before the
# train/test split performed in a later cell — confirm that is intended.
X = X.fillna(X.median())

In [25]:
# Same summary after the median fill; every 'count' should now equal len(X).
X[col_with_na].describe()

Unnamed: 0,isStudent_sender,maxCost_sender,minCost_sender,numRoommates_sender,term_sender,td_creat_avail_sender,rent_range_sender,isStudent_receiver,maxCost_receiver,minCost_receiver,numRoommates_receiver,term_receiver,td_creat_avail_receiver,rent_range_receiver,distance,urgency_receiver,urgency_sender
count,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0,10233.0
mean,0.553992,1178.051402,618.982703,2.648588,11.491254,-10.104368,559.068699,0.601485,1194.156161,658.05238,2.501612,11.424607,36.026385,549.794782,7.875418,23.35063,-15.981726
std,0.497101,604.523875,404.735832,2.090279,4.471584,748.175216,466.625429,0.489616,461.444529,301.204009,2.189923,3.602488,141.368768,360.626216,27.940831,139.620567,747.485767
min,0.0,100.0,50.0,-1.0,1.0,-16010.0,0.0,0.0,50.0,50.0,0.0,1.0,-8184.0,0.0,0.0,-8197.0,-16014.0
25%,0.0,900.0,400.0,1.0,12.0,29.0,300.0,0.0,1000.0,500.0,1.0,12.0,32.0,350.0,0.100542,21.0,25.0
50%,1.0,1100.0,600.0,2.0,12.0,29.0,500.0,1.0,1100.0,650.0,2.0,12.0,32.0,500.0,0.263554,21.0,25.0
75%,1.0,1300.0,800.0,3.0,12.0,29.0,650.0,1.0,1300.0,800.0,3.0,12.0,32.0,600.0,0.772706,21.0,25.0
max,1.0,5000.0,5000.0,30.0,24.0,3681.0,4700.0,1.0,5000.0,5000.0,27.0,24.0,1492.0,4700.0,282.228958,1473.0,3681.0


# PICKLE!

In [26]:
# Persist the feature matrix, both targets and the full merged frame under
# data_file_path, one pickle per object.
for to_save, filename in ((X, 'X.pkl'), (y1, 'y1.pkl'), (y2, 'y2.pkl'), (master_df, 'master_df.pkl')):
    to_save.to_pickle(data_file_path+filename)
print("... saved as pickle")

... saved as pickle


In [33]:
# train test split
# .values replaces .as_matrix(), which is deprecated and no longer exists in
# pandas >= 1.0; both give the underlying NumPy array.
feature_values = X.values

# Both splits use the same features and the same random_state, presumably so
# the classification (y1) and regression (y2) sets contain the same rows —
# confirm.
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(feature_values, y1.values, random_state=17)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(feature_values, y2.values, random_state=17)

# NOTE(review): this rebinds data_file_path from the absolute directory set in
# the configuration cell to a relative 'data/' directory, so re-running an
# earlier cell after this one reads/writes a different location. The directory
# must already exist; np.save does not create it.
data_file_path = 'data/'

# np.save appends '.npy' to each of these names.
for filename, array in (
    ('X_train_class', X_train_class),
    ('X_test_class', X_test_class),
    ('y_train_class', y_train_class),
    ('y_test_class', y_test_class),
    ('X_train_reg', X_train_reg),
    ('X_test_reg', X_test_reg),
    ('y_train_reg', y_train_reg),
    ('y_test_reg', y_test_reg),
):
    np.save(data_file_path+filename, array)