In [1]:
import pandas as pd
import numpy as np
from numpy.random import choice
import my_pickle as mp
import matplotlib.pyplot as plt

from importlib import reload

In [None]:
# Re-import my_pickle so that edits made to the module on disk take effect
# in this kernel without a restart.
reload(mp)

# LOAD DATA FRAMES

In [2]:
# Read the user table and hand it straight to reinstate_date together with
# the names of its date columns (presumably converted back to datetimes --
# confirm in my_pickle).
user_df = mp.reinstate_date(
    mp.unjson_it('data_user'),
    ['activeAt', 'available', 'birthday', 'created', 'updated'],
)
user_df.columns

Index(['I_count', 'I_ratio', 'about', 'activeAt', 'age', 'amenities',
       'available', 'birthday', 'college', 'created', 'exclaim_count',
       'exclaim_ratio', 'facebookId', 'gender', 'has_about', 'has_facebookId',
       'has_linkedinId', 'has_picture', 'has_room', 'hobbies', 'hometownCity',
       'hometownCountry', 'hometownState', 'inRelationship', 'isClean',
       'isNight', 'isStudent', 'len_about', 'linkedinId', 'location',
       'maxCost', 'metro', 'minCost', 'neighborhoods', 'numRoommates',
       'onboarded', 'period_count', 'period_ratio', 'petsOk', 'picture',
       'question_count', 'question_ratio', 'sentence_count', 'sentence_ratio',
       'smokingOk', 'term', 'type', 'updated', 'work'],
      dtype='object')

In [3]:
# Read the conversation table.
convo_df = mp.unjson_it('data_convo')

# Sanity check: show the most recent conversation timestamp in the file.
print("Make sure we're not using ECT data")
print(convo_df['timestamp'].max())

# Key the table by conversation id, then list the remaining columns.
convo_df = convo_df.set_index('conversation_id')
convo_df.columns

Make sure we're not using ECT data
2017-10-02 01:35:58.644000


Index(['convo_length', 'len_receiver', 'len_sender', 'mid_receiver',
       'mid_sender', 'response', 'timestamp', 'uid_receiver', 'uid_sender'],
      dtype='object')

In [4]:
# Read the message table.
message_df = mp.unjson_it('data_message')

# Sanity check on the table that was just loaded: show the most recent
# message timestamp (this must inspect message_df, not convo_df).
print("Make sure we're not using ECT data")
print(message_df.timestamp.max())
message_df.columns

Make sure we're not using ECT data
2017-10-02 01:35:58.644000


Index(['const', 'conversation_id', 'date', 'emailAttempted', 'emailed', 'flag',
       'imageURL', 'message_id', 'read', 'readBy', 's_uid', 'text_length',
       'timestamp', 'uid'],
      dtype='object')

# MERGE USER DATA TO CONVERSATION DATA

In [5]:
# Marker column: True on every user row, so after a join it is only present
# (non-null) on conversations whose user id was found in user_df.
user_df['flag'] = True

# Every user column will eventually carry a '_sender' or '_receiver' suffix.
old_names = user_df.columns
new_names = {name: '{}_sender'.format(name) for name in old_names}

# Step 1: attach the sender's profile; columns keep their plain names for now.
master_df = convo_df.join(user_df, on='uid_sender')
# Step 2: attach the receiver's profile; the clashing right-hand columns are
# the ones that get the '_receiver' suffix.
master_df = master_df.join(user_df, on='uid_receiver', rsuffix="_receiver")
# Step 3: the still-unsuffixed columns belong to the sender, so rename them;
# index=str converts the index labels to strings.
master_df = master_df.rename(index=str, columns=new_names)
master_df.head(2)

Unnamed: 0_level_0,convo_length,len_receiver,len_sender,mid_receiver,mid_sender,response,timestamp,uid_receiver,uid_sender,I_count_sender,...,question_count_receiver,question_ratio_receiver,sentence_count_receiver,sentence_ratio_receiver,smokingOk_receiver,term_receiver,type_receiver,updated_receiver,work_receiver,flag_receiver
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1Jqbth3nqH0EWNOmyQmW,1,,17.0,,-Kt7F-lKnoanl54lMnUO,False,2017-09-03 14:58:30.759,0EWNOmyQmW,1Jqbth3nqH,3.0,...,0.0,0.0,6.0,0.015789,0.0,,,,Insightly,True
caEwxVJXPTGyR6cCT5K8,1,,9.0,,-KhP4gs1cj3-MfN7aZkb,False,2017-04-10 22:58:15.738,caEwxVJXPT,GyR6cCT5K8,0.0,...,0.0,0.0,10.0,0.022272,1.0,,,,Shamblott Family Dentistry,True


# DROP CONVERSATIONS WITH MISSING USER DATA

In [6]:
initial_len = len(master_df)

# Coerce flag_receiver to strict booleans: anything that is not equal to True
# (e.g. the NaN left behind by an unmatched join) becomes False.
master_df['flag_receiver'] = master_df['flag_receiver'].apply(
    lambda flag: flag if flag == True else False
)
# Keep only the conversations whose receiver flag is set.
# NOTE(review): flag_sender is not checked here, so conversations whose
# sender has no user record are kept -- confirm this is intended.
master_df = master_df.loc[master_df['flag_receiver']]
final_len = len(master_df)

print("{} conversations were removed because they were missing user data".format(initial_len-final_len))

277 conversations were removed because they were missing user data


# ADD FEATURES

In [None]:
def roommate_rules(roommates):
    """Bucket a roommate count into an ordinal class from 0 to 3.

    Returns 3 for counts above 4, 2 for counts above 1, 1 for counts above 0
    and 0 for everything else. Values that do not compare greater than 0
    (zero, negatives, NaN) therefore end up in bucket 0.
    """
    # Thresholds are checked from the largest down; the first one exceeded wins.
    for threshold, bucket in ((4, 3), (1, 2), (0, 1)):
        if roommates > threshold:
            return bucket
    return 0

def get_rent_range(row, default=175):
    """Return the width of the rent range acceptable to both users.

    Parameters
    ----------
    row : object
        Anything exposing ``maxCost_sender``, ``maxCost_receiver``,
        ``minCost_sender`` and ``minCost_receiver`` as attributes
        (for example a row passed by ``DataFrame.apply(..., axis=1)``).
    default : number, optional
        Value returned when any of the four costs is unusable (NaN, zero or
        negative). Defaults to 175, the constant this function has always
        returned in that case.
        NOTE(review): the rationale for 175 is not documented -- confirm.

    Returns
    -------
    number
        ``min(max costs) - max(min costs)`` when that is positive, ``0`` when
        the two ranges do not overlap, and ``default`` when the inputs are
        unusable.
    """
    max_costs = (row.maxCost_sender, row.maxCost_receiver)
    min_costs = (row.minCost_sender, row.minCost_receiver)

    # Every bound must be strictly positive. Testing the product of the four
    # values instead would accept rows in which two bounds are negative.
    # NaN fails `> 0`, so missing costs fall through to the default as well.
    if not all(cost > 0 for cost in max_costs + min_costs):
        return default

    width = min(max_costs) - max(min_costs)
    return width if width > 0 else 0
    
def get_urgency_receiver(row):
    """Return ``row.available_receiver - row.timestamp``, or None on failure.

    The result is negative when the receiver's ``available`` value precedes
    the conversation timestamp. Any ordinary error while reading or
    subtracting the two values (missing attribute, incompatible types, ...)
    yields ``None`` instead of propagating.
    """
    try:
        return row.available_receiver - row.timestamp
    except Exception:
        # Best-effort: unusable rows become None. Catching Exception rather
        # than using a bare `except` lets KeyboardInterrupt and SystemExit
        # through, so a long-running apply can still be interrupted.
        return None
    
def get_urgency_sender(row):
    """Return ``row.available_sender - row.timestamp``, or None on failure.

    The result is negative when the sender's ``available`` value precedes the
    conversation timestamp. Any ordinary error while reading or subtracting
    the two values (missing attribute, incompatible types, ...) yields
    ``None`` instead of propagating.
    """
    try:
        return row.available_sender - row.timestamp
    except Exception:
        # Best-effort: unusable rows become None. Catching Exception rather
        # than using a bare `except` lets KeyboardInterrupt and SystemExit
        # through, so a long-running apply can still be interrupted.
        return None
    
def my_distance(row):
    """Return the Euclidean distance between the two users' locations.

    Uses the first two components of ``row.location_sender`` and
    ``row.location_receiver``. Returns ``None`` when either location is
    missing or cannot be indexed/subtracted.

    NOTE(review): presumably the locations are latitude/longitude pairs, in
    which case the result is in coordinate units (degrees), not a geographic
    distance -- confirm against the data.
    """
    receiver_loc = row.location_receiver
    sender_loc = row.location_sender
    try:
        delta_first = sender_loc[0] - receiver_loc[0]
        delta_second = sender_loc[1] - receiver_loc[1]
        return (delta_first**2 + delta_second**2)**(.5)
    except Exception:
        # Best-effort: rows with unusable locations (NaN, None, too short)
        # become None. Catching Exception rather than using a bare `except`
        # lets KeyboardInterrupt and SystemExit through.
        return None

def _as_set(value):
    """Coerce a list-like cell to a set; anything else (e.g. NaN) becomes an empty set."""
    # Sets are accepted as well as lists so that running feature_time a second
    # time on an already-processed frame does not wipe the converted columns.
    if isinstance(value, (list, tuple, set, frozenset)):
        return set(value)
    return set()


def feature_time(df):
    """Add sender/receiver comparison features to ``df`` in place and return it.

    ``df`` must hold one conversation per row with the user columns suffixed
    ``_sender`` / ``_receiver``, plus ``timestamp`` and ``response``.

    Columns added or overwritten:

    * ``age_dif`` -- absolute difference of the two ages.
    * ``same_<x>`` -- 1 where sender and receiver hold equal values for the
      corresponding field, else 0 (gender, relate, clean, night, student,
      smoking, type, term, work, city, state, country, college, metro).
    * ``roommate_similarity`` -- absolute difference of the two
      ``roommate_rules`` buckets.
    * ``hobbies_*``, ``amenities_*``, ``neighborhoods_*`` -- the sender and
      receiver columns are converted to sets and ``<field>_overlap`` counts
      the shared elements.
    * ``rent_overlap`` -- ``get_rent_range`` applied to each row.
    * ``urgency_receiver`` / ``urgency_sender`` -- ``available_*`` minus
      ``timestamp``.
    * ``distance`` -- ``my_distance`` applied to each row.
    * ``response`` -- remapped from True/False to 1/0.

    Prints the number of columns that contain at least one null value.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    df['age_dif'] = abs(df.age_sender - df.age_receiver)

    # (feature suffix, user field) pairs; order fixes the order of new columns.
    same_fields = (
        ('gender', 'gender'),
        ('relate', 'inRelationship'),
        ('clean', 'isClean'),
        ('night', 'isNight'),
        ('student', 'isStudent'),
        ('smoking', 'smokingOk'),
        ('type', 'type'),
        ('term', 'term'),
        ('work', 'work'),
        ('city', 'hometownCity'),
        ('state', 'hometownState'),
        ('country', 'hometownCountry'),
        ('college', 'college'),
        ('metro', 'metro'),
    )
    for name, field in same_fields:
        df['same_' + name] = df[field + '_sender'] == df[field + '_receiver']

    df['roommate_similarity'] = abs(df.numRoommates_sender.apply(roommate_rules)
                                    - df.numRoommates_receiver.apply(roommate_rules))

    # Multi-valued fields: normalise both sides to sets, then count what they share.
    for field in ('hobbies', 'amenities', 'neighborhoods'):
        receiver_col = field + '_receiver'
        sender_col = field + '_sender'
        df[receiver_col] = df[receiver_col].apply(_as_set)
        df[sender_col] = df[sender_col].apply(_as_set)
        df[field + '_overlap'] = [len(theirs & ours)
                                  for theirs, ours in zip(df[receiver_col], df[sender_col])]

    df['rent_overlap'] = df.apply(get_rent_range, axis=1)

    # Negative values mean the availability value precedes the conversation.
    df['urgency_receiver'] = df.available_receiver - df.timestamp
    df['urgency_sender'] = df.available_sender - df.timestamp

    df['distance'] = df.apply(my_distance, axis=1)

    # Encode the boolean target and the same_* flags as 1/0.
    binary = {True: 1, False: 0}
    col_to_binary = ['response'] + ['same_' + name for name, _ in same_fields]
    for col in col_to_binary:
        df[col] = df[col].map(binary)

    print("columns with null values: {}".format(len(df.columns[df.isnull().any()])))
    return df
  
# Build the pairwise features. feature_time modifies the frame it is given
# and returns that same frame, so master_df is rebound to the same object.
master_df = feature_time(master_df)
master_df.head()

# PICKLE

In [None]:
# Persist the feature-augmented master table under the name 'data_master'
# (the on-disk format and location are decided by my_pickle.json_it).
mp.json_it(master_df,'data_master')