In [None]:
import pandas as pd
import numpy as np
from numpy.random import choice
import my_pickle as mp
import matplotlib.pyplot as plt

from importlib import reload

In [None]:
# Re-import the project-local my_pickle module so that edits made to it on disk
# take effect without restarting the kernel.
reload(mp)

In [None]:
# Load the user and conversation tables through the project-local my_pickle helper.
# NOTE(review): unjson_it is opaque from here; presumably it returns pandas
# DataFrames (later cells call .join/.head on the results) — confirm.
user_df = mp.unjson_it('data_user')
convo_df = mp.unjson_it('data_convo')

# Sanity check: print the most recent conversation timestamp so the reader can
# verify which data set was loaded.
print("Make sure we're not using ECT data")
print(convo_df.timestamp.max())

In [None]:
# Preview the first rows of the raw conversation table.
convo_df.head()

# GET USER DATA FOR SENDER AND RECEIVER

In [None]:
# Attach each conversation's sender and receiver profile to the conversation row.
#
# The first join adds the user columns under their plain names (looked up via
# 'first_uid'); the second join adds them again for 'second_uid', where the
# clashing right-hand columns receive the '_receiver' suffix. Finally the plain
# (sender) columns are renamed to '<name>_sender' and the index is cast to str.
#
# NOTE(review): this cell rebinds convo_df to the joined frame, so running it a
# second time joins the user columns onto an already-joined table. Re-run the
# load cell first if this cell needs to be repeated.
old_names = user_df.columns
new_names = {name: name + '_sender' for name in old_names}
convo_df = (
    convo_df
    .join(user_df, on='first_uid')
    .join(user_df, on='second_uid', rsuffix="_receiver")
    .rename(index=str, columns=new_names)
)
convo_df.head()

In [None]:
def roommate_rules(roommates):
    """Bucket a roommate count into one of four ordinal groups.

    Returns
    -------
    int
        0 when ``roommates`` is not greater than 0 (this includes NaN, for
        which every comparison is false), 1 for values in (0, 1], 2 for values
        in (1, 4] and 3 for anything above 4.
    """
    # Walk the thresholds from the bottom up; each guard exits as soon as the
    # value fails to clear the next boundary.
    if not roommates > 0:
        return 0
    if not roommates > 1:
        return 1
    if not roommates > 4:
        return 2
    return 3

def get_rent_range(row, default_overlap=175):
    """Return the width of the overlap between sender and receiver rent ranges.

    Parameters
    ----------
    row : object
        Anything exposing ``maxCost_sender``, ``maxCost_receiver``,
        ``minCost_sender`` and ``minCost_receiver`` attributes, e.g. a row
        handed over by ``DataFrame.apply(..., axis=1)``.
    default_overlap : number, optional
        Value returned when any of the four bounds is unusable (zero,
        negative or NaN). Defaults to 175, the fallback that used to be
        hard-coded here (the legacy notes in this notebook put the average
        overlap of two users' ranges at about $175).

    Returns
    -------
    number
        ``min(max costs) - max(min costs)`` when that difference is positive,
        0 when the two ranges do not overlap, or ``default_overlap`` when a
        bound is unusable.
    """
    max1 = row.maxCost_sender
    max2 = row.maxCost_receiver
    min1 = row.minCost_sender
    min2 = row.minCost_receiver

    # Check every bound on its own. Multiplying the four values and testing
    # the sign of the product lets an even number of negative bounds slip
    # through as if they were valid. NaN compares false and therefore also
    # lands on the fallback.
    if not all(bound > 0 for bound in (max1, max2, min1, min2)):
        return default_overlap

    upper = min(max1, max2)
    lower = max(min1, min2)
    # Disjoint ranges give a negative difference; clamp it to zero.
    return max(upper - lower, 0)
    
def feature_time(df):
    """Add pairwise sender/receiver features to ``df`` and encode flags as ints.

    The frame is modified in place and also returned.

    Added columns
    -------------
    age_dif : absolute difference of ``age_sender`` and ``age_receiver``.
    same_* : whether the sender and receiver values of a field are equal
        (encoded 1/0 by the mapping step below).
    roommate_similarity : absolute difference of the two users' roommate
        buckets as produced by ``roommate_rules``.
    hobbies_overlap, amenities_overlap : number of items the two users share.
        The underlying ``hobbies_*`` / ``amenities_*`` columns are converted
        to sets as a side effect.
    rent_overlap : result of ``get_rent_range`` for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Conversation rows carrying ``*_sender`` / ``*_receiver`` user columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new and re-encoded columns.
    """
    def as_set(value):
        # Missing list-valued cells can show up as None or as float NaN.
        # NaN is truthy, so a plain ``set(x) if x`` would call set() on a
        # float and raise TypeError; treat it like any other empty value.
        if value is None or isinstance(value, float) or not value:
            return set()
        return set(value)

    df['age_dif'] = abs(df.age_sender - df.age_receiver)

    # (new column, user field) pairs; list order fixes the column order.
    same_value_flags = [
        ('same_gender', 'gender'),
        ('same_relate', 'inRelationship'),
        ('same_clean', 'isClean'),
        ('same_night', 'isNight'),
        ('same_student', 'isStudent'),
        ('same_smoking', 'smokingOk'),
        ('same_type', 'type'),
        ('same_term', 'term'),
        ('same_work', 'work'),
        ('same_city', 'hometownCity'),
        ('same_state', 'hometownState'),
        ('same_country', 'hometownCountry'),
        ('same_college', 'college'),
    ]
    for flag, field in same_value_flags:
        # Note: NaN == NaN is False, so two missing values count as different.
        df[flag] = df[field + '_sender'] == df[field + '_receiver']

    df['roommate_similarity'] = abs(df.numRoommates_sender.apply(roommate_rules)
                                    - df.numRoommates_receiver.apply(roommate_rules))

    # Neighborhood overlap is intentionally not computed (it was disabled in
    # the original notebook); only hobbies and amenities are compared.
    for field in ('hobbies', 'amenities'):
        receiver_col = field + '_receiver'
        sender_col = field + '_sender'
        df[receiver_col] = df[receiver_col].apply(as_set)
        df[sender_col] = df[sender_col].apply(as_set)
        df[field + '_overlap'] = [len(theirs & ours)
                                  for theirs, ours in zip(df[receiver_col], df[sender_col])]

    df['rent_overlap'] = df.apply(get_rent_range, axis=1)

    # Encode booleans and two-valued categoricals as integers. Series.map
    # turns every value that is not a key of this dict into NaN.
    # NOTE(review): the 'nan' key only matches the literal string 'nan'; real
    # NaN values stay NaN after the mapping — confirm which was intended.
    TF = {True: 1, False: 0, 'male':1, 'female':0, 'shared':0,'private':1,'nan':-1}
    col_to_TF = ['response',
                 'has_about_sender', 'has_about_receiver', 'gender_sender','gender_receiver',
                 'facebookId_sender', 'linkedinId_sender', 'picture_sender', 'has_room_sender',
                 'type_sender','type_receiver',
                 'facebookId_receiver', 'linkedinId_receiver', 'picture_receiver', 'has_room_receiver',
                 'same_work','same_city','same_state','same_country','same_college',
                 'same_gender','same_relate','same_clean','same_night','same_student','same_smoking','same_term','same_type']
    for col in col_to_TF:
        df[col] = df[col].map(TF)

    print("columns with null values: {}".format(len(df.columns[df.isnull().any()])))
    return df

def drop_columns(df):
    """Return a copy of ``df`` without identifier and raw per-user columns.

    Removes the conversation/message id columns and, for both the sender and
    the receiver, the listed raw profile fields. Raises ``KeyError`` (from
    ``DataFrame.drop``) if any of those columns is absent.
    """
    id_columns = ['conv_id', 'first_uid', 'second_uid', 'first_mid', 'second_mid']

    # Profile fields removed for both roles; each is stored as
    # '<field>_sender' and '<field>_receiver'.
    per_user_fields = ['updated', 'activeAt',
                       'created', 'available', 'about', 'birthday', 'location', 'work',
                       'hometownCity', 'hometownState', 'hometownCountry', 'college', 'neighborhoods',
                       'amenities', 'hobbies']

    unwanted = list(id_columns)
    for role in ('_sender', '_receiver'):
        unwanted.extend(field + role for field in per_user_fields)

    return df.drop(unwanted, axis=1)

# Build the full feature table (feature_time mutates and returns convo_df),
# then strip identifier/raw columns to get the modelling frame.
master_df = feature_time(convo_df)
df = drop_columns(master_df)

# Impute missing values with the mean of their own column.
# The earlier `df.T.fillna(df.mean(axis=1)).T` filled each gap with the mean of
# its *row*, i.e. an average over unrelated features, and the double transpose
# collapsed every column to object dtype.
# NOTE(review): non-numeric columns are left untouched by this imputation —
# confirm none of them are expected to be filled.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Preview the first rows of the imputed feature frame.
df.head()

# PICKLE

In [None]:
# Persist both frames through the project-local my_pickle helper: the trimmed
# feature frame as 'data_features' and the full frame as 'data_master'.
# NOTE(review): json_it is opaque from here; presumably it serializes the frame
# under the given name — confirm format and location in my_pickle.
mp.json_it(df,'data_features')
mp.json_it(master_df,'data_master')

# OLD STUFF

In [None]:
# def get_rent_range(first_user, second_user):
#     max1 = first_user.maxCost
#     max2 = second_user.maxCost
#     min1 = first_user.minCost
#     min2 = second_user.minCost
#     if max1*max2*min1*min2 > 0:
#         upper = min(first_user.maxCost,second_user.maxCost)
#         lower = max(first_user.minCost,second_user.minCost)
#         if upper-lower > 0: return upper-lower
#         else: return 0
#     else: return 175
    
# def test_get_rent_range():    
#     yes1 = '0a9yOPKFSH'
#     yes2 = '013LzOrVju'
#     no1 = '0UBLgJIHgz'
#     no2 = '0EWNOmyQmW'
#     low = '01DE0NCjwh'
#     high = '02GDyQPLII'
#     print(get_rent_range(user_df.loc[yes1], user_df.loc[yes2])) # should be 150
#     print(get_rent_range(user_df.loc[yes1], user_df.loc[no1]))  # should be 175
#     print(get_rent_range(user_df.loc[no2], user_df.loc[no1]))   # should be 175
#     print(get_rent_range(user_df.loc[low], user_df.loc[high]))  # should be 0
    
# def get_average_overlap(df,samples):
#     # to find average overlap of two users' rent ranges
#     # comes out to about $175
#     s = 0
#     for n in range(samples):
#         two_users = choice(user_df[user_df.cost_range>0].index,2)
#         overlap = user_df.loc[two_users,'maxCost'].min()-user_df.loc[two_users,'minCost'].max()
#         if overlap < 0: overlap = 0
#         s += overlap
#     return s/samples

# def roommate_rules(roommates):
#     if roommates > 4: return 3
#     elif roommates > 1: return 2
#     elif roommates > 0: return 1
#     else: return 0
    
# def get_similar_roommates(first_user, second_user):
#     roommates1 = roommate_rules(first_user.numRoommates)
#     roommates2 = roommate_rules(second_user.numRoommates)
#     return abs(roommates1-roommates2)

# def get_ammenities_overlap(first, second):
#     if isinstance(first.amenities, float) or isinstance(second.amenities, float):
#         return 0
#     else:
#         return len(set(first.amenities).intersection(set(second.amenities))) + 1

# def test_ammenities_overlap():    
#     user0 = user_df.index[0]   # [Dishwasher]
#     user1 = user_df.index[1]   # NaN
#     user2 = user_df.index[2]   # [Laundry, A/C, Dishwasher, Private Bath]
#     user8 = user_df.index[8]   # [Laundry]

#     a = '8Ws3QWmC4B'
#     b = '52lmw7KQcq'
    
#     get_ammenities_overlap(user_df.loc[str(a)], second = user_df.loc[str(b)])

# def get_hobbies_overlap(first, second):
#     if isinstance(first.hobbies, float) or isinstance(second.hobbies, float):
#         return 0
#     else:
#         return len(set(first.hobbies).intersection(set(second.hobbies))) + 1

# FAILED METRICS
# def get_inverse_distance(first_user, second_user):
#     x1 = first_user.longitude
#     x2 = second_user.latitude
#     y1 = first_user.longitude
#     y2 = second_user.latitude
#     dist = ((x2-x1)**2+(y2-y1)**2)**(.5)
# #     if dist > 0: return dist
# #     else: return 1
#     return ((x2-x1)**2+(y2-y1)**2)**(-.5)

# def get_urgency(timestamp, second_user):
#     time_of_message = pd.to_datetime(timestamp*1000000)
#     time_of_looking
#     print(time_of_message)
# get_urgency(1479148499453, 'x')

# def get_length_similarity(first, second):
#     return abs(np.log(first.len_about)-np.log(second.len_about))

# def test_length_similarity(a,b):    
#     user14 = user_df.index[14]   # 14
#     user15 = user_df.index[15]   # 45
#     user16 = user_df.index[16]   # 42
#     user17 = user_df.index[17]   # 108
#     user18 = user_df.index[18]   # 521
#     user19 = user_df.index[19]   # 0
#     user20 = user_df.index[20]   # 0 

#     one = user_df.index[a]
#     two = user_df.index[b]
    
    
#     return get_length_similarity(user_df.loc[str(one)], second = user_df.loc[str(two)])
    
# print(test_length_similarity(14,15))
# print(test_length_similarity(15,16))
# print(test_length_similarity(16,17))
# print(test_length_similarity(17,18))
# print(test_length_similarity(18,19))

In [None]:
# def feature_time(df, user_df):
    
#     '''
#     age_dif: difference in ages between users
#     rent_overlap: by how much do their ideal rent ranges overlap?
#     same_gender: m/m or f/f
#     same_relate:  are they both in relationships or single?
#     same_clean: are they both clean/messy?
#     same_night: are they both early-birds or night owls?
#     same_student: are they both students?
#     sender_attractiveness: do people generally respond to this sender's messages?
#     receiver_selectivity: does this receiver generally respond to people's messages?
#     '''
    
#     ad,ro,di,rn,ao,ho,sg,sr,sc,sn,ss,sm,st,sa,rr,al,l1, l2 = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]

    
#     for index, row in df.iterrows():
#         first = user_df.loc[str(row.first_uid)]
#         second = user_df.loc[str(row.second_uid)]
#         ad.append(abs(first.age - second.age))
#         ro.append(get_rent_range(first, second))
# #         di.append(get_inverse_distance(first, second))
#         rn.append(get_similar_roommates(first, second))
#         ao.append(get_ammenities_overlap(first, second))
#         ho.append(get_hobbies_overlap(first, second))
#         sg.append(first.gender == second.gender)
#         sr.append(first.inRelationship == second.inRelationship)
#         sc.append(first.isClean == second.isClean)
#         sn.append(first.isNight == second.isNight)
#         ss.append(first.isStudent == second.isStudent)
#         sm.append(first.smokingOk == second.smokingOk)
#         st.append(first.term == second.term)
#         sa.append(first.attractiveness)
#         rr.append(second.responsiveness)
# #         al.append(get_length_similarity(first, second))
#         l1.append(first.len_about)
#         l2.append(second.len_about)

#     print(len(df))
#     print(len(sr))
#     df['age_dif'] = ad
#     df['rent_overlap'] = ro
# #     df['inverse_distance'] = di
#     df['roommate_num_sim'] = rn
#     df['amenities_overlap'] = ao
#     df['hobbies_overlap'] = ho
#     df['same_gender'] = sg
#     df['same_relate'] = sr
#     df['same_clean'] = sc
#     df['same_night'] = sn
#     df['same_student'] = ss
#     df['same_smoking'] = sm
#     df['same_term'] = st
#     df['sender_attractiveness'] = sa
#     df['receiver_responsiveness'] = rr
#     df['length_sender'] = l1
#     df['length_receiver'] = l2
    
#     TF = {True: 1, False: 0}
#     col_to_TF = ['same_gender','same_relate','same_clean','same_night','same_student','same_smoking','same_term']
#     for col in col_to_TF:
#         df[col] = df[col].map(TF)
    
#         print(len(df))
#     print(len(sr))
#     df['age_dif'] = abs(age_sender-age_receiver)
#     df['rent_overlap'] = ro
# #     df['inverse_distance'] = di
#     df['roommate_num_sim'] = rn
#     df['amenities_overlap'] = ao
#     df['hobbies_overlap'] = ho
#     df['same_gender'] = sg
#     df['same_relate'] = sr
#     df['same_clean'] = sc
#     df['same_night'] = sn
#     df['same_student'] = ss
#     df['same_smoking'] = sm
#     df['same_term'] = st
#     df['sender_attractiveness'] = sa
#     df['receiver_responsiveness'] = rr
#     df['length_sender'] = l1
#     df['length_receiver'] = l2
    
    
    # return df  (tail of the commented-out legacy feature_time above; left live it made this cell fail to parse)

In [None]:
# Final preview of the feature frame.
df.head()