# Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import json

from collections import defaultdict

# FIRST DATA FILE

### Read in data

In [None]:
# Load the raw users export; the listed columns are parsed as datetimes.
filename = "/Users/gandalf/Documents/data/data_users.csv"
date_columns = ['created', 'updated', 'available', 'birthday', 'lastActive']
user_df = pd.read_csv(filename, parse_dates=date_columns)

# Substitute explicit placeholders for missing values in these columns only;
# every other column keeps its NaN/NaT entries.
missing_value_defaults = {
    'legacyId': 'None',
    'about': '',
    'birthday': pd.to_datetime('1899-01-01'),
    'latitude': 0,
    'longitude': 0,
}
user_df = user_df.fillna(missing_value_defaults)

# Length of the free-text "about" field (0 where it was missing, because the
# placeholder above is the empty string).
user_df['len_about'] = user_df.about.apply(len)

# Expose legacyId under the name `uid`, then drop the columns listed in
# col_to_drop (ids, hashed password, e-mail and name fields).
user_df['uid'] = user_df.legacyId
col_to_drop = ['id', 'legacyId', 'hashedPassword', 'email',
               'firstName', 'lastName', 'username']
user_df = user_df.drop(labels=col_to_drop, axis=1)

# Fix the column order, with `uid` first so it can become the index.
user_columns = [
    'uid', 'created', 'updated', 'about', 'len_about', 'available',
    'birthday', 'collegeId', 'emailVerified', 'foundRoommate', 'gender',
    'groupChat', 'hometownId', 'inRelationship', 'isClean', 'isNight',
    'isStudent', 'lastActive', 'latitude', 'longitude', 'maxCost',
    'minCost', 'numRoommates', 'onboarded', 'petsOk', 'pictureId',
    'roomPostId', 'roomTypeId', 'smokingOk', 'term', 'work',
]

# Remove fully identical rows, then index the frame by uid.
user_df = user_df[user_columns].drop_duplicates().set_index('uid')
user_df.head(2)

### Pickle dataframe

In [None]:
# Intentionally a no-op cell: user_df is not written to disk at this point.
# The same target path is pickled further down in this notebook, after the
# per-user response statistics have been joined onto user_df.
'''Don't pickle here, pickle later '''

# path_users = "/Users/gandalf/Documents/data/data_users.pkl"
# user_df.to_pickle(path_users)



# SECOND DATA FILE

### Read in data

In [None]:
# Load the messages export and parse it into nested Python dicts/lists.
filename = "/Users/gandalf/Documents/data/data_messages.json"

# Read inside a context manager so the file handle is closed deterministically
# (the previous bare open(...).read() left the handle to the garbage collector).
with open(filename) as json_file:
    json_data = json_file.read()

data = json.loads(json_data)

### Create first data frame (all messages)

In [None]:
column_names = ['message_id',       # Added: key of the message inside its conversation
                'conversation_id',  # Added: key of the enclosing conversation (repeats per message)
                'uid',              # message sender
                'read',             # true/false value
                'readBy',           # message recipient(s)
                'text_length',      # Added: number of whitespace-separated words in the text
                'timestamp',        # time
                'imageURL',         # image url
                'emailAttempted',   # ?
                'emailed']          # ?

# Fields copied straight from the message payload; a missing field becomes ''.
passthrough_fields = ('uid', 'read', 'readBy', 'timestamp',
                      'imageURL', 'emailAttempted', 'emailed')

# One accumulator list per output column, keyed by the column name.
collected = {name: [] for name in column_names}

for conversation_id, conversation_data in data["conversations"].items():
    for message_id, message_data in conversation_data.items():
        collected['message_id'].append(message_id)
        collected['conversation_id'].append(conversation_id)
        # Missing text counts as zero words.
        collected['text_length'].append(len(message_data.get('text', '').split()))
        for field in passthrough_fields:
            collected[field].append(message_data.get(field, ''))

# Stack the per-column lists as rows and transpose, so each list ends up as a
# column (in column_names order).
message_df = pd.DataFrame([collected[name] for name in column_names]).T
message_df.columns = column_names

message_df.head(2)

### Pickle first data frame (all messages)

In [None]:
# Write the all-messages frame to disk as a pickle at a fixed local path.
path_message = "/Users/gandalf/Documents/data/data_message.pkl"
message_df.to_pickle(path_message)

### Create second data frame (last messages)

In [None]:
column_names = ['user_id',        # key of the entry under data['users']
                'first_ten',      # first ten characters of the conversation key
                'last_ten',       # remaining characters of the key (original note: probably a user id -- unverified)
                'lastMessageId']  # message id stored for that conversation

# Flatten users -> conversations into one (user_id, key, value) triple per
# conversation entry, preserving the nested iteration order.
conversation_entries = [
    (user_id, key, value)
    for user_id, user_data in data['users'].items()
    for key, value in user_data['conversations'].items()
]

# Derive each output column from the flattened entries.
ui = [user_id for user_id, key, value in conversation_entries]
ft = [key[:10] for user_id, key, value in conversation_entries]
lt = [key[10:] for user_id, key, value in conversation_entries]
lm = [value['lastMessageId'] for user_id, key, value in conversation_entries]

# Stack the column lists as rows and transpose so each becomes a column.
lastmessage_df = pd.DataFrame([ui, ft, lt, lm]).T
lastmessage_df.columns = column_names

lastmessage_df.head(2)

### Pickle second data frame (last messages)

In [None]:
# Write the last-message frame to disk as a pickle at a fixed local path.
path_lastmessage = "/Users/gandalf/Documents/data/data_lastmessage.pkl"
lastmessage_df.to_pickle(path_lastmessage)

### Create third data frame (response df)

In [None]:
# Count messages per conversation by summing a constant column. The helper
# column `const` stays on message_df afterwards.
message_df['const'] = 1
convo_length = message_df.groupby('conversation_id')['const'].sum().to_dict()

column_names = ['conv_id',       # conversation id
                'response',      # True when the conversation has more than one message
                'first_uid',     # sender of the first message seen for the conversation
                'first_mid',     # message id of that first message
                'second_uid',    # sender of the next message seen (None if there is none)
                'second_mid']    # message id of that next message (None if there is none)

ci, rs, fu, fm, su, sm = [], [], [], [], [], []
already_added = set()

# NOTE(review): the pairing below relies on all rows of a conversation being
# adjacent in message_df, so the row after a "first" row belongs to the same
# conversation. It also marks `response` True without checking that the second
# sender differs from the first -- confirm that is the intended definition.
awaiting_second = False

for index, row in message_df.iterrows():
    conversation = row.conversation_id

    # Single-message conversation: complete record with no second message.
    if convo_length[conversation] == 1:
        ci.append(conversation)
        rs.append(False)
        fu.append(row.uid)
        fm.append(row.message_id)
        su.append(None)
        sm.append(None)
        continue

    # Both leading messages were already captured; ignore the rest.
    if conversation in already_added:
        continue

    if awaiting_second:
        # Second half of the record started on the previous qualifying row.
        su.append(row.uid)
        sm.append(row.message_id)
        already_added.add(conversation)
    else:
        # First half of a record for a multi-message conversation.
        ci.append(conversation)
        rs.append(True)
        fu.append(row.uid)
        fm.append(row.message_id)

    awaiting_second = not awaiting_second

# Stack the column lists as rows and transpose so each becomes a column.
response_df = pd.DataFrame([ci, rs, fu, fm, su, sm]).T
response_df.columns = column_names
response_df.head()

### Pickle third data frame (response)

In [None]:
# Write the response frame to disk as a pickle at a fixed local path.
path_response = "/Users/gandalf/Documents/data/data_response.pkl"
response_df.to_pickle(path_response)

# COMBINE DATA FILES

In [None]:
# Per-user statistics derived from response_df (one row per conversation):
#   sent      -- number of conversations in which the user sent the first message
#   responses -- how many of those conversations have response == True
#   ratio     -- responses / sent
# The helper column `const` stays on response_df afterwards.
response_df['const'] = 1
messages_sent_by_user = response_df.groupby('first_uid').const.sum()
# `== True` is deliberate: the column is object-typed, so an explicit
# comparison yields a clean boolean mask.
messages_responded_by_user = (
    response_df[response_df.response == True].groupby('first_uid').const.sum()
)
# Division aligns on first_uid; users with no responded conversation get NaN
# here and are zero-filled below.
messages_response_rate_by_user = messages_responded_by_user / messages_sent_by_user

user_response = pd.concat(
    [messages_sent_by_user, messages_responded_by_user, messages_response_rate_by_user],
    axis=1,
)
# The labels must be a FLAT list. A nested list ([[...]]) makes pandas build a
# MultiIndex, so the joined columns would not be addressable as plain
# 'sent'/'responses'/'ratio' and the fillna below would silently do nothing.
user_response.columns = ['sent', 'responses', 'ratio']
user_response = user_response.fillna(0)

# Left-join onto the users frame by index (uid). Users who never started a
# conversation have no match, so their statistics are zero-filled.
user_df = user_df.join(user_response)
user_df = user_df.fillna({'sent': 0, 'responses': 0, 'ratio': 0})

In [None]:
# Write the users frame (now including the joined sent/responses/ratio
# columns) to disk as a pickle at a fixed local path.
path_users = "/Users/gandalf/Documents/data/data_users.pkl"
user_df.to_pickle(path_users)

# INVESTIGATE A CONVERSATION BY ID

In [None]:
# Set this to the conversation key to inspect.
convo_id = 'put id here'

# Look the conversation up directly instead of scanning every conversation;
# an unknown id yields an empty mapping, so nothing is printed.
selected_conversation = data["conversations"].get(convo_id, {})

# Print each message's text in stored order ('' when a message has no text).
for message_id, message_data in selected_conversation.items():
    print(message_data.get('text', ''))

# Show the matching rows of the all-messages frame as the cell output.
message_df[message_df.conversation_id == convo_id]