In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Location
from shapely.geometry import Point
from geopandas import GeoDataFrame
import geopandas as gpd

import my_pickle as mp

import warnings
warnings.filterwarnings('ignore')

# READ IN DATA

In [None]:
# Load the user table and move its index into a regular column.
user_df = mp.unjson_it('data_user').reset_index()

# Rename every column by position; the first entry ('uid') names the column
# that reset_index() placed at the front.
user_df.columns = [
    'uid',
    'I_count', 'I_ratio',
    'about', 'activeAt', 'age', 'amenities', 'available',
    'birthday',
    'college', 'created',
    'exclaim_count', 'exclaim_ratio',
    'facebookId',
    'gender',
    'has_about', 'has_room', 'hobbies',
    'hometownCity', 'hometownCountry', 'hometownState',
    'inRelationship', 'isClean', 'isNight', 'isStudent',
    'len_about', 'linkedinId', 'location',
    'maxCost', 'metro', 'minCost',
    'neighborhoods', 'numRoommates',
    'onboarded',
    'period_count', 'period_ratio', 'petsOk', 'picture',
    'question_count', 'question_ratio',
    'sentence_count', 'sentence_ratio', 'smokingOk',
    'term', 'type',
    'updated',
    'work',
]

# Hand the date-like columns to the project helper
# (presumably it restores them to datetime values -- verify in my_pickle).
user_df = mp.reinstate_date(
    user_df,
    ['activeAt', 'available', 'birthday', 'created', 'updated'],
)

# Constant column of ones, used later as a simple row counter in groupby().count().
user_df['const'] = 1

# Remaining tables used by the analysis.
convo_df = mp.unjson_it('data_convo')
message_df = mp.unjson_it('data_message')
master_df = mp.unjson_it('data_master')

# Sanity check: show the most recent conversation timestamp in the loaded data.
print("Make sure we're not using ECT data")
print(convo_df['timestamp'].max())

# SEPARATE INTO ACTIVE AND NOT ACTIVE

In [None]:
# Every uid that appears on either side of a conversation
uid_senders = set(convo_df['uid_sender'])
uid_receivers = set(convo_df['uid_receiver'])

# A user is active if they are in at least one of the two sets
uid_active = uid_receivers | uid_senders

In [None]:
# Boolean flag: True when the user's uid is among the senders/receivers
user_df['active'] = user_df['uid'].isin(uid_active)

# The mean of a boolean column is the fraction of True values
print("{:.3} of onboarded users are active".format(user_df['active'].mean()))

In [None]:
# Split the users into the active subset and the not-active subset
active_df = user_df.loc[user_df['active'].eq(True)]
not_active_df = user_df.loc[user_df['active'].eq(False)]

# GET CITY DATAFRAME WITH ACTIVE/NOT ACTIVE COUNTS

In [None]:
# Users per metro in each group, as {metro: count} dicts.
# Counting the all-ones `const` column gives the size of each group.
not_active_count = not_active_df.groupby('metro')['const'].count().to_dict()
active_count = active_df.groupby('metro')['const'].count().to_dict()

# Metros to report on. These come from the not-active group only, so a metro
# that has no not-active users does not appear here.
cities = not_active_count.keys()

In [None]:
# One row per metro, in a single 'metro' column.
# Built from a dict so the frame gets a normal 0..n-1 index; the second
# positional argument of pd.DataFrame is `index`, so passing [1]*len(cities)
# there labelled every row with the same index value 1.
city_df = pd.DataFrame({'metro': list(cities)})

In [None]:
# Per-metro user counts; a metro missing from a lookup contributes 0 users.
city_df['active_users'] = city_df.metro.apply(lambda metro: active_count.get(metro, 0))
city_df['not_active_users'] = city_df.metro.apply(lambda metro: not_active_count.get(metro, 0))

# Share of a metro's users that are active: active / (active + not active).
# Dividing by the not-active count alone gives an odds ratio, which can exceed 1
# and is not the fraction of users that are active.
city_df['percent_active'] = city_df['active_users'] / (
    city_df['active_users'] + city_df['not_active_users']
)
city_df.head(2)

In [None]:
# Horizontal bar chart: one bar per metro, bar length = percent_active
fig, ax = plt.subplots(figsize=(16, 6))
ax.barh(city_df['metro'], city_df['percent_active'])

In [None]:
# fn to display plot
def plot_feature(col, bins=None):
    """Overlay normalised histograms of one column for active vs. not-active users.

    Reads the module-level frames ``active_df`` and ``not_active_df``, draws
    both histograms, shows the figure and then prints the mean of the column
    for each group. Callers in this notebook set the figure title with
    ``plt.title(...)`` immediately before calling this function.

    Parameters
    ----------
    col : str
        Name of a column present in both ``active_df`` and ``not_active_df``.
    bins : optional
        Passed through unchanged to ``Series.hist`` for both groups.

    Notes
    -----
    The printed means are formatted with ``.9f``, which assumes the column
    mean is a number.
    NOTE(review): confirm behaviour for date-like columns such as 'created'.
    """
    # `density=True` replaces the former `normed=True` keyword, which newer
    # matplotlib releases no longer accept. Each histogram is normalised
    # separately, so groups of different size can be compared by shape.
    active_df[col].hist(figsize=(16, 4), bins=bins, density=True, alpha=.5)
    not_active_df[col].hist(figsize=(16, 4), bins=bins, density=True, alpha=.3, color='r')

    plt.xlabel(col, fontsize=14)
    # The histograms are normalised, so the y axis is a density, not a user count.
    plt.ylabel('Density of Users', fontsize=14)
    plt.legend(['Active', "Not Active"])
    plt.show()

    print(col+" active users: {:.9f}".format(active_df[col].mean()))
    print(col+" not active users: {:.9f}\n\n".format(not_active_df[col].mean()))

# COMPARE 'ACTIVE' VS 'NOT ACTIVE' USERS

A user is counted as 'active' if they have sent or received at least one message.

In [None]:
# Encode gender as a 0/1 flag: 1 for 'male', 0 for anything else.
# A value that is already 1 is kept, so re-running this cell leaves the
# column unchanged instead of resetting every row to 0.
# NOTE(review): missing or other gender values are also mapped to 0 and are
# therefore counted as female in the gender counts below -- confirm intended.
user_df['gender'] = user_df['gender'].apply(lambda g: 1 if g in ('male', 1) else 0)

# Re-derive both subsets so that they carry the recoded gender column.
active_df = user_df[user_df['active']==True]
not_active_df = user_df[user_df['active']==False]

## Gender

In [None]:
# Gender split among active users (gender is a 0/1 flag, 1 = male,
# so summing the column counts the males)
active_males = active_df['gender'].values.sum()
active_users = len(active_df)
active_females = active_users - active_males

In [None]:
# Gender split among not-active users (gender is a 0/1 flag, 1 = male)
not_active_users = len(not_active_df)
not_active_males = not_active_df.gender.values.sum()
# Females are the not-active users that are not male. The subtrahend must be
# the not-active male count, not the active male count.
not_active_females = not_active_users - not_active_males

In [None]:
# Totals per gender across the active and not-active groups
all_females = not_active_females + active_females
all_males = not_active_males + active_males

In [None]:
# Text table: counts per gender in each group, plus the share of that gender
# that is active. Emitted with one print call, one table line per argument.
print(
    " MALE VS FEMALE ACTIVE USERS \n",
    "         Active  Not-Active     Percent-Active",
    " Male\t {} \t {} \t\t{:.4}".format(active_males, not_active_males, active_males/all_males),
    " Female\t {} \t {}\t\t{:.4}".format(active_females, not_active_females, active_females/all_females),
    sep="\n",
)

## Birthday (age) and minimum cost

In [None]:
# 'age' for active vs. not-active users, using the default bins
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='age')

In [None]:
# 'minCost' for active vs. not-active users, using the default bins
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='minCost')

## Age

In [None]:
# 'age' again, this time with unit-width bins whose edges run from 1 to 99
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='age', bins=range(1, 100))

## Has About

In [None]:
# 'has_about' with bin edges 0, 1, 2 (one bin starting at 0, one starting at 1)
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='has_about', bins=range(0, 3))

In [None]:
# 'len_about' in bins of width 10; the first edge is 1, so a length of 0
# falls outside the plotted range
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='len_about', bins=range(1, 1000, 10))

In [None]:
# Hand user_df to the project helper under the name 'data_user_active'
# (presumably this serialises the frame -- verify in my_pickle).
# At this point the frame carries the 'const' and 'active' columns added above
# and the 0/1 recoded 'gender' column.
mp.json_it(user_df,'data_user_active')

In [None]:
# 'created' for active vs. not-active users, using the default bins.
# NOTE(review): 'created' is one of the columns passed to mp.reinstate_date,
# while plot_feature formats the column mean with '.9f' -- confirm this cell
# produces the intended output for a date-like column.
plt.title('Comparing Active and Not Active Users', fontsize=24)
plot_feature(col='created')