In [171]:
import pandas as pd
import numpy as np
import os
from bokeh.charts import Scatter, output_file, show, BoxPlot, Histogram
from bokeh.sampledata.autompg import autompg as bdf
from bokeh.charts import defaults
from bokeh.palettes import brewer

import matplotlib

defaults.width = 700
defaults.height = 600

PROJECT_ROOT = os.path.dirname(os.path.abspath('__file__'))
GENDER_DATA_PATH = os.path.join(PROJECT_ROOT,'all_users_gender_score.csv')

In [65]:
df = pd.read_csv(GENDER_DATA_PATH)

# drop NaN for no messages sent 
df.dropna(thresh=1)

df = df[df.messages_sent.notnull()]

In [159]:
all_users = df['gender'].count()
print(all_users)
grouped = df.groupby('gender')
grouped.count()

224044


Unnamed: 0_level_0,id,messages_sent,msg_scored
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,118706,118706,92892
M,105338,105338,72034


In [55]:
#summary table for all users
df.drop(['id'],axis=1).describe()

Unnamed: 0,messages_sent,msg_scored
count,165103.0,165103.0
mean,9.061531,0.060929
std,14.986762,0.219751
min,0.0,-1.0
25%,2.0,0.0
50%,4.0,0.0
75%,11.0,0.165516
max,1037.0,1.0


In [160]:
# summary table for female/male 
df.drop(['id'],axis=1).groupby(['gender'],as_index=True).describe()
# df.drop(['id'],axis=1).groupby(['gender'],as_index=True).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,messages_sent,msg_scored
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,count,118706.0,92892.0
F,mean,7.671769,0.061183
F,std,13.83793,0.220757
F,min,0.0,-1.0
F,25%,2.0,0.0
F,50%,2.0,0.0
F,75%,8.0,0.166667
F,max,1037.0,1.0
M,count,105338.0,72034.0
M,mean,6.177809,0.060629


In [170]:
#covariance by gender  
grouped = df.groupby('gender')
grouped.apply(lambda x: x['messages_sent'].cov(x['msg_scored']))

gender
F   -0.006383
M    0.005992
dtype: float64

In [40]:
# total messages sent
df['messages_sent'].sum()

1495437.0

In [43]:
# plot message score box plot by gender 
palette = ['#000080', '#DAA520', '#D3D3D3']

# drop NaN msg scored and no gender 

df = df[df.msg_scored.notnull()]
df = df[df.gender.notnull()]
p = BoxPlot(mdf, values='msg_scored', 
                label='gender', 
                outliers=False,
                whisker_color='gender',
                palette=palette,
                title="Average Message Score (Excluding Outliers)")

output_file("templates/_boxplot_msg_score_by_gender.html")
show(p)

In [169]:
#scatterplot sentiment score v number of messages sent 
palette = ['#000080', '#DAA520', '#D3D3D3']
df.dropna(thresh=1)

df = df[df.messages_sent < 500]

p = Scatter(df, x='msg_scored', 
            y='messages_sent', 
            color='gender', 
            legend="top_left",
            legend_sort_field = 'color',
            legend_sort_direction = 'ascending',
            palette=palette, 
            title="Message Sentiment vs Total Number of Messages Sent for Users with <500 messages",
            xlabel="Avg Message Sentiment", 
            ylabel="Total Number of Messages Sent")
output_file("templates/_scatterplot_messages_v_score_by_gender.html")
show(p)


In [164]:
# total messages sent per user by gender
grouped = df.groupby('gender')
print(grouped)
total_messages = grouped['messages_sent'].sum()
male_messages = data[1]
female_messages = data[0]

print(female_messages - male_messages)
910685.0 - 650758.0

<pandas.core.groupby.DataFrameGroupBy object at 0x10cbeebe0>


259927.0

In [50]:
from bokeh.charts import Bar, output_file, show
from bokeh.models import NumeralTickFormatter


p = Bar(df, 'gender', 
        values='messages_sent', 
        title="Total Messages Sent By Gender", 
        color="goldenrod", ylabel='Total Messages Sent by Gender', legend=False
           )
p.yaxis[0].formatter = NumeralTickFormatter(format="0,000")

output_file("templates/total_messages_by_gender_bar.html")

show(p)

In [75]:
all_users = df['gender'].count()
grouped = df.groupby('gender')
female_users = grouped.get_group('M').count()/all_users 
male_users = grouped.get_group('F').count()/all_users 
print(female_users,male_users)

id               0.470167
gender           0.470167
messages_sent    0.470167
msg_scored       0.321517
dtype: float64 id               0.529833
gender           0.529833
messages_sent    0.529833
msg_scored       0.414615
dtype: float64


In [158]:
from bokeh.charts import Donut, show
import pandas as pd
palette = ['lightblue', '#DAA520']

grouped = df.groupby('gender')
female_users = grouped.get_group('F').count()['gender']
male_users = grouped.get_group('M').count()['gender']

data = pd.Series([female_users,male_users], index =['Female: 53%','Male: 47%'])
output_file("templates/_gender_breakdown.html")
pie_chart = Donut(data,title='Number of Users: Male v Female',color=palette,legend=True)
show(pie_chart)

118706 105338
