# Descriptive Statistics

1. Descriptive Statistics and Graphs
2. Number of Tweets (Total)
3. Number of Tweets (Time Series)
4. Gender Distribution
5. Language Distribution
6. Follower Counts
7. Client Usage (Android, iPhone, web etc.)

In [None]:
import pandas as pd
import numpy as np

In [None]:
!cat ../data/stream.tsv | head

In [2]:
twitterData=pd.read_table('../data/stream.tsv',
                          encoding='utf-8',
                          na_values=['NaN',''],
                          parse_dates=[1]
                        )
# Read in TSV and turn off NaN catching to leave in unrecognised genders

In [3]:
twitterData.head()

Unnamed: 0,ID,Date,Content,Type,DataSiftLanguage,TwitterLanguage,TwitterLocation,UNGPLocation,DSGender,UNGPGender,UNGPGenderProb,Followers,Friends,Sentiment,Topic,Subtopic
0,1e4088e50af4a700e07435975fd8ac10,2014-07-11 00:00:06,@takeyoudrews gjfkdjfkd fiquei puta qnd soube ...,twitter,pt,pt,,,,,,5008,4154,,Prevention,Neutral
1,1e4088e750b2a380e0745149c91047ac,2014-07-11 00:01:07,Odio que insulten a Argentina saben hijas de p...,twitter,es,es,Argentina,AR,,male,1.0,673,979,,Discrimination,Negative
2,1e408946aec2a380e0746014f5019be4,2014-07-11 00:43:47,"SÍNDICA FILHA DA PUTA, TE ODEIO, VELHA NOJENTA...",twitter,pt,pt,,,,,,267,268,,Discrimination,Negative
3,1e4089f9fb1aa800e07446359506a908,2014-07-11 02:04:00,Esse cara cansou de me peitar e ficar encarand...,twitter,pt,pt,,,unisex,,,568,258,,Discrimination,Negative
4,1e40925b4388a380e074eab24d9e7ba2,2014-07-11 18:03:47,@SPachec0 com ela eu vou usar camisinha de hor...,twitter,pt,pt,,,female,male,0.968354,432,243,,Prevention,Positive


In [4]:
twitterData.dtypes

ID                          object
Date                datetime64[ns]
Content                     object
Type                        object
DataSiftLanguage            object
TwitterLanguage             object
TwitterLocation             object
UNGPLocation                object
DSGender                    object
UNGPGender                  object
UNGPGenderProb             float64
Followers                  float64
Friends                    float64
Sentiment                  float64
Topic                       object
Subtopic                    object
dtype: object

## All Tweets

### Number of Tweets

In [5]:
nTweets = len(twitterData.index)
print "There are", nTweets, "tweets in the full dataset"

There are 29150 tweets in the full dataset


### Number of Tweets Over Time

In [6]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [9]:
#twitterData['Friends'].plot()

### Documents by DataSift Language

In [11]:
# Languages included in taxonomy: en, hi, ur, sw, ha, ig, yo
nDataSiftLanguage = twitterData.DataSiftLanguage.value_counts(sort=True, ascending=False, bins=None)
nDataSiftLanguage[0:10]

pt    25621
en     1079
es      756
sr      468
la      285
gl       73
rm       65
nn       65
sv       57
uz       56
dtype: int64

In [7]:
# Languages included in taxonomy: en, hi, ur, sw, ha, ig, yo
nDataSiftLanguage = twitterData.DataSiftLanguage.value_counts(normalize=True, sort=True, ascending=False, bins=None)
nDataSiftLanguage[0:10]

pt    0.878937
en    0.037015
es    0.025935
sr    0.016055
la    0.009777
gl    0.002504
rm    0.002230
nn    0.002230
sv    0.001955
uz    0.001921
dtype: float64

### Documents by Twitter Language

In [12]:
nTwitterLanguage = twitterData.TwitterLanguage.value_counts(sort=True, ascending=False, bins=None)
nTwitterLanguage[0:10]

pt    27089
es     1061
en      382
it       84
et       77
fr       59
pl       50
ro       44
sl       43
is       33
dtype: int64

In [8]:
nTwitterLanguage = twitterData.TwitterLanguage.value_counts(normalize=True, sort=True, ascending=False, bins=None)
nTwitterLanguage[0:10]

pt    0.929297
es    0.036398
en    0.013105
it    0.002882
et    0.002642
fr    0.002024
pl    0.001715
ro    0.001509
sl    0.001475
is    0.001132
dtype: float64

### Number of Documents by Twitter Location

In [9]:
# Reminder: Locations we are interested in are "IN", "PK", "NG", and "KE".
nLocation = twitterData.TwitterLocation.value_counts(normalize=False, sort=True, ascending=False, bins=None)
nLocation[0:15]

Rio de Janeiro          784
Brasil                  663
São Paulo               353
Brazil                  311
RJ                      153
Rio de Janeiro          143
SP                      122
Porto Alegre             99
Brasília                 92
PORTUGUÊS | Italiano     82
Curitiba                 72
Belo Horizonte           69
sp                       69
Rio Grande do Sul        67
Recife                   64
dtype: int64

### Number of Documents by UNGP Location

In [10]:
# Reminder: Locations we are interested in are "IN", "PK", "NG", and "KE".
nUngpLocation = twitterData.UNGPLocation.value_counts(normalize=False, sort=True, ascending=False, bins=None)
nUngpLocation[0:15]

BR    11267
US      711
AR      226
FR      137
GB      107
AU       92
MX       73
ES       68
IT       67
RO       62
CO       59
CH       58
DE       55
PT       43
PH       38
dtype: int64

In [37]:
# Getting Vincent ready 
vincent.initialize_notebook()
gpBlue='#00aeef'
gpLightGray='#96999b'
gpDarkBlue='#00447c'
gpRed='#cf5c42'
gpBrown='#e1d8ad'
gpPink='#f4d5e3'
gpLightBlue='#e1f4fd'

In [39]:
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_followers = mean_location_grouped.sort('Followers')['Followers']
followersBar = vincent.Bar(mean_followers)
followersBar.axis_titles(x='Country', y='Followers')
from vincent.axes import AxisProperties
from vincent.properties import PropertySet
from vincent.values import ValueRef
for axis in followersBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20), 
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))
followersBar.axes[0].properties.labels.angle = ValueRef(value=0)
followersBar.axes[0].properties.labels.align = ValueRef(value='center')
followersBar.axes[0].properties.title.dy = ValueRef(value=20)
followersBar.scales[2].range = [gpBlue]
followersBar.to_json('../charts/followersBar.json')
followersBar

In [40]:
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_friends = mean_location_grouped.sort('Friends')['Friends']
friendsBar = vincent.Bar(mean_friends)
friendsBar.axis_titles(x='Country', y='Friends')
for axis in friendsBar.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20), 
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))
friendsBar.axes[0].properties.labels.angle = ValueRef(value=0)
friendsBar.axes[0].properties.labels.align = ValueRef(value='center')
friendsBar.axes[0].properties.title.dy = ValueRef(value=20)
friendsBar.scales[2].range = [gpDarkBlue]
friendsBar.to_json('../charts/friendsBar.json')
friendsBar

In [41]:
location_grouped = twitterData.groupby('UNGPLocation')
mean_location_grouped = location_grouped.mean().dropna()
mean_genderProb = mean_location_grouped.sort('UNGPGenderProb')['UNGPGenderProb']
genderProb = vincent.Bar(mean_genderProb)
genderProb.axis_titles(x='Country', y='Average Gender Probablility')
for axis in genderProb.axes:
    axis.properties = AxisProperties()
    for prop in ['ticks', 'axis', 'major_ticks', 'minor_ticks']:
        setattr(axis.properties, prop, PropertySet(stroke=ValueRef(value=gpLightGray)))
    axis.properties.title = PropertySet(font_size=ValueRef(value=20), 
                                        fill=ValueRef(value=gpLightGray))
    axis.properties.labels = PropertySet(fill=ValueRef(value=gpLightGray))
genderProb.axes[0].properties.labels.angle = ValueRef(value=0)
genderProb.axes[0].properties.labels.align = ValueRef(value='center')
genderProb.axes[0].properties.title.dy = ValueRef(value=20)
genderProb.scales[2].range = [gpRed]
genderProb.to_json('../charts/genderProbBar.json')
genderProb

In [42]:
mpld3.enable_notebook()
gatesCountry = twitterData.UNGPLocation.value_counts(normalize=False, sort=True, ascending=False, bins=None)
gatesCountryFig = gatesCountry.plot(kind='barh', color='#00aeef')
mpld3.display()



AttributeError: 'BlendedGenericTransform' object has no attribute 'contains_branch'

In [70]:
import ggplot as gg
(ggplot(gg.aes(x='UNGPLocation'), data=twitterData)
+ gg.geom_bar() + gg.ggtitle("Gates Tweets")
+ gg.labs("Country", "Number of tweets"))

TypeError: 'module' object is not callable

In [77]:
languagePlot = ggplot(aes(x='DataSiftLanguage'), data=twitterData) + geom_bar() + ggtitle("Language Distribution") + labs("Language", "Number of tweets")
languagePlot

NameError: name 'aes' is not defined

In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)