# Descriptive Statistics - GPS

4-5-17

This notebook is to gather a ground truth on how many tweets are geotagged (have a `lat/lon`).

In [1]:
# Libraries
import os, sys
import pandas as pd

from multiprocessing import Pool

In [2]:
# Directories
ext_dir = '../data/external/'
proc_dir = '../data/processed/'

scrape_in = ext_dir + 'scrape/'
scrape_out = proc_dir + 'scrape/4-5/{}-gps.csv'

**Objectives**

Get an aggregated count over time. For each day we should have 
1. The date
2. Languages
3. GPS

In [3]:
# Functions
def parallelize_series(series, func):
    pool = Pool(6)

    df = pool.map(func, series)

    pool.close()
    pool.join()
    return df


def process_scrape_gps(csv):
    """Does the heavy lifting to process one csv
    """
    agg_cols = ['language', 'date', 'latitude']
    df = pd.read_csv(csv, usecols=agg_cols,
                     parse_dates=['date'],
                    infer_datetime_format=True)
    
    # Drop rows that'll give us trouble
    df['date'].dropna(how='any', inplace=True)
    
    # add day column so we can groupby
    df['day'] = df['date'].dt.date
    
    # get the ground truth for numbers
    # LOGIC: every tweet has to have been created at some point, every tweet has a date
    without_gps = df[['day', 'language', 'date']].groupby(['day', 'language']).count()
    
    # get gps numbers
    with_gps = df[['day', 'language', 'latitude']].groupby(['day', 'language']).count()
    
    # combine them
    gps_df = pd.concat([with_gps, without_gps], axis=1)
    
    # rename columns 
    gps_df.columns = ['gps', 'wo_gps']
    
    return gps_df

def process_scrape_f(f):
     # file names
    f_in = scrape_in + f
    f_out = scrape_out.format(f[:-4])
    
    # sanity check for times sake
    if (os.path.isfile(f_in) == False):
        return
    else:
        try:
            grouped_df = process_scrape_gps(f_in)
            grouped_df.to_csv(f_out)
            print ("...{}".format(f))
        except:
            print ("Couldn't read {}".format(f))
    
#process_scrape_agg(test_f)

In [4]:
# List of all the files we'll be evaluating

# testing
#scrape_fs = os.listdir(scrape_in)

# 'production'
scrape_fs = [x for x in  os.listdir(scrape_in)
           if (x.split('.')[0][-4:] != '_log')]

In [5]:
parallelize_series(scrape_fs, process_scrape_f)

...tweets_immigrant_34360.csv
...tweets_immigrant_34342.csv
Couldn't read tweets_immigrant_34324.csv
...tweets_immigrant_34315.csv
...tweets_immigrant_34333.csv
...tweets_immigrant_34316.csv
...tweets_immigrant_34334.csv
...tweets_immigrant_34325.csv
...tweets_immigrant_34326.csv
...tweets_immigrant_34343.csv
...tweets_immigrant_34351.csv
...tweets_immigrant_34361.csv
...tweets_immigrant_34344.csv


  result = (True, func(*args, **kwds))


...tweets_immigrant_34362.csv
...tweets_immigrant_34352.csv
Couldn't read tweets_immigrant_34345.csv
...tweets_immigrant_34335.csv
...tweets_immigrant_34346.csv
...tweets_immigrant_34336.csv
...tweets_immigrant_34317.csv
...tweets_immigrant_34318.csv
...tweets_immigrant_34327.csv
...tweets_immigrant_34328.csv
...tweets_immigrant_34319.csv
...tweets_immigrant_34363.csv
...tweets_immigrant_34320.csv
...tweets_immigrant_34329.csv
...tweets_immigrant_34353.csv
...tweets_immigrant_34364.csv
...tweets_immigrant_34330.csv
...tweets_immigrant_34354.csv
...tweets_immigrant_34347.csv
...tweets_immigrant_34348.csv
...tweets_immigrant_34337.csv
...tweets_immigrant_34338.csv
...tweets_immigrant_34321.csv
Couldn't read tweets_immigrant_34322.csv
...tweets_immigrant_34331.csv
...tweets_immigrant_34332.csv
...tweets_immigrant_34365.csv
...tweets_immigrant_34366.csv
...tweets_immigrant_34349.csv
...tweets_immigrant_34339.csv
...tweets_immigrant_34355.csv
...tweets_immigrant_34350.csv
...tweets_immigran

  result = (True, func(*args, **kwds))


...tweets_immigrant_34359.csv
...tweets_immigrant_34414.csv
...tweets_immigrant_34405.csv
...tweets_immigrant_34406.csv
...tweets_immigrant_34387.csv
...tweets_immigrant_34381.csv
...tweets_immigrant_34382.csv
...tweets_immigrant_34388.csv
...tweets_immigrant_34397.csv
...tweets_immigrant_34398.csv
Couldn't read tweets_immigrant_34372.csv
Couldn't read tweets_immigrant_34373.csv
...tweets_immigrant_34374.csv
...tweets_immigrant_34415.csv
...tweets_immigrant_34407.csv
...tweets_immigrant_34416.csv
...tweets_immigrant_34408.csv
...tweets_immigrant_34383.csv
...tweets_immigrant_34375.csv
...tweets_immigrant_34384.csv
...tweets_immigrant_34376.csv
...tweets_immigrant_34399.csv
...tweets_immigrant_34400.csv
Couldn't read tweets_immigrant_34401.csv
...tweets_immigrant_34402.csv
...tweets_immigrant_34417.csv
...tweets_immigrant_34409.csv
...tweets_immigrant_34418.csv
...tweets_immigrant_34410.csv
...tweets_immigrant_34377.csv
...tweets_immigrant_34385.csv
...tweets_immigrant_34386.csv
...twee

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

#### Combining individual files

In [6]:
# list all newly processed files 
gps_fs = os.listdir((proc_dir + 'scrape/4-5/'))

In [7]:
def prep_gps_csv(csv):
    """Reads a csv in, sets the multiindex, and returns it"""
    df = pd.read_csv(csv)
    df.set_index(['day', 'language'], inplace=True)
    return df

In [9]:
# create one frame out of all of them
all_gps = pd.concat([prep_gps_csv((proc_dir+'scrape/4-5/'+x)) for x in gps_fs])

# Add all the days/langs up
all_gps = all_gps.groupby(level=[0, 1]).sum()

In [10]:
all_gps.to_csv((proc_dir+'scrape/4-5/all-gps.csv'))

In [11]:
all_gps.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gps,wo_gps
day,language,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-12-22,ar,348,8464
2016-12-22,ca,0,10
2016-12-22,cs,0,2
2016-12-22,de,12,180
2016-12-22,el,0,6


---

Rollup the languages to day sums.

In [12]:
all_gps = pd.read_csv((proc_dir+'scrape/4-5/all-gps.csv'))
all_gps.head()

Unnamed: 0,day,language,gps,wo_gps
0,2016-12-22,ar,348,8464
1,2016-12-22,ca,0,10
2,2016-12-22,cs,0,2
3,2016-12-22,de,12,180
4,2016-12-22,el,0,6


In [15]:
day_gps = all_gps[['day', 'gps', 'wo_gps']].groupby(['day']).sum()
day_gps.to_csv((proc_dir+'scrape/4-5/day-gps.csv'))