In [1]:
import pandas as pd
import numpy as np
from time import time
import re
import string
import socket
from glob import glob
import os
import pyarrow.parquet as pq

In [2]:
# Notebook configuration: target country and the label columns analysed below.
country_code = "US"
print('Country:', country_code)
labels = [
    'is_hired_1mo',
    'is_unemployed',
    'job_offer',
    'job_search',
    'lost_job_1mo',
]

# The data root depends on the host: a hostname containing 'samuel' selects
# the relative local path, any other host selects the cluster path.
path_to_data = (
    '../../data'
    if 'samuel' in socket.gethostname().lower()
    else '/scratch/spf248/twitter/data'
)
print('Path to data:', path_to_data)

Country: US
Path to data: ../../data


In [3]:
# Read the pickled table of labeled tweets for the configured country and
# report how many rows it holds.
print('Extract labeled tweets:\n')
labeled_tweets = pd.read_pickle(
    os.path.join(path_to_data, 'classification', country_code, 'labeling', 'labels.pkl')
)
print('# labeled tweets:', len(labeled_tweets))

Extract labeled tweets:

# labeled tweets: 13045


In [4]:
# Preview the first rows of the labeled tweets loaded above.
labeled_tweets.head()

Unnamed: 0,tweet_id,text,is_unemployed,lost_job_1mo,job_search,is_hired_1mo,job_offer
0,470867540739383296,Is it true? 47% of Unemployed Americans Have J...,no,no,no,no,no
1,360093342517837824,"Unemployed, Under-Employed, Unhappily Employed...",no,no,no,no,yes
2,458751933139132417,Currently Unemployed and Looking to Expand you...,no,no,no,no,no
3,703677501579497472,Been unemployed for awhile? Contracting can he...,no,no,no,no,yes
4,375336349336739840,"Unemployed, Under-Employed, Unhappily Employed...",no,no,no,no,yes


In [5]:
# Load every parquet part of the training sample and concatenate them in a
# single call; growing the frame inside the loop re-copies all rows each time.
sample_files=sorted(glob(os.path.join(path_to_data,'classification',country_code,'labeling','0','sampled','training','*')))
sample_frames=[]
for sample_file in sample_files:
    # Test the extension only, so a '.parquet' elsewhere in the path cannot
    # make non-data files (e.g. _SUCCESS) look like parquet parts.
    if sample_file.endswith('.parquet'):
        sample_frames.append(pd.read_parquet(sample_file))
    else:
        print(sample_file,'skipped')
# pd.concat raises on an empty list, so fall back to an empty frame.
tweets=pd.concat(sample_frames) if sample_frames else pd.DataFrame()
print('# Input tweets:',tweets.shape[0])

../../data/classification/US/labeling/0/sampled/training/_SUCCESS skipped
# Input tweets: 9800


In [6]:
# Share of 'yes' labels per keyword among the randomly sampled tweets.
# 'unsure' labels are mapped to NaN and therefore excluded from both the
# numerator and the denominator of each share.
label_codes={'yes':1,'no':0,'unsure':np.nan}

# Keyword form of drop(): the positional axis argument is rejected by pandas>=2.0.
random_tweets=tweets.loc[tweets.target=='random'].drop(columns=['target','text'])
# Merge on the columns shared by both frames (pandas default when `on` is omitted).
labeled_random=random_tweets.merge(labeled_tweets.drop(columns='text'))

# Fail loudly on any label value outside the expected vocabulary, as a plain
# dict lookup would, instead of letting .map() turn it into NaN silently.
unexpected_values=set(labeled_random[labels].to_numpy().ravel())-set(label_codes)
if unexpected_values:
    raise KeyError('Unexpected label values: '+repr(unexpected_values))
labeled_random[labels]=labeled_random[labels].apply(lambda column:column.map(label_codes))

# mean() skips NaN, i.e. sum of 'yes' divided by the count of non-'unsure' labels.
# Columns are sorted to keep a stable, alphabetical column order.
df=labeled_random.groupby('keyword')[labels].mean().sort_index(axis=1)

In [7]:
# Display the per-keyword table: one row per keyword, one column per label.
df

Unnamed: 0_level_0,is_hired_1mo,is_unemployed,job_offer,job_search,lost_job_1mo
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fired,0.01,0.050505,0.0,0.0,0.02
hired,0.061224,0.010101,0.112245,0.010101,0.0
job,0.0,0.02,0.39,0.01,0.0
laid_off,0.0,0.131313,0.04,0.0,0.030303
position,0.0,0.0,0.79,0.01,0.0
quit,0.0,0.03,0.0,0.01,0.0
unemployed,0.0,0.23,0.06,0.06,0.03
work,0.0,0.0,0.0,0.0,0.0


In [8]:
# Load the keyword base rates from their CSV directory and keep the first row
# as a float Series named 'pct_tweets'.
# The files are read with pandas because no `spark` session is imported or
# created anywhere in this notebook, so relying on one breaks a fresh
# Restart & Run All on a plain Python kernel.
# NOTE(review): assumes the directory holds Spark-style 'part-*' CSV files
# with a header row -- confirm against the job that writes it.
base_rates_dir=os.path.join(path_to_data,'classification',country_code,'base_rates_keywords')
# Skip zero-byte parts: pd.read_csv raises on completely empty files.
part_files=[part_file for part_file in sorted(glob(os.path.join(base_rates_dir,'part-*'))) if os.path.getsize(part_file)>0]
if not part_files:
    raise FileNotFoundError('No non-empty part files found in '+base_rates_dir)
base_rates_keywords=pd.concat([pd.read_csv(part_file) for part_file in part_files],ignore_index=True)
base_rates_keywords=base_rates_keywords.loc[0].astype(float).rename('pct_tweets')

In [9]:
# Print a heading, then display the 'pct_tweets' Series loaded above.
print('% keywords:')
base_rates_keywords

% keywords:


fired         0.000617
hired         0.000182
job           0.012056
laid_off      0.000013
position      0.001112
quit          0.002123
unemployed    0.000061
work          0.022954
Name: pct_tweets, dtype: float64

In [10]:
# Weight each keyword row of `df` by the matching entry of
# `base_rates_keywords` (aligned on the row index), then add the weighted
# rows together to obtain one value per label.
print('Lower bound base rates:')
base_rates = df.mul(base_rates_keywords, axis='index').sum(axis='index')
base_rates

Lower bound base rates:


is_hired_1mo     0.000017
is_unemployed    0.000353
job_offer        0.005605
job_search       0.000158
lost_job_1mo     0.000015
dtype: float64

In [11]:
# Write the per-label base rates to base_rates.csv under the country's
# classification folder, without a header row.
base_rates.to_csv(
    os.path.join(path_to_data, 'classification', country_code, 'base_rates.csv'),
    header=False,
)