In [5]:
import os
from timeit import default_timer as timer
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import QuantileDiscretizer
# Keep whatever session is already bound to `spark` (presumably injected by
# the kernel/shell -- confirm for your environment); only build a new one
# when the name is not defined at all.
try:
    spark
except NameError:
    session_builder = SparkSession.builder.appName("")
    spark = session_builder.getOrCreate()

In [1]:
# --- Run configuration -------------------------------------------------------
# Country whose classification outputs are processed.
country_code = "US"

# Label columns that are summed in the final aggregation, and the subset of
# them that is combined into the single `out_of_work` flag.
labels = ['loss', 'unemployed', 'search', 'hire', 'offer']
out_of_work = ['loss', 'unemployed', 'search']

# User-level features that get discretized into `n_quantiles` buckets each.
features = ['statuses_per_day', 'account_age']
n_quantiles = 10

# Root directory of the data; inputs and outputs are resolved under it.
path_to_data = '/user/spf248/twitter/data'

# Echo the configuration so it is recorded in the cell output.
print('Country:', country_code)
print('Number of quantiles:', n_quantiles)
print('Labels:', ', '.join(labels))
print('Features:', ', '.join(features))

Country: US
Number of quantiles: 10
Labels: loss, unemployed, search, hire, offer
Features: statuses_per_day, account_age


In [None]:
print('Load...')
start = timer()

# Both inputs live under <path_to_data>/classification/<country_code>/.
classification_dir = os.path.join(path_to_data, 'classification', country_code)

# `keywords` is read as ORC, `users` as Parquet.
predictions = spark.read.orc(os.path.join(classification_dir, 'keywords'))
users = spark.read.parquet(os.path.join(classification_dir, 'users'))

end = timer()
print('Computing time (in sec.):', round(end - start))

In [None]:
print('Discretize...')
start = timer()

# Per-user feature table: keep the raw counters and derive the posting rate.
# NOTE(review): an `account_age` of 0 presumably yields a null rate, which the
# "skip" policy below would then drop -- confirm this is intended.
tmp = (
    users
    .select('user_id', 'statuses_count', 'account_age')
    .withColumn('statuses_per_day', F.col('statuses_count') / F.col('account_age'))
)

# Add one `<feature>_quantile` column per feature. Each discretizer is fitted
# on the current `tmp`, i.e. after rows skipped for earlier features are gone,
# so the iteration order over `features` matters.
for feature in features:
    discretizer = QuantileDiscretizer(
        numBuckets=n_quantiles,
        inputCol=feature,
        outputCol=feature + '_quantile',
        relativeError=0.01,
    )
    fitted = discretizer.fit(tmp)
    # "skip": rows with an invalid value for this feature are filtered out.
    fitted = fitted.setHandleInvalid("skip")
    tmp = fitted.transform(tmp)

end = timer()
print('Computing time (in sec.):', round(end - start))

In [None]:
print('Count users in each status...')
start = timer()

# Columns carried through both aggregation stages, and the quantile columns
# produced for each feature on `tmp`.
status_cols = labels + ['out_of_work']
quantile_cols = [feature + '_quantile' for feature in features]

# Flag rows where at least one of the `out_of_work` labels is positive, and
# derive string-typed calendar keys from `created_at`.
df = predictions.withColumn(
    'out_of_work',
    (sum([predictions[col] for col in out_of_work]) > 0).cast("int"),
)
df = df.withColumn('year', F.year('created_at').cast("string"))
df = df.withColumn('month', F.month('created_at').cast("string"))

# Stage 1: one row per (year, month, user_location, user_id), keeping the
# maximum of each status column. Only the columns used downstream are
# aggregated, and each result is aliased explicitly rather than recovered by
# stripping "max(" / ")" from auto-generated column names.
# NOTE(review): assumes the label columns are 0/1 indicators, so the max means
# "the user had at least one such row in that month" -- confirm.
df = df.groupBy('year', 'month', 'user_location', 'user_id').agg(
    *[F.max(name).alias(name) for name in status_cols]
)

# Attach each user's feature quantiles. This is a default (inner) join, so
# users absent from `tmp` are dropped.
df = df.join(tmp.select('user_id', *quantile_cols), on='user_id')

# Stage 2: per (year, month, user_location, quantile bucket), sum the status
# columns and count the per-user rows.
# The count is deliberately still written under the column name `user_id`,
# exactly as before, so existing readers of `users_counts` keep working.
agg_exprs = [F.sum(name).alias(name) for name in status_cols]
agg_exprs.append(F.count('user_id').alias('user_id'))
df = df.groupBy(['year', 'month', 'user_location'] + quantile_cols).agg(*agg_exprs)

df.write.mode("overwrite").parquet(os.path.join(path_to_data,'classification',country_code,'users_counts'))
end = timer()
print('Computing time (in sec.):', round(end - start))