In [5]:
import os
import numpy as np
from timeit import default_timer as timer
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.sql.window import Window
from pyspark.sql.types import *
# Reuse `spark` when the name is already defined in this kernel; otherwise
# get or create a session (with an empty application name).
try:
    spark
except NameError:
    spark = (
        SparkSession.builder
        .appName("")
        .getOrCreate()
    )

In [3]:
# Run configuration: country to process, label columns, and data root.
country_code = "US"
labels = ['loss', 'unemployed', 'search', 'hire', 'offer']
# Subset of `labels` that is combined into the `out_of_work` indicator.
out_of_work = ['loss', 'unemployed', 'search']
path_to_data = '/user/spf248/twitter/data'
print('Country: {}'.format(country_code))
print('Labels: {}'.format(', '.join(labels)))

Country: US
Labels: loss, unemployed, search, hire, offer


In [None]:
# Open the two ORC inputs for the configured country and report how long the
# reads took to set up:
#   <path_to_data>/users/by-country/country_code=<country_code>
#   <path_to_data>/classification/<country_code>/keywords
print('Load...')
start = timer()
users_path = os.path.join(path_to_data, 'users', 'by-country', 'country_code=' + country_code)
predictions_path = os.path.join(path_to_data, 'classification', country_code, 'keywords')
users_attributes = spark.read.orc(users_path)
predictions = spark.read.orc(predictions_path)
end = timer()
print('Computing time (in sec.):', round(end - start))

In [None]:
def find_nearest(array, value):
    """Return the index of the element of `array` that is closest to `value`.

    Elements and `value` are converted with `float()` before comparison. When
    several elements are equally close, the lowest index wins (`np.argmin`
    returns the first minimum).

    Args:
        array: iterable of numbers (or values accepted by `float()`).
        value: number (or value accepted by `float()`) to locate.

    Returns:
        The 0-based index as a Python `int`, or `None` when `array` is `None`,
        `value` is `None`, or `array` is empty. Returning `None` lets a Spark
        UDF built on this function yield null for null/empty input instead of
        failing the task.
    """
    if array is None or value is None:
        return None
    target = float(value)
    distances = [abs(float(x) - target) for x in array]
    if not distances:
        # np.argmin raises ValueError on an empty sequence.
        return None
    return int(np.argmin(distances))
# Wrap find_nearest as a Spark UDF whose result column is IntegerType.
find_nearest_udf = F.udf(f=find_nearest, returnType=IntegerType())

In [None]:
# Build `tmp`: one row per (year, user_id) holding, for the user's previous row
# in year order, the index of the nearest per-year decile cut point of
#   - account age in days at the end of that year, and
#   - tweets per day over the user's active period in that year.
# NOTE(review): this relies on `predictions` already having a `year` column
# (e.g. from partitioned storage) — confirm against the input schema.

# Per-user/year tweet counts, each user's earliest tweet, and account creation.
tweets_by_user_year = predictions.groupby(['user_id', 'year']).agg(F.count('tweet_id').alias('n_tweets'))
first_tweet_by_user = predictions.groupby('user_id').agg(F.min('created_at').alias('oldest_tweet'))
account_creation_by_user = users_attributes.select('user_id', F.col('created_at').alias('account_creation'))

tmp = (
    tweets_by_user_year
    .join(first_tweet_by_user, on='user_id')
    .join(account_creation_by_user, on='user_id')
    # Calendar bounds of the row's year: Jan 1 and Dec 31.
    .withColumn('start', F.concat_ws('-', F.col('year'), F.lit(1), F.lit(1)).cast('timestamp'))
    .withColumn('end', F.concat_ws('-', F.col('year'), F.lit(12), F.lit(31)).cast('timestamp'))
    # Some account creation dates are posterior to the first tweet, so the
    # earlier of the two is used as the account's starting point.
    .withColumn(
        'account_age_by_year',
        F.datediff(F.col('end'), F.least(F.col('account_creation'), F.col('oldest_tweet'))),
    )
    # Days from max(Jan 1, first tweet) through Dec 31, counted inclusively.
    # Without the +1 a first tweet on Dec 31 gives a 0-day period, so the
    # division below yields null (or raises under ANSI mode) and the user's
    # following year is silently dropped by na.drop().
    .withColumn(
        'activity_period_by_year',
        F.datediff(F.col('end'), F.greatest(F.col('start'), F.col('oldest_tweet'))) + F.lit(1),
    )
    .withColumn('activity_by_year', F.col('n_tweets') / F.col('activity_period_by_year'))
)

# Lag over each user's rows ordered by year. The lagged value comes from the
# previous row present for that user, which is not necessarily the preceding
# calendar year if the user has no row for it.
w_user = Window.partitionBy('user_id').orderBy(F.col('year'))
tmp = tmp.withColumn('lag_account_age_by_year', F.lag('account_age_by_year').over(w_user))
tmp = tmp.withColumn('lag_activity_by_year', F.lag('activity_by_year').over(w_user))
# Drops every row with a null in any column; this includes each user's first
# row, whose lagged values are null.
tmp = tmp.na.drop()

# Per-year cut points at the 0th..90th percentiles of both lagged measures,
# computed in a single aggregation and joined back once.
decile_probs = 'array(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)'
quantiles_by_year = tmp.groupBy('year').agg(
    F.expr('percentile_approx(lag_account_age_by_year, {})'.format(decile_probs)).alias('lag_account_age_by_year_quantiles'),
    F.expr('percentile_approx(lag_activity_by_year, {})'.format(decile_probs)).alias('lag_activity_by_year_quantiles'),
)
# Lazy per-measure views, kept under their original names for any later use.
quantiles_account_age = quantiles_by_year.select('year', 'lag_account_age_by_year_quantiles')
quantiles_activity = quantiles_by_year.select('year', 'lag_activity_by_year_quantiles')
tmp = tmp.join(quantiles_by_year, on='year')

# Bucket index = position of the cut point nearest to the user's lagged value.
tmp = tmp.withColumn(
    'lag_account_age_by_year_quantile',
    find_nearest_udf('lag_account_age_by_year_quantiles', 'lag_account_age_by_year'),
)
tmp = tmp.withColumn(
    'lag_activity_by_year_quantile',
    find_nearest_udf('lag_activity_by_year_quantiles', 'lag_activity_by_year'),
)
tmp = tmp.select('year', 'user_id', 'lag_account_age_by_year_quantile', 'lag_activity_by_year_quantile')

In [None]:
# Aggregate predictions into (year, month, location, lagged-quantile) buckets
# and write the result as parquet under
# <path_to_data>/classification/<country_code>/users_predictions.
print('Count users in each bucket...')
start = timer()

# Columns that are reduced at both aggregation levels.
indicator_cols = labels + ['out_of_work']
bucket_cols = [
    'year',
    'month',
    'user_location',
    'lag_account_age_by_year_quantile',
    'lag_activity_by_year_quantile',
]

# out_of_work = 1 when the `out_of_work` label columns sum to more than 0.
df = predictions.withColumn(
    'out_of_work',
    (sum([predictions[name] for name in out_of_work]) > 0).cast("int"),
)
# Year and month are taken from the tweet timestamp and stored as strings.
df = df.withColumn('year', F.year('created_at').cast("string"))
df = df.withColumn('month', F.month('created_at').cast("string"))

# One row per (year, month, location, user) holding the maximum of each
# indicator. Only the indicators used below are aggregated, with explicit
# aliases, rather than every numeric column followed by a string-based rename
# of the generated `max(...)` names.
df = df.groupBy('year', 'month', 'user_location', 'user_id').agg(
    *[F.max(name).alias(name) for name in indicator_cols]
)

# Attach the user's lagged account-age / activity quantile for that year.
df = df.join(tmp, on=['year', 'user_id'])

# Per bucket: sum of each indicator and the number of rows, i.e. of
# (user, location) pairs. The count is written under the column name `user_id`,
# matching the existing output schema.
df = df.groupby(bucket_cols).agg(
    *(
        [F.sum(name).alias(name) for name in indicator_cols]
        + [F.count('user_id').alias('user_id')]
    )
)
df.write.mode("overwrite").parquet(os.path.join(path_to_data, 'classification', country_code, 'users_predictions'))
end = timer()
print('Computing time (in sec.):', round(end - start))