In [3]:
import os
import socket
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import *

In [4]:
# Reuse a `spark` session that is already bound in this namespace; only build
# one (with an empty application name) when no such name exists yet.
if 'spark' not in globals():
    spark = SparkSession.builder.appName("").getOrCreate()

In [5]:
# Choose the data root from the machine's hostname: hosts whose name contains
# 'samuel' (case-insensitive) use the relative path, every other host uses the
# absolute '/user/...' path.
runs_on_samuel_host = 'samuel' in socket.gethostname().lower()
path_to_data = '../../data' if runs_on_samuel_host else '/user/spf248/twitter/data'

In [6]:
# Run configuration: echo the data root and the country being processed.
print('Path to data:', path_to_data)
country_code = "US"
print('Country:', country_code)

# Iteration indices and model names; later cells use them as path components.
iterations = range(2)
models = ['GLOVE', 'BERT']

# Class labels; each one has its own predictions folder and its own top-k cutoff.
labels = [
    'is_hired_1mo',
    'is_unemployed',
    'job_offer',
    'job_search',
    'lost_job_1mo',
]

# `keywords` and `targets` are not referenced by the cells visible here.
keywords = [
    'fired',
    'hired',
    'job',
    'laid_off',
    'position',
    'quit',
    'unemployed',
    'work',
]
targets = [
    'target_anyone_hiring',
    'target_here_is_a_job_opportunity_you_might_be_interested_in',
    'target_i_am_currently_not_working',
    'target_i_am_searching_for_a_new_position',
    'target_i_got_hired_today',
    'target_i_lost_my_job_today',
    'target_i_recently_started_working_at_my_new_job',
    'target_i_was_fired_earlier_this_week',
    'target_looking_for_a_new_position',
    'target_now_i_am_unemployed',
]

# One rate per entry of `labels`, in the same order (they are zipped together below).
# NOTE(review): presumably the estimated share of tweets belonging to each class,
# with N_random the number of tweets in the random sample — confirm.
base_rates = [
    1.7342911457049017e-05,
    0.0003534645020523677,
    0.005604641971672389,
    0.00015839552996469054,
    1.455338466552472e-05,
]
N_random = 92114009

# rate * N_random, truncated to an integer: the number of top-scored tweets kept per label.
base_ranks = [int(rate * N_random) for rate in base_rates]
label2rank = {label: rank for label, rank in zip(labels, base_ranks)}

Path to data: ../../data
Country: US


In [None]:
# Load the scored random tweets for the selected country and keep only the two
# columns the sampling step needs (the id used as join key, and the tweet text).
random_scored_dir = os.path.join(path_to_data, 'classification', country_code, 'random-scored')
random_tweets = spark.read.parquet(random_scored_dir)
df = random_tweets.select('tweet_id', 'text')
# Mark the frame for caching: it is joined once per (iteration, model, label) below.
df.cache()

In [None]:
# For every (iteration, model, label) combination:
#   1. read the CSV predictions made on the random tweets,
#   2. attach the tweet text and drop tweets whose text contains 'RT @',
#   3. keep the label2rank[label] highest-scored tweets,
#   4. draw a random sample of roughly N_TWEETS_TO_SAMPLE of them,
#   5. write that sample as a single parquet part file, replacing earlier output.
N_TWEETS_TO_SAMPLE = 110  # expected (not exact) number of tweets per sample

for iteration in iterations:
    print()
    print('********* Iteration:',iteration,'*********')
    for model in models:
        print()
        print('****** Model:',model,'******')
        for label in labels:
            print('*** Label:',label,'***')
            run_parts = ('iteration_'+str(iteration), model, label)
            predictions_glob = os.path.join(path_to_data,'classification',country_code,'predictions',*run_parts,'random*')
            output_dir = os.path.join(path_to_data,'classification',country_code,'sample_top_tweets',*run_parts)

            predictions = spark.read.option("header", "true").csv(predictions_glob)
            # Whichever of 'proba' / 'second' is present gets renamed to 'score';
            # renaming a column that does not exist is a no-op.
            # NOTE(review): presumably the score column name differs between models — confirm.
            scored = (
                predictions
                .withColumnRenamed('proba','score')
                .withColumnRenamed('second','score')
                .select('tweet_id','score')
                # The CSV is read without a schema, so the score arrives as a string.
                .withColumn('score',F.col('score').cast('float'))
            )
            with_text = scored.join(df,on='tweet_id')
            without_retweets = with_text.where(~with_text.text.contains('RT @'))

            top_k = label2rank[label]
            top_tweets = without_retweets.sort(F.col("score").desc()).limit(top_k)

            # sample() expects a fraction in [0, 1]; clamp it so a label with fewer
            # than N_TWEETS_TO_SAMPLE top tweets keeps all of them, and avoid
            # dividing by zero when the rank is 0.
            fraction = min(1.0, N_TWEETS_TO_SAMPLE / max(top_k, 1))
            sample_top_tweets = top_tweets.sample(False, fraction, seed=0)

            # The save mode must be spelled "overwrite"; the previous "ovserwrite"
            # is not a valid Spark save mode and made the write fail.
            sample_top_tweets.coalesce(1).write.mode("overwrite").parquet(output_dir)

In [9]:
# Convert each parquet sample written by the sampling step into a pandas pickle
# under .../labeling/<iteration>/sampled/evalproc/.
for iteration in iterations:
    print()
    print('********* Iteration:',iteration,'*********')
    # to_pickle does not create missing parent directories, so make sure the
    # per-iteration output folder exists before writing into it.
    # NOTE(review): to_pickle and os.makedirs act on the local filesystem, while
    # Spark may resolve path_to_data on a distributed filesystem — confirm this
    # cell is only meant to run where both views of the path agree.
    labeling_dir = os.path.join(path_to_data,'classification',country_code,'labeling',str(iteration),'sampled','evalproc')
    os.makedirs(labeling_dir, exist_ok=True)
    for model in models:
        print()
        print('****** Model:',model,'******')
        for label in labels:
            print('*** Label:',label,'***')
            sample_dir = os.path.join(path_to_data,'classification',country_code,'sample_top_tweets','iteration_'+str(iteration),model,label)
            # Use a dedicated name instead of `df`: rebinding `df` here would replace
            # the cached Spark DataFrame of random tweets that the sampling cell
            # joins against, so re-running that cell afterwards would break.
            sampled_pdf = spark.read.parquet(sample_dir).toPandas()
            filename = 'high_scored_tweets_iteration'+str(iteration)+'_'+model+'_'+label+'.pkl'
            sampled_pdf.to_pickle(os.path.join(labeling_dir,filename))


********* Iteration: 0 *********

****** Model: GLOVE ******
*** Label: is_hired_1mo ***
*** Label: is_unemployed ***
*** Label: job_offer ***
*** Label: job_search ***
*** Label: lost_job_1mo ***

****** Model: BERT ******
*** Label: is_hired_1mo ***
*** Label: is_unemployed ***
*** Label: job_offer ***
*** Label: job_search ***
*** Label: lost_job_1mo ***

********* Iteration: 1 *********

****** Model: GLOVE ******
*** Label: is_hired_1mo ***
*** Label: is_unemployed ***
*** Label: job_offer ***
*** Label: job_search ***
*** Label: lost_job_1mo ***

****** Model: BERT ******
*** Label: is_hired_1mo ***
*** Label: is_unemployed ***
*** Label: job_offer ***
*** Label: job_search ***
*** Label: lost_job_1mo ***
