spark-submit --master yarn --deploy-mode cluster  --conf spark.yarn.submit.waitAppCompletion=false --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.speculation=false --conf spark.executorEnv.LANG=en_US.UTF-8 --conf spark.yarn.appMasterEnv.LANG=en_US.UTF-8 --driver-cores 20 --driver-memory 50G --num-executors 30 --executor-cores 20 --executor-memory 20G pyspark-sample-training-set.py

In [1]:
# Difference between two millisecond counters, shown in seconds.
# NOTE(review): presumably the start/end epoch timestamps of the cluster run — confirm.
run_started_ms = 1579051242747
run_finished_ms = 1579053833248
print('Computing Time (in sec):')
(run_finished_ms - run_started_ms) / 1000

Computing Time (in sec):


2590.501

In [2]:
import os
import sys
import socket
import re
import numpy as np
import string
from timeit import default_timer as timer
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lower,rand, lit
import pyspark.sql.functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [3]:
host_name = socket.gethostname()
print('Hostname:', host_name)

# Reuse whatever is already bound to `spark`; build a session only when the
# name is undefined.
try:
    spark
except NameError:
    app_name = "sample-tweets-for-labeling"
    if 'samuel' in host_name.lower():
        # Local machine: the driver host is pinned to localhost.
        print('Create Local SparkSession')
        spark = (
            SparkSession.builder
            .config("spark.driver.host", "localhost")
            .appName(app_name)
            .getOrCreate()
        )
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(app_name).getOrCreate()

# Last expression: display the session.
spark

Hostname: Samuels-MacBook-Pro.local


In [4]:
country_code = "US"
print('Country:', country_code)

# The data root depends on where the notebook runs: a relative path on the
# local machine (hostname contains 'samuel'), an absolute path on the cluster.
running_locally = 'samuel' in socket.gethostname().lower()
data_root = (
    '../../data/classification'
    if running_locally
    else '/user/spf248/twitter/data/classification'
)
path_to_data = os.path.join(data_root, country_code)
print('Path to data:',path_to_data)

Country: US
Path to data: ../../data/classification/US


In [5]:
print('Import tweets containing keywords')
filtered_path = os.path.join(path_to_data, 'filtered-scored')
filtered = spark.read.parquet(filtered_path)
# cache() marks the frame for reuse and returns it, so the cell displays its schema.
filtered.cache()

Import tweets containing keywords


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean, target_anyone_hiring: float, target_here_is_a_job_opportunity_you_might_be_interested_in: float, target_i_am_currently_not_working: float, target_i_am_searching_for_a_new_position: float, target_i_got_hired_today: float, target_i_lost_my_job_today: float, target_i_recently_started_working_at_my_new_job: float, target_i_was_fired_earlier_this_week: float, target_looking_for_a_new_position: float, target_now_i_am_unemployed: float]

In [6]:
print('Import random tweets')
random_path = os.path.join(path_to_data, 'random-scored')
# NOTE: `random` here is a DataFrame; the name would shadow the stdlib module
# of the same name if that module were ever imported in this notebook.
random = spark.read.parquet(random_path)
# cache() marks the frame for reuse and returns it, so the cell displays its schema.
random.cache()

Import random tweets


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean, target_anyone_hiring: float, target_here_is_a_job_opportunity_you_might_be_interested_in: float, target_i_am_currently_not_working: float, target_i_am_searching_for_a_new_position: float, target_i_got_hired_today: float, target_i_lost_my_job_today: float, target_i_recently_started_working_at_my_new_job: float, target_i_was_fired_earlier_this_week: float, target_looking_for_a_new_position: float, target_now_i_am_unemployed: float]

In [7]:
print('Drop duplicated texts')
# Keep one row per distinct text. Which row survives is arbitrary, and an
# uncached plan is re-evaluated on every action. The sampling loops further
# down run many actions on these frames and exclude already-sampled rows by
# tweet_id, so the de-duplicated result is cached to avoid recomputing the
# shuffle each time and to keep the surviving tweet_id stable while cached.
filtered = filtered.drop_duplicates(subset=['text']).cache()
random = random.drop_duplicates(subset=['text']).cache()

Drop duplicated texts


In [8]:
# Keyword flags are every column that is neither an identifier/text/summary
# column nor a 'target_' score column.
non_keyword_columns = ('tweet_id', 'text', 'keyword')
keywords = []
for column in filtered.columns:
    if column in non_keyword_columns or 'target_' in column:
        continue
    keywords.append(column)
keywords.sort()

print('Keywords:\n')
print('\n'.join(keywords))

Keywords:

fired
hired
job
laid_off
position
quit
unemployed
work


In [9]:
# Target score columns are the ones whose name contains 'target_'.
targets = []
for column in filtered.columns:
    if 'target_' in column:
        targets.append(column)
targets.sort()

print('Targets:\n')
print('\n'.join(targets))

Targets:

target_anyone_hiring
target_here_is_a_job_opportunity_you_might_be_interested_in
target_i_am_currently_not_working
target_i_am_searching_for_a_new_position
target_i_got_hired_today
target_i_lost_my_job_today
target_i_recently_started_working_at_my_new_job
target_i_was_fired_earlier_this_week
target_looking_for_a_new_position
target_now_i_am_unemployed


# Create Sample For Labeling

In [32]:
n_sample = 100

# Sampling groups: every (keyword or 'random') x (target or 'random')
# combination except random/random, each contributing up to n_sample tweets.
n_groups = (len(targets) + 1) * (len(keywords) + 1) - 1
n_labels = n_sample * n_groups

print('# sampled tweets per group:', n_sample)
print('# labels:', n_labels)

# sampled tweets per group: 100
# labels: 9800


In [33]:
print('Create Sample for Labeling')
# Four parallel accumulators: position i in each list describes the same tweet.
sampled_ids, sampled_tweets, sampled_keywords, sampled_targets = [], [], [], []

Create Sample for Labeling


In [34]:
for keyword in keywords:

    print(keyword)

    # Random draw from tweets that contain this keyword and were not sampled
    # yet: sample without replacement (fraction 0.1, fixed seed), then keep at
    # most n_sample rows.
    # NOTE(review): this returns fewer than n_sample rows whenever the 10%
    # sample itself is smaller than n_sample — confirm that is acceptable.
    candidates = filtered.where(
        (filtered[keyword] == True) & (~filtered['tweet_id'].isin(sampled_ids)))
    tmp = candidates.sample(False, 0.1, seed=0).limit(n_sample).select(
        'tweet_id', 'text').rdd.map(lambda row: (row[0], row[1])).collect()

    # Unpack the (tweet_id, text) pairs with generator expressions instead of
    # list(zip(*tmp))[0], which raises IndexError when no row was returned.
    sampled_ids.extend(tweet_id for tweet_id, _ in tmp)
    sampled_tweets.extend(text for _, text in tmp)

    # Record how each tweet was selected: by keyword, with no target ranking.
    sampled_keywords.extend([keyword]*len(tmp))
    sampled_targets.extend(['random']*len(tmp))

fired
hired
job
laid_off
position
quit
unemployed
work


In [35]:
for keyword in keywords:

    print(keyword)

    for target in targets:

        print(target)

        # Among not-yet-sampled tweets containing this keyword, take the
        # n_sample rows with the highest value in the `target` score column.
        candidates = filtered.where(
            (filtered[keyword] == True) & (~filtered['tweet_id'].isin(sampled_ids)))
        tmp = candidates.orderBy(desc(target)).limit(n_sample).select(
            'tweet_id', 'text').rdd.map(lambda row: (row[0], row[1])).collect()

        # Unpack the (tweet_id, text) pairs with generator expressions instead
        # of list(zip(*tmp))[0], which raises IndexError when no row was returned.
        sampled_ids.extend(tweet_id for tweet_id, _ in tmp)
        sampled_tweets.extend(text for _, text in tmp)

        # Record the (keyword, target) pair that selected each tweet.
        sampled_keywords.extend([keyword]*len(tmp))
        sampled_targets.extend([target]*len(tmp))

    print()

fired
target_anyone_hiring

hired
target_anyone_hiring

job
target_anyone_hiring

laid_off
target_anyone_hiring

position
target_anyone_hiring

quit
target_anyone_hiring

unemployed
target_anyone_hiring

work
target_anyone_hiring



In [36]:
for target in targets:

    print(target)

    # Among not-yet-sampled tweets of the random set, take the n_sample rows
    # with the highest value in the `target` score column.
    candidates = random.where(~random['tweet_id'].isin(sampled_ids))
    tmp = candidates.orderBy(desc(target)).limit(n_sample).select(
        'tweet_id', 'text').rdd.map(lambda row: (row[0], row[1])).collect()

    # Unpack the (tweet_id, text) pairs with generator expressions instead of
    # list(zip(*tmp))[0], which raises IndexError when no row was returned.
    sampled_ids.extend(tweet_id for tweet_id, _ in tmp)
    sampled_tweets.extend(text for _, text in tmp)

    # Record how each tweet was selected: no keyword filter, ranked by target.
    sampled_keywords.extend(['random']*len(tmp))
    sampled_targets.extend([target]*len(tmp))

target_anyone_hiring
target_here_is_a_job_opportunity_you_might_be_interested_in
target_i_am_currently_not_working
target_i_am_searching_for_a_new_position
target_i_got_hired_today
target_i_lost_my_job_today
target_i_recently_started_working_at_my_new_job
target_i_was_fired_earlier_this_week
target_looking_for_a_new_position
target_now_i_am_unemployed


In [None]:
print('Save')

# zip() silently truncates to the shortest input, so make sure the four
# parallel lists are still aligned before combining them.
assert (len(sampled_ids) == len(sampled_tweets)
        == len(sampled_keywords) == len(sampled_targets)), 'sample lists out of sync'

# Explicit all-string schema: skips type inference, which fails when the
# sample is empty, and keeps column types independent of the sampled values.
labeling_schema = StructType([
    StructField(column, StringType(), True)
    for column in ['tweet_id', 'text', 'keyword', 'target']])

tweets_for_labeling = spark.createDataFrame(
    list(zip(sampled_ids, sampled_tweets, sampled_keywords, sampled_targets)),
    schema=labeling_schema)

# Overwrite any previous export under <path_to_data>/labeling.
tweets_for_labeling.write.mode("overwrite").parquet(os.path.join(path_to_data,'labeling'))