In [1]:
import os
import sys
import socket
import re
import numpy as np
import string
from timeit import default_timer as timer
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,desc,row_number,col,year,month,dayofmonth,dayofweek,to_timestamp,size,isnan,lower,rand, lit
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [2]:
print('Hostname:', socket.gethostname())
try:
    spark
except NameError:
    if 'samuel' in socket.gethostname().lower():
        print('Create Local SparkSession')
        spark = SparkSession.builder.config(
        "spark.driver.host", "localhost").appName(
        "get-tweets-for-labeling").getOrCreate()
    else:
        print('Create Cluster SparkSession')
        spark = SparkSession.builder.appName(
        "get-tweets-for-labeling").getOrCreate()
spark

Hostname: Samuels-MBP


In [3]:
country_code = "US"
print('Country:', country_code)

# Local
if  'samuel' in socket.gethostname().lower():
    path_to_data = os.path.join('../../data/classification',country_code)
# Cluster
else:
    path_to_data = os.path.join('/user/spf248/twitter/data/classification',country_code)
print('Path to data:',path_to_data)

Country: US
Path to data: ../../data/classification/US


In [4]:
print('Import tweets containing keywords')
filtered = spark.read.parquet(os.path.join(path_to_data,'filtered'))
filtered.cache()

Import tweets containing keywords


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean]

In [5]:
print('Import random tweets')
random = spark.read.parquet(os.path.join(path_to_data,'random'))
random.cache()

Import random tweets


DataFrame[tweet_id: string, text: string, fired: boolean, hired: boolean, job: boolean, laid_off: boolean, position: boolean, quit: boolean, unemployed: boolean, work: boolean, keyword: boolean]

In [6]:
print('Import scores')

schema = StructType([StructField('tweet_id', StringType(), False),
                     StructField('score', FloatType(), False),
                     StructField('target', StringType(), False)])

scores = spark.read.option('header','true').schema(schema).csv(os.path.join(path_to_data,'similarity'))
scores.cache()

Import scores


DataFrame[tweet_id: string, score: float, target: string]

In [7]:
print('Drop Duplicated Scores (Random Sample Could Contain Keywords)')
scores = scores.drop_duplicates(subset=['tweet_id','target'])

Drop Duplicated Scores (Random Sample Could Contain Keywords)


In [8]:
keywords=sorted([keyword for keyword in filtered.columns if keyword not in ['tweet_id','text','keyword']])
print('Keywords:\n')
print('\n'.join(keywords))

Keywords:

fired
hired
job
laid_off
position
quit
unemployed
work


In [9]:
targets=sorted(scores.select("target").distinct().rdd.map(lambda r: r[0]).collect())
print('Targets:\n')
print('\n'.join(targets))

Targets:

I lost my job today


# Merge Tweets With Their Similarity Scores

In [10]:
print('Merge Tweets With Their Similarity Scores:')
for target in targets:
    
    print(target)
    
    filtered=filtered.join(
    scores.filter(scores['target']==target).selectExpr(
    "tweet_id", 
    "score as "+target.replace(' ','_').replace('?','').lower()),on='tweet_id')
    
    random=random.join(
    scores.filter(scores['target']==target).selectExpr(
    "tweet_id", 
    "score as "+target.replace(' ','_').replace('?','').lower()),on='tweet_id')

Merge Tweets With Their Similarity Scores:
I lost my job today


# Create Sample For Labeling

In [11]:
print('Create Sample for Labeling')

schema = StructType([StructField('tweet_id', StringType(), False),
                     StructField('text', StringType(), False),
                     StructField('keyword', StringType(), False),
                     StructField('target', StringType(), False)])

tweets_for_labeling = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
tweets_for_labeling.cache()

Create Sample for Labeling


In [12]:
n_sample = 100
print('# sampled tweets per group:', n_sample)

# sampled tweets per group: 100


In [13]:
for keyword in keywords:
    
    print(keyword)
    
#     # Remove Those Which Have Already Been Sampled
#     filtered = filtered.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  
    
    # Select Tweets Containing Specific Keyword
    tmp = filtered.where(filtered[keyword]==True)
    
    # Take Random Sample of Size n_sample
    tmp = tmp.orderBy(rand(seed=0)).limit(n_sample).select('tweet_id','text')
    
    # Keep Track of Sampling Properties
    tmp = tmp.withColumn("keyword",lit(keyword))
    tmp = tmp.withColumn("target",lit('random'))
    
    tweets_for_labeling = tweets_for_labeling.union(tmp)

fired
hired
job
laid_off
position
quit
unemployed
work


In [14]:
for keyword in keywords:

    print(keyword)
    
    for target in targets:
        
        print(target)
        
#         # Remove Tweets Which Have Already Been Sampled
#         filtered = filtered.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  

        # Select Tweets Containing Specific Keyword
        tmp = filtered.where(filtered[keyword]==True)
    
        # Take n_sample Tweets Most Similar with Target Sentence
        tmp = tmp.sort(col(target.replace(' ','_').replace('?','').lower()).desc()).limit(n_sample).select('tweet_id','text')
        
        # Keep Track of Sampling Properties
        tmp = tmp.withColumn("keyword",lit(keyword))
        tmp = tmp.withColumn("target",lit(target.replace(' ','_').replace('?','').lower()))
    
        tweets_for_labeling = tweets_for_labeling.union(tmp)
        
    print()

fired
I lost my job today

hired
I lost my job today

job
I lost my job today

laid_off
I lost my job today

position
I lost my job today

quit
I lost my job today

unemployed
I lost my job today

work
I lost my job today



In [15]:
for target in targets:
    
    print(target)
    
#     # Remove Tweets Which Have Already Been Sampled
#     random = random.join(tweets_for_labeling.select('tweet_id'),on='tweet_id',how='left_anti')  
    
    # Take n_sample Tweets Most Similar with Target Sentence
    tmp = random.sort(col(target.replace(' ','_').replace('?','').lower()).desc()).limit(n_sample).select('tweet_id','text')
    
    # Keep Track of Sampling Properties
    tmp = tmp.withColumn("keyword",lit('random'))
    tmp = tmp.withColumn("target",lit(target.replace(' ','_').lower()))

    tweets_for_labeling = tweets_for_labeling.union(tmp)

I lost my job today


In [16]:
print('Save')
tweets_for_labeling.write.mode("overwrite").parquet(os.path.join(path_to_data,'labeling-v3'))

Save


Py4JJavaError: An error occurred while calling o551.parquet.
: java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3332)
	at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:124)
	at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:448)
	at java.lang.StringBuilder.append(StringBuilder.java:136)
	at scala.collection.mutable.StringBuilder.append(StringBuilder.scala:210)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:542)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$generateTreeString$1.apply(TreeNode.scala:541)
	at scala.collection.immutable.List.foreach(List.scala:392)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:541)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
	at org.apache.spark.sql.catalyst.trees.TreeNode.generateTreeString(TreeNode.scala:568)
