# You have a programming challenge

We are going to process 5,000 tweets.  For each tweet we will:

1. Tokenize it using a natural language processing toolkit
2. Try to detect the (human) language used in the tweet using another toolkit


## The summary of tradeoffs might be useful

| Metric | **Processes** | **Threads** | **Async** |
|------|------|------|------|
| Use all cores | Yes | No | No |
| GIL interference | No | Yes | No |
| Memory model | Copy on write | Shared | Shared |
| Locking required | No | Frequent | Less |
| Switching | OS, Preemptive | OS, Preemptive | Cooperative |
| Switching cost | High | High | Low |
| Size | 10s | 100s | 1000s |



# Download the tweets from S3

In [None]:
import requests
import shutil

# Location of the tweet sample that the rest of the notebook works on.
url = "https://s3-us-west-2.amazonaws.com/resero2/datasets/ml-foundations/emoji_tweets_5k.csv"

print('Downloading data...')

# Stream the body straight to disk instead of holding it all in memory.
# The ``with`` block releases the HTTP connection even if writing fails.
with requests.get(url, stream=True) as response:
    # Fail loudly on 4xx/5xx rather than saving an error page as the CSV.
    response.raise_for_status()
    # ``response.raw`` bypasses requests' own content decoding; ask urllib3
    # to undo any gzip/deflate encoding so the saved file is plain CSV.
    response.raw.decode_content = True
    with open("emoji_tweets_5k.csv", 'wb') as outfile:
        shutil.copyfileobj(response.raw, outfile)

print('Done.')

In [None]:
import csv
import json

texts = []   # tweet text, one entry per CSV row
emojis = []  # decoded third column of the same row (parallel to ``texts``)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and pass newline="" so the csv module handles line
# endings inside quoted fields itself.
with open("emoji_tweets_5k.csv", encoding="utf-8", newline="") as infile:
    for row in csv.reader(infile):
        # Columns 1 and 2 hold JSON-encoded values; column 0 is not used here.
        text = json.loads(row[1]).strip()
        texts.append(text)
        emojis.append(json.loads(row[2]))

print(f'Text count: {len(texts)}')
print(f'Emojis count: {len(emojis)}')

# Download the NLTK toolkit supporting files

In [None]:
import nltk

# Fetch the 'punkt' resource, the NLTK supporting files this notebook needs.
nltk.download('punkt')
# Smoke test: tokenize one sample sentence and show the resulting tokens.
result = nltk.word_tokenize("The cat jumped over the frog.")
print(result)
    

# Demonstrate the langdetect package

In [None]:
from langdetect import detect
# Smoke test: print the language that langdetect reports for a sample sentence.
print(detect("The cat jumped over the frog."))

# You are given the function "analyze_tweet" which will process a single tweet

In [None]:
def analyze_tweet(tweet):
    """Tokenize a tweet and guess the language it is written in.

    Args:
        tweet: The tweet text to analyze.

    Returns:
        A dict with two keys: ``'tokens'`` (the output of
        ``nltk.word_tokenize``) and ``'lang'`` (the value returned by
        ``detect``, or ``'en'`` when detection fails).
    """
    tokens = nltk.word_tokenize(tweet)
    try:
        lang = detect(tweet)
    except Exception:
        # Best effort: when detection fails, fall back to English.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run
        # can still be interrupted.
        lang = 'en'
    return {
        'tokens': tokens,
        'lang': lang
    }

# Implement each of the following four functions in the way suggested by the function's name

In [None]:
# Implement these using map() operations.
# Each function returns a dict that maps a tweet's text to its analysis result:
# {
#      'tweet text': { 'tokens': [...], 'lang': 'en'},
#        ...
# }

def analyze_tweets_no_concurrency():
    """Exercise stub: analyze the tweets without any concurrency.

    Not implemented yet; returns None until the exercise is completed.
    """
    return None

def analyze_tweets_thread_executor(worker_count):
    """Exercise stub: analyze the tweets using a thread-based executor.

    Args:
        worker_count: Number of workers to use (the timing cell passes 10).

    Not implemented yet; returns None until the exercise is completed.
    """
    return None

def analyze_tweets_process_executor(worker_count):
    """Exercise stub: analyze the tweets using a process-based executor.

    Args:
        worker_count: Number of workers to use (the timing cell passes 10).

    Not implemented yet; returns None until the exercise is completed.
    """
    return None

def analyze_tweets_anything_goes():
    """Exercise stub: analyze the tweets with whatever approach you choose.

    Not implemented yet; returns None until the exercise is completed.
    """
    return None


# Use the following cell to evaluate the runtime of your functions

In [None]:
import random

#---------------------------------------------
# contest time.  Who can do it the fastest?
#---------------------------------------------

#%time results = analyze_tweets_no_concurrency()
#%time results = analyze_tweets_thread_executor(10)
%time results = analyze_tweets_process_executor(10)
#%time results = analyze_tweets_anything_goes()

#---------------------------------------------
# Display a random result
#---------------------------------------------
a_tweet = random.choice(list(results.keys()))
a_result = results[a_tweet]
print(a_tweet)
print(a_result)


## If you need some help, some worked examples are below

In [None]:
import concurrent.futures

def analyze_tweets_no_concurrency():
    """Analyze every tweet in ``texts`` one after another.

    Returns:
        A dict mapping each tweet's text to the result of ``analyze_tweet``
        for that tweet.
    """
    return {tweet: analyze_tweet(tweet) for tweet in texts}

def analyze_tweets_executor(executor):
    """Analyze every tweet in ``texts`` on the given executor.

    Args:
        executor: An executor that provides ``submit`` and can be used as a
            context manager; it is shut down when this function returns.

    Returns:
        A dict mapping each tweet's text to the result of ``analyze_tweet``
        for that tweet.
    """
    with executor:
        # Queue all the work first so the workers can run while we wait.
        pending = {
            tweet: executor.submit(analyze_tweet, tweet) for tweet in texts
        }
        # Then block on each future in turn and collect its result.
        return {tweet: future.result() for tweet, future in pending.items()}

def analyze_tweets_thread_executor(worker_count):
    """Analyze all tweets on a thread pool with ``worker_count`` workers.

    Returns:
        Whatever ``analyze_tweets_executor`` returns for that pool.
    """
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=worker_count)
    return analyze_tweets_executor(pool)

def analyze_tweets_process_executor(worker_count):
    """Analyze all tweets on a process pool with ``worker_count`` workers.

    Returns:
        Whatever ``analyze_tweets_executor`` returns for that pool.
    """
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=worker_count)
    return analyze_tweets_executor(pool)

def analyze_tweets_anything_goes():
    """Open-ended variant left for the reader; no worked example is given.

    Not implemented; returns None.
    """
    return None