# Twitter Style Transfer Parsing

In [2]:
import csv
import json
import re
import os
import ipdb
import string
import nltk
import ipdb
from typing import List
from collections import Counter
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/leviv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Remove links and reply tweets from the corpus

In [3]:
# Ignore all Twitter metadata besides text
def tweet_text_only (filename):
    """
    Load a Twitter JSON export and return only the cleaned tweet text.

    The file is expected to hold a top-level ``tweets`` list whose entries
    each carry a ``full_text`` field. Every tweet is passed through
    ``parse_tweet`` and tweets that come back empty are dropped.

    :arg filename: path of the JSON file to read
    :returns: the surviving tweets joined with newlines (one tweet per line,
              no trailing newline); an empty string when nothing survives
    """
    # Tweet text routinely contains emoji and other non-ASCII characters, so
    # read as UTF-8 explicitly instead of relying on the platform default.
    with open(filename, encoding='utf-8') as f:
        data = json.load(f)

    cleaned_tweets = []
    for tweet in data['tweets']:
        tweet = parse_tweet(tweet['full_text'])
        if tweet:
            cleaned_tweets.append(tweet)

    # Joining once avoids repeated string concatenation and removes the need
    # to slice off a trailing newline afterwards.
    return "\n".join(cleaned_tweets)

# Parse a single tweet
def parse_tweet (tweet):
    """
    Clean the text of a single tweet.

    :arg tweet: raw tweet text
    :returns: an empty string if the tweet is a reply (starts with ``@``);
              otherwise the text with every ``http...`` token removed and
              newlines replaced by single spaces
    """
    # If the tweet is a reply, don't include it.
    # startswith() is safe on an empty string, whereas tweet[0] would raise
    # IndexError.
    if tweet.startswith('@'):
        return ''

    # Remove links
    without_links = re.sub(r"http\S+", "", tweet)

    # Keep every tweet on one line so the output can be stored line-per-tweet
    return without_links.replace("\n", " ")

Write each account's cleaned tweets to its own `<account>_clean.txt` file

In [5]:
# Account names; each one is read from '<name>.json' and written to
# '<name>_clean.txt' (paths are relative to the working directory).
files = ['dril','dalai','elon','trump']

for file in files:
    tweets = tweet_text_only(file + '.json')

    # Output a cleaned version of the data.
    # Write as UTF-8 explicitly: tweets may contain emoji and other non-ASCII
    # characters that a non-UTF-8 platform default encoding cannot encode.
    with open(file + '_clean.txt', 'w', encoding='utf-8') as clean_file:
        clean_file.write(tweets)

In [None]:

def clean_tweets(tweets):
    """
    Cleans tweets for a single account and returns its unique tokens.

    Each tweet is lower-cased, split on whitespace and stripped of ASCII
    punctuation. Tokens are kept only if they are purely alphabetic, are not
    English stopwords and are longer than one character.

    :arg tweets: an iterable of tweet strings, or a single string holding
                 one tweet per line (the format returned by tweet_text_only)
    :returns: set of the unique tokens found across all tweets
    """
    # A bare string would otherwise be iterated character by character, and
    # every single-character "tweet" is discarded by the length filter below,
    # producing an empty vocabulary. Treat it as newline-separated tweets.
    if isinstance(tweets, str):
        tweets = tweets.split("\n")

    # Loop-invariant helpers: build them once rather than once per tweet.
    table = str.maketrans('', '', string.punctuation)
    sw = set(stopwords.words('english'))

    uniqueTokens = set()

    # clean tweets
    for tweet in tweets:
        # remove punctuation from every whitespace-separated word
        tokens = (w.translate(table) for w in tweet.lower().split())

        # filter out short tokens, non-alphabetic words and stopwords
        uniqueTokens.update(
            w for w in tokens
            if len(w) > 1 and w.isalpha() and w not in sw
        )

    return uniqueTokens

In [None]:
# vocabulary for all twitter accounts

def addDocToVocab(account:str, vocab:Counter):
    """
    Reads tweets for given twitter account,
    cleans the tweets, and adds unique tokens
    to the global vocabulary.

    :arg account: path of the account's JSON tweet file (it is passed
                  unchanged to tweet_text_only)
    :arg vocab: counter updated in place; each unique token of this account
                is counted once
    """
    tweets = tweet_text_only(account)

    # tweet_text_only returns ONE newline-joined string. Split it back into
    # individual tweets so clean_tweets iterates over tweets rather than over
    # single characters (which would yield no tokens at all).
    tokens = clean_tweets(tweets.split("\n"))
    vocab.update(tokens)
    
def processAccounts(accounts:List):
    """
    Builds a vocabulary counter from the tweets of several accounts.

    For every account name, the file '<name>.json' is handed to
    addDocToVocab together with one shared Counter.

    :arg accounts: list of account names in the data directory
    :returns: the Counter after every account has been added
    """
    vocab = Counter()

    # Derive each account's JSON file name, then fold it into the counter
    tweetFiles = (f'{name}.json' for name in accounts)
    for path in tweetFiles:
        addDocToVocab(path, vocab)

    return vocab

In [None]:
# Build the shared vocabulary counter across all four accounts
vocab = processAccounts([
    'dril',
    'dalai',
    'elon',
    'trump',
])

In [None]:
# Size of the full vocabulary before filtering
print(len(vocab))

# remove words with a frequency less than 2
corpusVocab = [k for k,c in vocab.items() if c >= 2]

# Size of the vocabulary after filtering
print(len(corpusVocab))

# save the vocab, one token per line.
# UTF-8 is set explicitly because isalpha() also accepts non-ASCII letters,
# which a non-UTF-8 platform default encoding may fail to write.
with open('vocab.txt', 'w', encoding='utf-8') as vocabFile:
    # NOTE: corpusVocab is rebound from a list to the newline-joined string here
    corpusVocab = '\n'.join(corpusVocab)
    vocabFile.write(corpusVocab)
