# Tweet Preprocessing

In [1]:
# DS essentials
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# python support
import re
import string
import datetime
import pickle 
from collections import Counter

# visualizations
from wordcloud import WordCloud
from nltk import FreqDist

# NLTK
import nltk
from nltk.corpus import words, stopwords, wordnet

# spacy
import spacy

from preprocessing_funcs import clean_tweet, get_hashtags, hashtag_dict

In [2]:
text = pd.read_csv('../data_files/tweets.csv.zip')

In [3]:
text.head()

Unnamed: 0,user_id,user_key,created_at,created_str,retweet_count,retweeted,favorite_count,text,tweet_id,source,hashtags,expanded_urls,posted,mentions,retweeted_status_id,in_reply_to_status_id
0,1868981000.0,ryanmaxwell_1,1458672000000.0,2016-03-22 18:31:42,,,,#IslamKills Are you trying to say that there w...,7.12346e+17,,"[""IslamKills""]",[],POSTED,[],,
1,2571870000.0,detroitdailynew,1476133000000.0,2016-10-10 20:57:00,0.0,False,0.0,"Clinton: Trump should’ve apologized more, atta...",7.855849e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",[],"[""http://detne.ws/2e172jF""]",POSTED,[],,
2,1710805000.0,cookncooks,1487767000000.0,2017-02-22 12:43:43,,,,RT @ltapoll: Who was/is the best president of ...,8.343832e+17,,[],[],POSTED,[],,
3,2584153000.0,queenofthewo,1482765000000.0,2016-12-26 15:06:41,,,,RT @jww372: I don't have to guess your religio...,8.134006e+17,,"[""ChristmasAftermath""]",[],POSTED,[],,
4,1768260000.0,mrclydepratt,1501987000000.0,2017-08-06 02:36:24,,,,RT @Shareblue: Pence and his lawyers decided w...,8.940243e+17,,[],[],POSTED,[],,


In [4]:
# Keep only the tweet body. Missing values are blanked *before* the string cast:
# `astype(str)` on its own turns NaN into the literal word "nan", which would then
# be cleaned and lemmatized as if it were real tweet text.
tweets = text[['text']].fillna('').astype(str)

In [5]:
tweets.head()

Unnamed: 0,text
0,#IslamKills Are you trying to say that there w...
1,"Clinton: Trump should’ve apologized more, atta..."
2,RT @ltapoll: Who was/is the best president of ...
3,RT @jww372: I don't have to guess your religio...
4,RT @Shareblue: Pence and his lawyers decided w...


## Tweet cleaning

In [6]:
%%time
tweets['clean'] = tweets['text'].map(clean_tweet)

CPU times: user 6.13 s, sys: 33.4 ms, total: 6.16 s
Wall time: 6.18 s


In [7]:
tweets.head()

Unnamed: 0,text,clean
0,#IslamKills Are you trying to say that there w...,islamkills are you trying to say that there we...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump should ve apologized more attack...
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...


## Hashtag segmenting and acronym parsing

`hashtag_dict` maps hashtags and acronyms that appear very frequently in the tweets, and/or keep surfacing in topic modeling, to their segmented or expanded forms. It is applied here as one large find-and-replace dictionary so that segmenting and parsing happen as part of the cleaning process.

In [8]:
%%time
# Expand known hashtags/acronyms in the cleaned text. With regex=True the keys of
# `hashtag_dict` are applied as regular-expression patterns, not literal strings.
# NOTE(review): the cell overwrites `tweets['clean']` with a transformation of itself,
# so re-running it applies every pattern again to already-expanded text — confirm the
# patterns are safe to apply twice. The output below shows "trump" becoming "trumpp"
# (patched in a later cell); presumably one of the patterns is responsible — TODO fix
# it in `hashtag_dict` itself.
tweets['clean'] = tweets['clean'].replace(hashtag_dict, regex=True)

CPU times: user 6.49 s, sys: 30.4 ms, total: 6.52 s
Wall time: 6.54 s


In [9]:
tweets.head()

Unnamed: 0,text,clean
0,#IslamKills Are you trying to say that there w...,islam kills are you trying to say that there w...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trumpp should ve apologized more attac...
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...


## Remove unwanted words

- lots of German
- stop words
- other weird words

### Uniquely German Words

In [10]:
# Lower-cased copy of every entry in NLTK's `words` corpus (all English words known to NLTK).
english_words = [entry.lower() for entry in nltk.corpus.words.words()]
len(english_words), type(english_words)

(236736, list)

In [11]:
# Lower-cased copy of every entry returned by WordNet's `words()`.
word_net = [entry.lower() for entry in nltk.corpus.wordnet.words()]
len(word_net), type(word_net)

(147306, list)

In [12]:
# Union of both English vocabularies; duplicates collapse in the set.
many_english_words = set(english_words).union(word_net)
len(many_english_words), type(many_english_words)

(323592, set)

In [13]:
from nltk.corpus.europarl_raw import german

# Lower-case every token of the German Europarl sample.
# Note: the name `german` is rebound here from the imported corpus reader to a
# plain list of strings; the import above restores it whenever the cell is re-run.
german = [token.lower() for token in german.words()]
len(german), type(german)

(556226, list)

In [14]:
german_stop_words = stopwords.words("german")
len(german_stop_words), type(german_stop_words)

(232, list)

In [15]:
# Distinct German tokens from the corpus plus NLTK's German stop words.
german_words = set(german).union(german_stop_words)
len(german_words)

29839

In [16]:
# German corpus tokens that do not appear in the combined English vocabulary.
# NOTE(review): this iterates the raw token list `german`, not the `german_words`
# set built in the previous cell — confirm which one was intended.
unique_german = {token.lower() for token in german if token not in many_english_words}
len(unique_german)

28154

### Stop Words

In [17]:
standard_stop_words = stopwords.words("english")
print(standard_stop_words)
len(standard_stop_words), type(standard_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

(179, list)

In [18]:
# Some twitter stopwords (comma-separated text file)

# The raw entries go into `raw_stopwords` rather than `words`, so the `words`
# corpus imported from nltk.corpus at the top of the notebook is not overwritten.
# The encoding is stated explicitly so the result does not depend on the platform default.
with open('../data_files/twitter_stopwords.txt', encoding='utf-8') as f:
    raw_stopwords = f.read().split(',')

# Strip surrounding whitespace/newlines (e.g. a trailing "\n" on the last entry would
# otherwise keep it from ever matching a token) and drop entries that end up empty.
twitter_stopwords = [word.strip().lower() for word in raw_stopwords if word.strip()]

len(twitter_stopwords)

624

In [19]:
# other things that have popped up in preprocessing
other_stops = ['amp', '…', 'll']

In [20]:
# All English-side stop words: NLTK's list, the Twitter list, and the ad-hoc extras.
english_stops = {*standard_stop_words, *twitter_stopwords, *other_stops}

In [21]:
# English stop words plus NLTK's German stop words.
stopwords_eng_germ = english_stops.union(german_stop_words)

In [22]:
len(stopwords_eng_germ)

840

In [23]:
# save stopwords for later
# with open("../data_files/stopwords_eng_germ.txt", "w") as outfile:
#     outfile.write(",".join(str(word) for word in stopwords_eng_germ))

### Big word removal set

In [24]:
# Everything to drop during lemmatization: German-only words and English stop words.
words_to_remove = unique_german | english_stops

In [25]:
len(words_to_remove), type(words_to_remove)

(28767, set)

## spaCy Pipeline

In [26]:
# Load spaCy's small English model with the dependency parser and the tagger
# switched off; every other component of the model's pipeline stays enabled
# (original note: keep ner, entity_linker).
# NOTE(review): the functions below read only each token's text, `is_alpha` and
# `lemma_` — confirm NER is still needed here, and that lemma quality is acceptable
# with the tagger disabled.
disabled_components = ['parser', 'tagger']

nlp = spacy.load('en_core_web_sm', disable=disabled_components)

In [27]:
#nlp.add_pipe(nlp.create_pipe('sentencizer'), n_threads=-1)

In [28]:
#print(nlp.pipe_names)

In [29]:
# as much as it pains me...
def get_me_mike_pence(text):
    """Turn the standalone word "penny" back into "pence".

    Applied to the lemmatized string (presumably because the lemmatizer rewrites
    "pence" as "penny" — confirm against the spaCy output).

    Only whole words are replaced: a plain substring replace would also corrupt
    longer words that merely contain "penny" (e.g. "pennywise" -> "pencewise").

    Parameters
    ----------
    text : str
        Text in which to restore "pence".

    Returns
    -------
    str
        `text` with every whole-word occurrence of "penny" replaced by "pence".
    """
    return re.sub(r"\bpenny\b", "pence", text)

In [30]:
def lemmatize_pipe(doc):
    """Reduce one parsed document to a space-separated string of lemmas.

    A token contributes its lemma only when it is purely alphabetic and its
    surface text is not in the module-level `words_to_remove` set. The joined
    string is then passed through `get_me_mike_pence` before being returned.

    Parameters
    ----------
    doc : iterable of tokens
        Tokens exposing `is_alpha`, `text` and `lemma_` (a spaCy `Doc` in this notebook).

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in doc:
        if not token.is_alpha:
            continue
        if token.text in words_to_remove:
            continue
        kept_lemmas.append(str(token.lemma_))

    # get back mike pence
    return get_me_mike_pence(" ".join(kept_lemmas))

def preprocess_pipe(texts, batch_size=100):
    """Lemmatize a collection of texts with the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    texts : iterable of str
        Texts to process; they are streamed through `nlp.pipe`.
    batch_size : int, default 100
        Number of texts spaCy buffers per batch.

    Returns
    -------
    list of str
        One `lemmatize_pipe` result per input text, in input order.
    """
    # `n_threads` is deliberately not passed: spaCy 2.1+ treats it as a deprecated
    # no-op and `Language.pipe` in spaCy 3 no longer accepts it, so dropping it keeps
    # the results the same while letting the cell run on newer versions.
    return [lemmatize_pipe(doc) for doc in nlp.pipe(texts, batch_size=batch_size)]

In [31]:
%%time
tweets['processed'] = preprocess_pipe(tweets['clean'], batch_size=10000)

CPU times: user 1min 38s, sys: 23 s, total: 2min 1s
Wall time: 2min 1s


In [32]:
tweets.head()

Unnamed: 0,text,clean,processed
0,#IslamKills Are you trying to say that there w...,islam kills are you trying to say that there w...,islam kill try say terrorist attack europe ref...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trumpp should ve apologized more attac...,clinton trumpp apologize attack little
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...,well president past retweet
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...,guess religion christmasaftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...,pence lawyer decide official email public can see


## Word Frequencies

Find the most frequent words that may still need cleaning, especially hashtags and acronyms that should be added to `hashtag_dict` above.

In [33]:
# top words in processed that are NOT in many_english_words
def word_freq(text, num_words=10, freq_thresh=1000, vocabulary=None):
    """Return the most frequent out-of-vocabulary words in a collection of tweets.

    Each tweet is lower-cased and split on whitespace. Words found in
    `vocabulary` are ignored; the remaining words are counted.

    Parameters
    ----------
    text : iterable of str
        The tweets to scan (e.g. a DataFrame column of strings).
    num_words : int, default 10
        How many of the most common words to consider.
    freq_thresh : int, default 1000
        A word is reported only if it occurs strictly more than this many times.
    vocabulary : container of str, optional
        Known (lower-case) words to ignore. Defaults to the module-level
        `many_english_words` set.

    Returns
    -------
    list of (str, int)
        `(word, count)` pairs, most frequent first; at most `num_words` entries.
    """
    if vocabulary is None:
        vocabulary = many_english_words

    # Lower-case *before* the membership test: the vocabulary is lower-case, so
    # testing the raw word would count "The" as unknown even though "the" is known.
    counts = Counter(
        word
        for tweet in text
        for word in tweet.lower().split()
        if word not in vocabulary
    )

    return [
        (word, count)
        for word, count in counts.most_common(num_words)
        if count > freq_thresh
    ]

In [45]:
word_freq(tweets['clean'], num_words=10, freq_thresh=500)

[('obama', 9788),
 ('has', 5687),
 ('tcot', 3945),
 ('says', 3105),
 ('pjnet', 2858),
 ('potus', 2120),
 ('merkel', 2092),
 ('women', 2058),
 ('rt', 2051),
 ('ll', 1913)]

In [44]:
word_freq(tweets['processed'], num_words=10, freq_thresh=500)

[('obama', 9788),
 ('tcot', 3945),
 ('pjnet', 2858),
 ('potus', 2120),
 ('merkel', 2092),
 ('gt', 1657),
 ('ccot', 1608),
 ('cnn', 1547),
 ('americans', 1517),
 ('dnc', 1453)]

## Pickle clean tweets for later use!

In [36]:
tweets.head(3)

Unnamed: 0,text,clean,processed
0,#IslamKills Are you trying to say that there w...,islam kills are you trying to say that there w...,islam kill try say terrorist attack europe ref...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trumpp should ve apologized more attac...,clinton trumpp apologize attack little
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...,well president past retweet


In [43]:
# Patch two artefacts in both text columns: first collapse "trumpp" back to "trump",
# then strip the "\ufeff1" sequence (byte-order mark followed by "1").
for column in ('processed', 'clean'):
    without_trumpp = tweets[column].replace({'trumpp':'trump'}, regex=True)
    tweets[column] = without_trumpp.replace({"\ufeff1":''}, regex=True)

In [48]:
tweets = tweets[['clean', 'processed']]

In [49]:
tweets.head()

Unnamed: 0,clean,processed
0,islam kills are you trying to say that there w...,islam kill try say terrorist attack europe ref...
1,clinton trump should ve apologized more attack...,clinton trump apologize attack little
2,who was is the best president of the past year...,well president past retweet
3,i don t have to guess your religion christmasa...,guess religion christmasaftermath
4,pence and his lawyers decided which of his off...,pence lawyer decide official email public can see


In [50]:
# with open('../data_files/processed_tweets.pickle', 'wb') as to_write:
#    pickle.dump(tweets, to_write)