# Russian Troll Tweets: Data Cleaning

**EDA Ideas** 
- Check whether the rate of tweets accelerates as the election approaches
    - bar graph of the number of tweets per month in this dataset
    - the same graph for a single user (maybe plot 3-5 users together?)
- Repeat the idea above for the use of hashtags
    - what is the rate of their use over time?
    - a racing bar chart would be cool here (see [Lucy's resource](https://observablehq.com/@d3/gallery))

In [40]:
# Basics
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import string
import datetime
import pickle 

# visualizations
from wordcloud import WordCloud
from nltk import FreqDist

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, MWETokenizer
from nltk.util import ngrams
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import words
from nltk.corpus import wordnet

import spacy

from preprocessing_funcs import clean_tweet, get_hashtags

In [None]:
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("words")
# nltk.download("wordnet")
# nltk.download("averaged_perceptron_tagger")

In [2]:
raw_df = pd.read_csv('../data_files/tweets.csv.zip')

In [None]:
raw_df.head()

In [None]:
raw_df.info()

In [None]:
# Derive a calendar-date column from the raw creation timestamps
raw_df['date'] = pd.to_datetime(raw_df['created_str']).dt.date

# First and last day covered by the dataset, and the span between them in years
start_date = min(raw_df['date'])
end_date = max(raw_df['date'])
span = end_date - start_date
time_delta_years = span.days / 365

print(f'Data date range: {start_date} to {end_date}')
print(f'Time period: {time_delta_years:.2f} years')

In [None]:
# Headline counts for the dataset
num_users = raw_df['user_key'].nunique(dropna=False)
num_tweets = len(raw_df)

# Every per-period average below is this ratio spread over the covered time span
tweets_per_user = num_tweets / num_users

print(f'Number of unique users: {num_users}')
print(f'Number of Tweets: {num_tweets}\n')
print(f'Average Tweets per User: {tweets_per_user:.2f}')
print(f'Average Tweet per User per Day: {tweets_per_user / (time_delta_years * 365):.2f}')
print(f'Average Tweet per User per Week: {tweets_per_user / (time_delta_years * 52):.2f}')
print(f'Average Tweet per User per Month: {tweets_per_user / (time_delta_years * 12):.2f}')
print(f'Average Tweet per User per Year: {tweets_per_user / time_delta_years:.2f}')

# Just *CLEAN* Tweets!

In [3]:
df = raw_df[['text']].astype(str)

In [4]:
df.head()

Unnamed: 0,text
0,#IslamKills Are you trying to say that there w...
1,"Clinton: Trump should’ve apologized more, atta..."
2,RT @ltapoll: Who was/is the best president of ...
3,RT @jww372: I don't have to guess your religio...
4,RT @Shareblue: Pence and his lawyers decided w...


In [None]:
#df['hashtags'] = df['text'].map(get_hashtags)

In [5]:
df['clean'] = df['text'].map(clean_tweet)

In [6]:
df.head()

Unnamed: 0,text,clean
0,#IslamKills Are you trying to say that there w...,islamkills are you trying to say that there we...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump should ve apologized more attack...
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...


## Remove non-english words

This didn't work well -- removed too many important words

In [None]:
# A list of all English words known to nltk.
# NOTE(review): this rebinds `words`, which the import cell bound to the
# nltk corpus reader via `from nltk.corpus import words`; after this cell the
# name refers to a plain list instead.
words = list(nltk.corpus.words.words())
len(words), type(words)

In [None]:
word_net = list(nltk.corpus.wordnet.words())
len(word_net), type(word_net)

In [None]:
many_words = set(words + word_net)

In [None]:
len(many_words), type(many_words)

In [None]:
'email' in many_words

In [None]:
def delete_non_english(x):
    """Return ``x`` with every token that is not in ``many_words`` removed.

    The text is split with ``nltk.word_tokenize`` and the surviving tokens
    are joined back together with single spaces.
    """
    kept_tokens = [token for token in nltk.word_tokenize(x) if token in many_words]
    return " ".join(kept_tokens)

In [None]:
# Drop non-English words: keep only the tokens of each cleaned tweet that
# appear in the combined nltk word lists.
df['clean'] = df['clean'].apply(delete_non_english)

# Drop rows that contain missing values.
# NOTE(review): delete_non_english always returns a string (possibly empty),
# so a tweet that lost all of its words is presumably not removed here --
# confirm whether empty strings were meant to be dropped as well.
df.dropna(inplace=True)

In [None]:
df.head()

In [None]:
for tweet in df['clean'][5:15]:
    print(tweet, '\n')

In [None]:
for tweet in df['text'][5:15]:
    print(tweet, '\n')

## Removing stop words

In [8]:
standard_stop_words = stopwords.words("english")
print(standard_stop_words)
type(standard_stop_words), len(standard_stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

(list, 179)

In [9]:
# Load the Twitter-specific stop words, stored as one comma-separated line.
# The result is bound directly to `twitter_stopwords` so the notebook-level
# name `words` is not overwritten as a side effect; the encoding is explicit
# so the read does not depend on the platform default.
with open('../data_files/twitter_stopwords.txt', encoding='utf-8') as f:
    twitter_stopwords = f.read().split(',')

len(twitter_stopwords)

623

In [10]:
other_words = ['amp', '…']

In [11]:
all_the_stops = set(standard_stop_words + twitter_stopwords + other_words)

In [12]:
len(all_the_stops)

616

In [15]:
def remove_stop_words(x):
    """Return ``x`` without the tokens listed in ``all_the_stops``.

    The text is split with ``nltk.word_tokenize``; tokens that survive the
    filter are re-joined with single spaces.
    """
    surviving = [token for token in nltk.word_tokenize(x) if token not in all_the_stops]
    return " ".join(surviving)

In [16]:
# drop stop-words
df['no_stops'] = df['clean'].apply(remove_stop_words)

In [17]:
# Characters treated as punctuation. Written as a raw string so the backslash
# is a literal character: in a normal string literal `\,` is an invalid escape
# sequence, which newer Python versions warn about. The value is unchanged.
punctuations = r'''!’()-![]{};:+'""\,<>./?@#$%^&*_~'''


def no_punc(x):
    """Return ``x`` without its punctuation tokens.

    The text is split with ``nltk.word_tokenize``. A token is dropped when it
    occurs as a substring of ``punctuations`` (``in`` on a ``str`` is a
    substring test), so this removes single punctuation characters as well as
    any multi-character token that appears verbatim in that string. The
    remaining tokens are joined with single spaces.
    """
    return " ".join(w for w in nltk.word_tokenize(x) if w not in punctuations)

In [18]:
df['no_stops'] = df['no_stops'].apply(no_punc)

In [19]:
df.head()

Unnamed: 0,text,clean,no_stops
0,#IslamKills Are you trying to say that there w...,islamkills are you trying to say that there we...,islamkills trying say terrorist attacks europe...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump should ve apologized more attack...,clinton trump apologized attacked less
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...,best president past retweet
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...,guess religion christmasaftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...,pence lawyers decided official emails public c...


## spaCy

In [20]:
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])

In [21]:
def spacy_lemmatize(x):
    """Run ``x`` through the ``nlp`` pipeline and return its lemmas, space-joined."""
    lemmas = (token.lemma_ for token in nlp(x))
    return " ".join(lemmas)

In [26]:
# Lemmatize every stop-word-free tweet. nlp.pipe processes the texts in
# batches instead of invoking the pipeline once per row, and yields the
# documents in input order, so the resulting list lines up with the rows of df.
df["lem"] = [
    " ".join(token.lemma_ for token in doc)
    for doc in nlp.pipe(df["no_stops"], batch_size=1000)
]

In [27]:
df.head()

Unnamed: 0,text,clean,no_stops,lem
0,#IslamKills Are you trying to say that there w...,islamkills are you trying to say that there we...,islamkills trying say terrorist attacks europe...,islamkill try say terrorist attack europe refu...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump should ve apologized more attack...,clinton trump apologized attacked less,clinton trump apologize attack less
2,RT @ltapoll: Who was/is the best president of ...,who was is the best president of the past year...,best president past retweet,good president past retweet
3,RT @jww372: I don't have to guess your religio...,i don t have to guess your religion christmasa...,guess religion christmasaftermath,guess religion christmasaftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence and his lawyers decided which of his off...,pence lawyers decided official emails public c...,pence lawyer decide official email public coul...


In [28]:
for i in range(0, 5):
    print(df.text[i])
    print(df.lem[i], '\n')

#IslamKills Are you trying to say that there were no terrorist attacks in Europe before refugees were let in?
islamkill try say terrorist attack europe refugee let 

Clinton: Trump should’ve apologized more, attacked less https://t.co/eJampkoHFZ
clinton trump apologize attack less 

RT @ltapoll: Who was/is the best president of the past 25 years? (Vote &amp; Retweet)
good president past retweet 

RT @jww372: I don't have to guess your religion! #ChristmasAftermath
guess religion christmasaftermath 

RT @Shareblue: Pence and his lawyers decided which of his official emails the public could see

https://t.co/HjhPguBK1Y by @alisonrose711
pence lawyer decide official email public could see 



In [29]:
# Flatten the lemmatized tweets into one list of lower-cased tokens. A
# comprehension avoids copying the column into a throwaway list and does not
# overwrite the notebook-level name `words` the way the explicit loop did.
all_words = [token.lower() for tweet in df['lem'] for token in tweet.split()]

# Ten most frequent tokens across the whole corpus
Counter(all_words).most_common(10)

[('trump', 30886),
 ('clinton', 13015),
 ('hillary', 11780),
 ('obama', 9790),
 ('get', 8862),
 ('say', 8193),
 ('people', 6845),
 ('go', 6260),
 ('make', 6113),
 ('like', 5796)]

## Vectorization

In [31]:
# Bag-of-words counts: one row per tweet, one column per vocabulary term.
cv = CountVectorizer()
doc_word = cv.fit_transform(df["lem"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# NOTE(review): toarray() materializes the full dense document-term matrix,
# which is very large for this corpus; keep doc_word sparse for modelling.
vect = pd.DataFrame(doc_word.toarray(), columns=feature_names)
vect

Unnamed: 0,aa,aaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaannnnnnnnnnnnnnnnnnnnnnnnnnnnnnndddddddddddddddddddddddddddddddddd,aaaaaaaaaaaaaaaaaaaaannnnnnnnnnnnnnnnnnnnddddddddddddddddddddddddddddddd,aaaaaaaaannnnnnnnnnnddddddddddddd,aaaaaaaaassssss,aaaaaand,aaaaaayyyuuuummmmmm,aaaaand,aaaand,...,zynalturist,zynischen,zyoritv,zzcrane,zzion,zzjwmc,zzzs,zzzzzz,zzzzzzz,zzzzzzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203477,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
vect.columns[-20:]

Index(['zweiten', 'zweiter', 'zweitwichtigste', 'zwilling', 'zwillingen',
       'zwingt', 'zwischen', 'zwischendurch', 'zwischenstationen', 'zyka',
       'zynalturist', 'zynischen', 'zyoritv', 'zzcrane', 'zzion', 'zzjwmc',
       'zzzs', 'zzzzzz', 'zzzzzzz', 'zzzzzzzzzzzzzzzzz'],
      dtype='object')

In [34]:
# previously: 94809 columns
# spacy lem: 89679
94809-86277

8532

In [37]:
tweets = df[['text', 'lem']]
tweets.head()

Unnamed: 0,text,lem
0,#IslamKills Are you trying to say that there w...,islamkill try say terrorist attack europe refu...
1,"Clinton: Trump should’ve apologized more, atta...",clinton trump apologize attack less
2,RT @ltapoll: Who was/is the best president of ...,good president past retweet
3,RT @jww372: I don't have to guess your religio...,guess religion christmasaftermath
4,RT @Shareblue: Pence and his lawyers decided w...,pence lawyer decide official email public coul...


In [41]:
# Persist the raw text together with its lemmatized form
with open('../data_files/tweets.pickle', 'wb') as pickle_file:
    pickle.dump(tweets, pickle_file)

## Word Clouds

### Most frequent tweeters

In [None]:
# Number of tweets per account. `df` was built from the text column only, so
# it has no `user_key` column; the account names are read from `raw_df`.
fdist = FreqDist(raw_df['user_key'])

# Word cloud in which each account name is sized by its tweet count
wc = WordCloud(width=600, height=400, max_words=50).generate_from_frequencies(fdist)
plt.figure(figsize=(12, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

### Most used hashtags in dataset

In [None]:
# Extract the hashtags straight from the tweet text: the cell that would have
# created a `hashtags` column on df is commented out.
# NOTE(review): assumes get_hashtags returns an iterable of hashtag strings
# for each tweet (the previous code joined its result with " ".join) -- confirm.
hashtags_per_tweet = df['text'].map(get_hashtags)

# Count every hashtag on its own. Joining a tweet's hashtags into one string
# first would count hashtag combinations (and the empty string for tweets
# without any hashtag) instead of individual hashtags.
fdist = FreqDist(tag for tags in hashtags_per_tweet for tag in tags)

wc = WordCloud(width=800, height=600, max_words=150).generate_from_frequencies(fdist)
plt.figure(figsize=(12, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

Merkelmussbleiben = 'Merkel must stay'

## NLP Pipeline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

In [None]:
class NLPPipe:
    """Container for the components of a text-vectorization pipeline.

    Only the cleaning step is implemented; ``fit`` and ``transform`` are
    placeholders that currently do nothing.
    """

    def __init__(self, vectorizer=None, cleaning_function=None, tokenizer=None, stemmer=None):
        '''
        Create a pipeline that vectorizes an arbitrary list of documents.

        Parameters
        ----------
        vectorizer : object, optional
            Vectorizer to store on the pipeline. When omitted, a new
            ``CountVectorizer()`` is created for this instance.
        cleaning_function : callable, optional
            Function applied to each document by ``clean_text``.
        tokenizer : callable, optional
            Stored on the instance; not used by any method yet.
        stemmer : object, optional
            Stored on the instance; not used by any method yet.
        '''
        # A CountVectorizer() in the signature would be evaluated once, when
        # the class is defined, and then shared by every instance that relies
        # on the default. Building it here gives each pipeline its own.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = tokenizer
        self.stemmer = stemmer

    def clean_text(self, text):
        """Return ``text`` cast to strings and passed through the cleaning function.

        Parameters
        ----------
        text : pandas.Series
            Documents to clean (anything providing ``astype`` and ``map``).

        Returns
        -------
        pandas.Series
            The string-cast documents, mapped through ``cleaning_function``
            when one was supplied and returned unchanged otherwise.
        """
        text = text.astype(str)
        if self.cleaning_function is None:
            # Nothing to apply; mapping ``None`` over the series would fail.
            return text
        return text.map(self.cleaning_function)

    def fit(self, text):
        """Placeholder: not implemented yet, does nothing and returns ``None``."""

    def transform(self, text):
        """Placeholder: not implemented yet, does nothing and returns ``None``."""

In [None]:
# Build the pipeline from a count vectorizer, the project's tweet cleaner,
# a Treebank tokenizer and a Porter stemmer.
# NOTE(review): this rebinds `nlp`, which the spaCy section assigned to the
# loaded spaCy pipeline; cells that call nlp(...) on text (for example the
# lemmatization helper) will fail if they are re-run after this cell.
nlp = NLPPipe(vectorizer=CountVectorizer(), 
              cleaning_function=clean_tweet, 
              tokenizer=TreebankWordTokenizer().tokenize, 
              stemmer=PorterStemmer())

In [None]:
df2 = raw_df[['text']]

tweets = df2['text']

df2.head()

In [None]:
clean_tweets = nlp.clean_text(tweets)

In [None]:
df2.head()

In [None]:
for tweet in clean_tweets[0:5]:
    print(tweet, '\n')

In [None]:
for tweet in raw_df.text[0:5]:
    print(tweet, '\n')

In [None]:
# df.to_csv('data_files/clean_tweets.zip', index=False)