In [1]:
import ssl
import re
import nltk
import warnings
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation/SettingWithCopy warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Replace the default HTTPS context factory with the unverified one, i.e. TLS certificate
# verification is turned off for stdlib HTTPS requests made from this process.
# NOTE(review): presumably a workaround for certificate errors when downloading NLTK corpora —
# confirm, and prefer fixing the local certificate store over disabling verification globally.
ssl._create_default_https_context = ssl._create_unverified_context

# Import custom functions
from modify_df import *
from custom_regex import *
from sentence_processing import *

# Merging Datasets, Data Cleaning

There are currently four different datasets we are using (along with some of their columns):

| Oct 2017  | Nov 2017 - Dec 2017 | Sept 2018 - Feb 2019 | Oct 2019 |
| --- | --- | --- | --- |
| 350K | 390K | 695K | 15K |
| id, date of tweet, text | text, favorited, created | text, location, created | text, language, created at |

We want the date of each tweet and the text content of the tweet itself. We want to focus on tweets that are in English.

In [2]:
# Read the four raw tweet exports
df_oct17 = pd.read_csv('raw_data/oct2017.csv')
df_novdec17 = pd.read_csv('raw_data/novdec17.csv')
df_sept18feb19 = pd.read_csv('raw_data/sept2018feb2019.csv')
df_oct19 = pd.read_csv('raw_data/oct2019.csv')

# Columns that are discarded from each export
oct17_unused = ['id', 'insertdate', 'twitterhandle', 'followers', 'hashtagsearched',
                'tweetid', 'lastcontactdate', 'lasttimelinepull', 'lasttimetweetsanalyzed',
                'numberoftweetsanalysed', 'numberoftweetsabouthash', 'actualtwitterdate']
novdec17_unused = ['favorited', 'favoriteCount', 'replyToSN', 'truncated', 'replyToSID',
                   'id', 'replyToUID', 'statusSource', 'screenName', 'retweetCount',
                   'isRetweet', 'retweeted', 'longitude', 'latitude']
sept18feb19_unused = ['status_id', 'favorite_count', 'retweet_count', 'location',
                      'followers_count', 'friends_count', 'statuses_count', 'category']
oct19_unused = ['Id', 'Lenght', 'Source', 'Favorite_count', 'Retweet_count']

# Oct 2017: drop unused and 'Unnamed*' columns, drop rows with missing values.
# reset_index() keeps the previous index as an 'index' column.
df_oct17 = (
    df_oct17
    .drop(columns = oct17_unused)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna()
    .reset_index()
)

# Nov - Dec 2017: same treatment as Oct 2017
df_novdec17 = (
    df_novdec17
    .drop(columns = novdec17_unused)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna()
    .reset_index()
)

# Sept 2018 - Feb 2019: no 'Unnamed*' filtering is applied to this export
df_sept18feb19 = (
    df_sept18feb19
    .drop(columns = sept18feb19_unused)
    .dropna()
    .reset_index()
)

# Oct 2019: keep only rows tagged as English, then discard the language column
df_oct19 = (
    df_oct19
    .drop(columns = oct19_unused)
    .loc[lambda frame: frame['Lang'] == 'en']
    .drop(columns = ['Lang'])
    .dropna()
    .reset_index()
)

In [3]:
# Run each frame through its dataset-specific helper, passing the name of its date column.
# NOTE(review): the helpers come from the project modules imported at the top; presumably they
# return frames with a common schema — confirm in modify_df.
df_oct17 = get_oct17_data(df_oct17, 'dateoftweet')
df_novdec17 = get_novdec17_data(df_novdec17, 'created')
df_sept18feb19 = get_sept18feb19_data(df_sept18feb19, 'created_at')
df_oct19 = get_oct19_data(df_oct19, 'Created_at')

# Stack all frames, then renumber the rows and discard the old per-frame index
all_frames = [df_oct17, df_novdec17, df_sept18feb19, df_oct19]
data = (
    pd.concat(all_frames)
    .reset_index()
    .drop(columns = ['index'])
)

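We perform the following cleaning operations on the tweet text column (`Tweet`) of the dataframe:
* convert the text to lowercase
* remove duplicates (keeping duplicate tweets could lead to bias)
* remove retweet markers
* remove hyperlinks
* remove emojis
* remove mentions
* remove tweets with a length greater than 280 characters
* clean up whitespaces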

We also ensure there are no missing values in our dataframe at the end.

In [4]:
# Maximum tweet length (in characters) kept in the cleaned dataset
MAX_TWEET_LENGTH = 280


def _map_text(series, func):
    """Apply `func` to every str element of `series`; non-str values pass through unchanged."""
    return series.apply(lambda value: func(value) if isinstance(value, str) else value)


# Converting tweets to lowercase
data['Tweet'] = _map_text(data['Tweet'], str.lower)

# Removing duplicates (first occurrence is kept) and renumbering the rows
data = data.drop_duplicates(subset = 'Tweet', keep = 'first').reset_index(drop = True)

# Delete URLs wherever they occur in the tweet. This has to run before the special-character
# pass below, which would otherwise break a link into word fragments ("https t co ...").
# The previous pattern was anchored to the start of a line, so links inside a tweet survived.
data['Tweet'] = _map_text(data['Tweet'], lambda text: re.sub(r'https?://\S+', '', text))

# Removing 'RT': tweets are already lowercased, so the marker is the standalone token 'rt'.
# Word boundaries keep words that merely contain 'rt' ("party", "support") intact.
data['Tweet'] = _map_text(data['Tweet'], lambda text: re.sub(r'\brt\b\s*', '', text))

# Removing emojis
data['Tweet'] = _map_text(data['Tweet'], de_emojify)

# Remove mentions (raw string so that \w is not treated as a string escape)
data['Tweet'] = np.vectorize(remove_regex)(data['Tweet'], r"@[\w]*")

# Remove special characters (except hashtags and apostrophes), replace with whitespace.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal by default.
data['Tweet'] = data['Tweet'].str.replace(r"[^a-zA-Z#']", " ", regex = True)

# Remove single hashtags with nothing following them
data['Tweet'] = np.vectorize(remove_regex)(data['Tweet'], " # ")

# Drop tweets longer than the 280-character limit (the comparison used to be inverted and
# kept ONLY the over-long tweets). copy() makes `data` an independent frame so that later
# column assignments do not write to a slice.
data['Tweet Length'] = data['Tweet'].apply(len)
data = data.loc[data['Tweet Length'] <= MAX_TWEET_LENGTH].copy()

# Ensure no missing values
data.isna().sum()

Tweet           0
Years           0
Tweet Length    0
dtype: int64

# Lemmatization of Parts of Speech
Lemmatizing a part of speech means that we classify each word as an adjective, adverb, noun, or verb. Each word in the sentence is treated as a token and a tag is given based on the lexical database WordNet
(https://wordnet.princeton.edu/). Tuples of tokens and WordNet tags are then created, and we look for a match. If there is a match present, the word is classified (lemmatized) as one of the parts of speech. One exception exists ('ass'), which has been tweaked using the `get_lemma` function.

In [5]:
# Lemmatize every tweet with the project's get_lemma helper
lemmatized = data['Tweet'].apply(get_lemma)

# Re-attach hashtags and apostrophes to the word that follows/precedes them:
# first drop the space after '#', then the space before an apostrophe
data['Lemmatized'] = (
    lemmatized
    .str.replace('# ', '#')
    .str.replace(" '", "'")
)

In [6]:
# Import stopwords from English
stop = stopwords.words('english')

# Membership is tested once per word of every tweet, so look words up in a set (O(1))
# rather than scanning the list each time. `stop` itself is left as the original list.
stop_lookup = set(stop)

# Keep only the whitespace-separated words that are not stopwords
data['Tweets with no Stopwords'] = data['Lemmatized'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

In [7]:
# Keep only words longer than two characters (words of one or two characters are dropped)
# because very short words will likely not be relevant
data['Short Tweets'] = data['Tweets with no Stopwords'].apply(
    lambda tweet: ' '.join(filter(lambda token: len(token) > 2, tweet.split()))
)

In [11]:
# Distinct values present in the 'Years' column
print(data['Years'].unique())

['2017' '2018' '2019']


In [15]:
# Count of missing values per column
data.isnull().sum()

Tweet                       0
Years                       0
Tweet Length                0
Lemmatized                  0
Tweets with no Stopwords    0
Short Tweets                0
dtype: int64

In [16]:
# Write the cleaned frame to disk without the row index
clean_data_path = r'processed_data/clean_data.csv'
data.to_csv(clean_data_path, index = False)