In [31]:
import ssl
import re
import nltk
import warnings
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
# Silence every Python warning for the rest of the session. Note that this also hides
# deprecation/future warnings raised by pandas and other libraries.
warnings.filterwarnings('ignore')
# Swap the default HTTPS context factory for the unverified one, so HTTPS connections created
# through it skip certificate verification.
# NOTE(review): presumably a workaround for certificate errors in nltk.download() — confirm it
# is still needed, since it weakens transport security for the whole kernel.
ssl._create_default_https_context = ssl._create_unverified_context

# Import custom functions
from modify_df import *
from custom_regex import *
from sentence_processing import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


# Merging Datasets, Data Cleaning

There are currently four different datasets we are using (along with some of their columns):

| Oct 2017  | Nov 2017 - Dec 2017 | Sept 2018 - Feb 2019 | Oct 2019 |
| --- | --- | --- | --- |
| 350K | 390K | 695K | 15K |
| id, date of tweet, text | text, favorited, created | text, location, created | text, language, created at |

We want the date of the tweet and the text content of the tweet itself. We want to focus on tweets that are in English.

In [3]:
# Load each raw export and prune it in a single chain: drop the columns we do not need,
# discard rows with missing values, and reset the index (the old index is kept as a column).

# Oct 2017 — also discards any 'Unnamed...' columns
df_oct17 = (
    pd.read_csv('raw_data/oct2017.csv')
    .drop(columns=['id', 'insertdate', 'twitterhandle', 'followers', 'hashtagsearched',
                   'tweetid', 'lastcontactdate', 'lasttimelinepull', 'lasttimetweetsanalyzed',
                   'numberoftweetsanalysed', 'numberoftweetsabouthash', 'actualtwitterdate'])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna()
    .reset_index()
)

# Nov - Dec 2017 — also discards any 'Unnamed...' columns
df_novdec17 = (
    pd.read_csv('raw_data/novdec17.csv')
    .drop(columns=['favorited', 'favoriteCount', 'replyToSN', 'truncated', 'replyToSID',
                   'id', 'replyToUID', 'statusSource', 'screenName', 'retweetCount',
                   'isRetweet', 'retweeted', 'longitude', 'latitude'])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .dropna()
    .reset_index()
)

# Sept 2018 - Feb 2019
df_sept18feb19 = (
    pd.read_csv('raw_data/sept2018feb2019.csv')
    .drop(columns=['status_id', 'favorite_count', 'retweet_count', 'location',
                   'followers_count', 'friends_count', 'statuses_count', 'category'])
    .dropna()
    .reset_index()
)

# Oct 2019 — keep only rows whose 'Lang' is 'en', then drop the language column itself
df_oct19 = (
    pd.read_csv('raw_data/oct2019.csv')
    .drop(columns=['Id', 'Lenght', 'Source', 'Favorite_count', 'Retweet_count'])
    .loc[lambda frame: frame['Lang'] == 'en']
    .drop(columns=['Lang'])
    .dropna()
    .reset_index()
)

In [4]:
# Pass each frame through its dataset-specific helper together with the name of one of its
# columns. NOTE(review): the second argument looks like the timestamp column — confirm against
# the helper definitions.
df_oct17, df_novdec17, df_sept18feb19, df_oct19 = (
    get_oct17_data(df_oct17, 'dateoftweet'),
    get_novdec17_data(df_novdec17, 'created'),
    get_sept18feb19_data(df_sept18feb19, 'created_at'),
    get_oct19_data(df_oct19, 'Created_at'),
)

# Stack all four frames and replace the per-frame indices with one fresh running index
data = (
    pd.concat([df_oct17, df_novdec17, df_sept18feb19, df_oct19])
    .reset_index()
    .drop(columns=['index'])
)

We perform the following cleaning operations on the 'Tweet' column of the dataframe (converting to lowercase, and removing the remaining items listed):
* lowercase
* duplicates (keeping duplicate tweets could lead to bias)
* retweets
* hyperlinks
* emojis
* mentions
* length greater than 280
* whitespaces

We also ensure there are no missing values in our dataframe at the end.

In [5]:
# Number of rows in the combined frame before any cleaning
data.shape[0]

1606028

In [6]:
# Lowercase every tweet that is a string; any other value is passed through untouched
def _lowercase_if_text(value):
    if type(value) == str:
        return value.lower()
    return value

data['Tweet'] = data['Tweet'].apply(_lowercase_if_text)
len(data)

1606028

In [8]:
# Keep only the first occurrence of each tweet text, then renumber the rows from zero
data_nodup = (
    data
    .drop_duplicates(subset='Tweet', keep='first')
    .reset_index()
    .drop(columns=['index'])
)
len(data_nodup)

1048258

In [9]:
# Removing 'RT' (tweets are already lowercase here, so the marker appears as 'rt')
#
# Only a standalone 'rt' token is removed, together with the whitespace after it. A plain
# str.replace('rt', '') would also eat the letters inside ordinary words
# (e.g. 'support' -> 'suppo', 'party' -> 'pay'). The lookbehind keeps hashtags and mentions
# such as '#rt' / '@rt' intact.
#
# URL stripping is deliberately not done in this cell: it belongs to the dedicated
# "Delete URLs" step, and nothing here writes back into the un-deduplicated `data` frame.
data_nodup['Tweet'] = data_nodup['Tweet'].apply(
    lambda x: re.sub(r'(?<![#@\w])rt\b\s*', '', x) if type(x) == str else x
)
len(data_nodup)

1048258

In [10]:
# Run the project helper de_emojify over every string tweet; non-strings are left as they are
def _de_emojify_if_text(value):
    if type(value) == str:
        return de_emojify(value)
    return value

data_nodup['Tweet'] = data_nodup['Tweet'].apply(_de_emojify_if_text)
len(data_nodup)

1048258

In [11]:
# Delete URLs
#
# Match an http(s) URL anywhere in the tweet and remove just the URL itself (everything up to
# the next whitespace). A pattern anchored with '^' would only catch URLs at the very start of
# a line and would then swallow the rest of that line along with it.
data_nodup['Tweet'] = data_nodup['Tweet'].apply(
    lambda x: re.sub(r'https?://\S+', '', x) if type(x) == str else x
)
len(data_nodup)

1048258

In [12]:
# Remove mentions
#
# The pattern is a raw string so that '\w' reaches the regex engine unchanged and does not
# trigger Python's invalid-escape-sequence warning. remove_regex is a project helper applied
# element-wise through np.vectorize with the same pattern for every tweet.
data_nodup['Tweet'] = np.vectorize(remove_regex)(data_nodup['Tweet'], r"@[\w]*")
len(data_nodup)

1048258

In [13]:
# Remove special characters (except hashtags and apostrophes), replace with whitespace
#
# regex=True is passed explicitly: the pattern is a character class, and newer pandas versions
# treat the pattern as a literal string by default, which would silently turn this step into a
# no-op. Note that the class keeps only ASCII letters, '#' and "'", so digits and accented
# letters are replaced by spaces as well.
data_nodup['Tweet'] = data_nodup['Tweet'].str.replace("[^a-zA-Z#']", " ", regex=True)
len(data_nodup)

1048258

In [14]:
# Drop '#' characters that stand alone between two spaces (hashtags with nothing attached)
lone_hashtag = " # "
apply_remove_regex = np.vectorize(remove_regex)
data_nodup['Tweet'] = apply_remove_regex(data_nodup['Tweet'], lone_hashtag)
len(data_nodup)

1048258

In [19]:
# Remove leading and trailing whitespace
#
# The function is applied to the 'Tweet' column itself, so it receives each tweet string.
# Calling .apply on the whole DataFrame would hand the lambda entire columns (Series), the
# str check would never pass, and nothing would be stripped.
data_nodup['Tweet'] = data_nodup['Tweet'].apply(lambda x: x.strip() if type(x) == str else x)
len(data_nodup)

1048258

In [24]:
# Count missing values per column; every count should be zero
data_nodup.isna().sum(axis=0)

Tweet           0
Years           0
Tweet Length    0
dtype: int64

In [25]:
# Preview the first 15 cleaned rows
data_nodup.iloc[:15]

Unnamed: 0,Tweet,Years,Tweet Length
0,cuando esta se ora habla es como leer los...,2018,83
1,will require institutions that receive gra...,2018,123
2,listening to the awesome feminist scholar cynt...,2018,140
3,...,2018,84
4,a ver donde est n todas las voceras colomb...,2018,121
5,we cant romanticize the same things we rally...,2018,120
6,db is a new initiative by a group of german...,2018,121
7,very proud to become a white ribbon uk champ...,2018,124
8,#metoo movement lawmaker investigated for sexu...,2018,116
9,#geraldbutts is #justintrudeau amp #liber...,2018,107


# Lemmatization of Parts of Speech
Lemmatizing a part of speech means that we classify each word as an adjective, adverb, noun, or verb. Each word in the sentence is treated as a token, and a tag is given based on the lexical database WordNet
(https://wordnet.princeton.edu/). Tuples of tokens and WordNet tags are then created, and we look for a match. If there is a match present, the word is classified (lemmatized) as one of the parts of speech. One exception exists ('ass'), which has been tweaked using the `get_lemma` function.

In [None]:
# Apply get_lemma function to every tweet (passed directly; no wrapper lambda is needed)
data_nodup['Lemmatized'] = data_nodup['Tweet'].apply(get_lemma)

# The two replacements below are plain literal substitutions, so regex=False is stated
# explicitly; the result then does not depend on the pandas version's default for `regex`.

# Removing spaces after hashtags
data_nodup['Lemmatized'] = data_nodup['Lemmatized'].str.replace('# ', '#', regex=False)

# Removing spaces before apostrophes
data_nodup['Lemmatized'] = data_nodup['Lemmatized'].str.replace(" '", "'", regex=False)

In [None]:
# Row count after lemmatization
data_nodup.shape[0]

In [6]:
# Import stopwords from English
stop = stopwords.words('english')

# Membership is tested once per word across the whole corpus, so look words up in a set
# (constant-time) rather than scanning the list each time. `stop` itself stays a list.
stop_set = set(stop)

# Rebuild each lemmatized tweet from the whitespace-separated words that are not stopwords
data_nodup['Tweets with no Stopwords'] = data_nodup['Lemmatized'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_set)
)

In [7]:
# Keep only words longer than two characters (words of one or two characters are dropped
# because they will likely not be relevant)
def _drop_short_words(text):
    kept = [token for token in text.split() if len(token) > 2]
    return ' '.join(kept)

data_nodup['Short Tweets'] = data_nodup['Tweets with no Stopwords'].apply(_drop_short_words)

In [11]:
# Distinct values present in the 'Years' column
print(data_nodup['Years'].unique())

['2017' '2018' '2019']


In [15]:
# Final check: missing values per column, including the derived text columns
data_nodup.isna().sum(axis=0)

Tweet                       0
Years                       0
Tweet Length                0
Lemmatized                  0
Tweets with no Stopwords    0
Short Tweets                0
dtype: int64

In [16]:
# Write the cleaned frame to CSV, leaving the row index out of the file
output_path = r'processed_data/clean_data.csv'
data_nodup.to_csv(output_path, index=False)