In [16]:
import pandas as pd
import re
import os
import glob

We load all datasets and concat it into one

In [17]:
PATH = "./dataset"
filenames = glob.glob(os.path.join(PATH, "*.csv"))
dataset = pd.concat((pd.read_csv(f) for f in filenames))
dataset.reset_index(drop=True, inplace=True)

dataset.__len__()

2973371

Checking for N/A values for each category

In [18]:
column_names = dataset.columns

for column in column_names:
   print(f"{column} : {dataset[column].isna().sum()}")

external_author_id : 4
author : 0
content : 1
region : 8843
language : 0
publish_date : 0
harvested_date : 0
following : 0
followers : 0
updates : 0
post_type : 1662425
account_type : 363
new_june_2018 : 0
retweet : 0
account_category : 0


I choose to drop the rows that contain N/A for categories `external_author_id`, `content`, `account_type`

N/A values in the `region` category can be set to unknown (an already present class)

Half of the post_type rows are N/A and thus I am going to drop the category completely

In [19]:
dataset.dropna(subset=['external_author_id', 'content', 'account_type'], inplace=True)

dataset.drop(labels=['post_type'], axis=1, inplace=True)

dataset['region'].fillna(value = 'Unknown', inplace=True)
dataset.__len__()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['region'].fillna(value = 'Unknown', inplace=True)


2973003

In [20]:
# printing all unique values in region and langauge categories
print(f"Unique region values : {dataset['region'].unique()}")
print((f"Unique langauge values : {dataset['language'].unique()}"))

Unique region values : ['United States' 'Unknown' 'United Arab Emirates' 'Azerbaijan'
 'Russian Federation' 'Belarus' 'Japan' 'Samoa' 'Ukraine' 'Iraq' 'Israel'
 'India' 'Germany' 'Italy' 'France' 'United Kingdom' 'Spain' 'Egypt'
 'Iran, Islamic Republic of' 'Afghanistan' 'Saudi Arabia' 'Mexico'
 'Canada' 'Malaysia' 'Sweden' 'Denmark' 'Switzerland' 'Greece'
 'Czech Republic' 'Finland' 'Latvia' 'Estonia' 'Turkey' 'Serbia'
 'Hong Kong' 'Austria']
Unique langauge values : ['English' 'Italian' 'Lithuanian' 'Norwegian' 'French' 'Spanish'
 'Romanian' 'Icelandic' 'German' 'Estonian' 'Catalan' 'Somali' 'Arabic'
 'Hungarian' 'Vietnamese' 'Portuguese' 'Latvian' 'Tagalog (Filipino)'
 'Swedish' 'Dutch' 'Uzbek' 'Farsi (Persian)' 'Kurdish' 'Croatian'
 'Japanese' 'Albanian' 'Pushto' 'Czech' 'Finnish' 'Danish' 'Slovak'
 'Korean' 'Malay' 'Polish' 'LANGUAGE UNDEFINED' 'Russian' 'Ukrainian'
 'Macedonian' 'Bulgarian' 'Serbian' 'Turkish' 'Indonesian' 'Hebrew'
 'Slovenian' 'Hindi' 'Urdu' 'Greek' 'Thai' 'Guja

There are tweets in many languages. I want to only focus on English tweets on this project. I will drop rest of the rows

In [21]:
dataset = dataset[dataset['language'] == 'English']
dataset.__len__()

2128608

Some more data exploration to understand the dataset

In [22]:
# Unique account_categories
print(f"Unique classes : \n{dataset['account_category'].unique()} \n")

# Unique account_type
print(f"Unique account types : \n{dataset['account_type'].unique()} \n")

# Total authors
print(f"total authors : {dataset['author'].unique().__len__()}")

Unique classes : 
['NewsFeed' 'LeftTroll' 'RightTroll' 'HashtagGamer' 'Fearmonger'
 'NonEnglish' 'Unknown' 'Commercial'] 

Unique account types : 
['local' 'left' 'Right' 'Hashtager' 'Koch' 'Russian' '?' 'news' 'right'
 'German' 'Commercial' 'Italian' 'Arabic' 'French' 'Ebola ' 'Spanish'
 'Portuguese' 'ZAPOROSHIA'] 

total authors : 2161


In [23]:
url_pattern = re.compile(r"(?:(?:https?://)?pic.twitter.com/|https?://t\S+|https?://\S+)")

# Finding all links in the values of 'content' and storing them in a list

twitter_urls = []
picture_urls = []
other_urls = []

for value in dataset['content']:
    urls = url_pattern.findall(value)
    for url in urls:
        if re.match(r"https?://t\S+", url):
            twitter_urls.append(url)
        elif re.match(r"(?:https?://)?pic.twitter.com/", url):
            picture_urls.append(url)
        else:
            other_urls.append(url)
    
    
    
print(f"Total twitter links : {twitter_urls.__len__()}")
print(f"Total picture links : {picture_urls.__len__()}")
print(f"Total other links : {other_urls.__len__()}")

for url in other_urls[:10]:
    print(url)

Total twitter links : 1838999
Total picture links : 41
Total other links : 321
http://false
https://…
https://…
https://youNas
https://…
http://...
http://bit.ly/1ux8KLv
http://https://t.co/Un0Ml9tTpC
https://goo.gl/w0xLa1
https://…


I see there is mutiple types of links. I decided to make a column category 'Link' that has classes 'tweet_link', 'picture_link', 'other_link', 'no_link'

Some links are not present completely and end with '...'. I try my best to classify these links.

In [27]:
# Creating a new column 'Link Type' to store the type of link
dataset['link_type'] = 'None'

# Filling the 'Link Type' column with the type of link
for index, value in dataset['content'].items():
    urls = url_pattern.findall(value)
    for url in urls:
        if re.match(r"https?://t\S+", url):
            dataset.at[index, 'link_type'] = 'tweet_link'
        elif re.match(r"(?:https?://)?pic.twitter.com/", url):
            dataset.at[index, 'link_type'] = 'picture_link'
        elif re.match(r"https?://\S+", url):
            dataset.at[index, 'link_type'] = 'other_link'
            
    if len(urls) == 0:
            dataset.at[index, 'link_type'] = 'no_link'

In [28]:
dataset.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,account_type,new_june_2018,retweet,account_category,link_type
0,2494112000.0,DAILYSANJOSE,#sports Ex-Stanford player Stanley Wilson II s...,United States,English,6/27/2016 14:39,6/27/2016 14:39,11955,13225,20394,local,1,0,NewsFeed,no_link
1,2494112000.0,DAILYSANJOSE,The Mountain View gets ready to close its door...,United States,English,6/27/2016 14:56,6/27/2016 14:56,11955,13225,20395,local,1,0,NewsFeed,no_link
2,2494112000.0,DAILYSANJOSE,#politics Supreme Court strikes down Texas abo...,United States,English,6/27/2016 14:59,6/27/2016 14:59,11955,13225,20396,local,1,0,NewsFeed,no_link
3,2494112000.0,DAILYSANJOSE,#health Supreme Court strikes down Texas abort...,United States,English,6/27/2016 15:01,6/27/2016 15:01,11955,13225,20397,local,1,0,NewsFeed,no_link
4,2494112000.0,DAILYSANJOSE,#sports Lochte Fails to Qualify for Rio in Sig...,United States,English,6/27/2016 15:24,6/27/2016 15:24,11955,13225,20398,local,1,0,NewsFeed,no_link


We can now clean the links of the regex since they add no value to classification

In [29]:
# Removing url's from the content
for index, value in dataset['content'].items():
    dataset.at[index, 'content'] = url_pattern.sub('', value)
    
for index, value in dataset['content'].items():
    dataset.at[index, 'content'] = re.sub(r"htt\S*...", '', value)

Adding a `has_emoji` column and Removing emoji's from the text

In [30]:
emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002500-\U00002BEF"  # chinese char
                            u"\U00002702-\U000027B0"
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # dingbats
                            u"\u3030"
                            "]+", flags=re.UNICODE)

dataset['has_emoji'] = 0

for index, value in dataset['content'].items():
    if emoji_pattern.search(value):
        dataset.at[index, 'has_emoji'] = 1
    
    dataset.at[index, 'content'] = emoji_pattern.sub('', value)    

In [31]:
dataset.__len__()

2128608

In [32]:
for content in dataset['content'][:10]:
    print(content)

#sports Ex-Stanford player Stanley Wilson II shot while breaking in to home
The Mountain View gets ready to close its doors for good  #SanJose
#politics Supreme Court strikes down Texas abortion clinic regulation
#health Supreme Court strikes down Texas abortion clinic regulation
#sports Lochte Fails to Qualify for Rio in Signature Race
#sports Women's gymnastics: Simone Biles cruises into San Jose's Olympic trials
#SanJose San Francisco Holds Annual Gay Pride Parade
#sports Draymond Green, Klay Thompson, Harrison Barnes all make 2016 Olympic team
#SanJose Warriors Send Three to Olympics
#politics Donald Trump's $10 email: Does he have the magic to refill campaign coffers?


In [33]:
dataset['content'].isna().sum()

0

Saving the dataset as a csv for use

In [34]:
dataset.to_csv('troll_dataset.csv', float_format='%.f', index=False)