# 1.1 Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import string
import matplotlib.pyplot as plt
# Default figure size (width, height in inches) for every matplotlib figure in this notebook.
plt.rc('figure',figsize=(17,13))
import seaborn as sns
sns.set_style('darkgrid')

# Raise the pandas display limits so wide frames and long tweet text are not truncated in cell output.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.options.display.max_colwidth = 999

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings("ignore")
print("Library Setup Complete.")

Library Setup Complete.


# 1.2 Merging Dataframes

Here, we will import two dataframes:
- dataframe consisting of rehydrated tweets and ID, but without misinformation labels 
- dataframe consisting only of ID and misinformation labels

In [2]:
# Rehydrated tweets with their ids; this workbook carries no misinformation labels.
df_withoutlabel = pd.read_excel(io='../Dataset/final_antivax.xlsx')

In [3]:
# Label workbook: one `id` column and one `is_misinfo` column.
labels = pd.read_excel(io='../Dataset/Untitled spreadsheet copy.xlsx')

In [4]:
# Print column names, non-null counts and dtypes of the unlabeled tweets frame.
df_withoutlabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12989 entries, 0 to 12988
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   created_at                  12989 non-null  object 
 1   hashtags                    3565 non-null   object 
 2   favorite_count              12989 non-null  int64  
 3   id                          12989 non-null  int64  
 4   lang                        12989 non-null  object 
 5   possibly_sensitive          5577 non-null   float64
 6   retweet_count               12989 non-null  int64  
 7   source                      12989 non-null  object 
 8   text                        12989 non-null  object 
 9   tweet_url                   12989 non-null  object 
 10  user_created_at             12989 non-null  object 
 11  user_id                     12989 non-null  int64  
 12  user_default_profile_image  12989 non-null  bool   
 13  user_description            118

In [5]:
# Print column names, non-null counts and dtypes of the labels frame.
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15070 entries, 0 to 15069
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   id          15070 non-null  int64
 1   is_misinfo  15070 non-null  int64
dtypes: int64(2)
memory usage: 235.6 KB


In [6]:
# Attach the misinformation label to each tweet. This is an inner join on the
# tweet `id`, so only ids present in both frames end up in `df`.
df = df_withoutlabel.merge(labels, on='id', how='inner')

In [7]:
df.info() # sanity-check the merge: expect the same row count as df_withoutlabel plus the new is_misinfo column

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12989 entries, 0 to 12988
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   created_at                  12989 non-null  object 
 1   hashtags                    3565 non-null   object 
 2   favorite_count              12989 non-null  int64  
 3   id                          12989 non-null  int64  
 4   lang                        12989 non-null  object 
 5   possibly_sensitive          5577 non-null   float64
 6   retweet_count               12989 non-null  int64  
 7   source                      12989 non-null  object 
 8   text                        12989 non-null  object 
 9   tweet_url                   12989 non-null  object 
 10  user_created_at             12989 non-null  object 
 11  user_id                     12989 non-null  int64  
 12  user_default_profile_image  12989 non-null  bool   
 13  user_description            118

# 1.3 Saving new dataframe

In [8]:
# Persist the labeled tweets. The DataFrame index is written as the first
# (unnamed) CSV column because `index` is left at its default.
df.to_csv(path_or_buf='../Dataset/rehydrated_tweets_with_misinfo_label.csv')

# 1.4 Missing Values

In [9]:
# Count the missing values in every column.
df.isnull().sum()

created_at                       0
hashtags                      9424
favorite_count                   0
id                               0
lang                             0
possibly_sensitive            7412
retweet_count                    0
source                           0
text                             0
tweet_url                        0
user_created_at                  0
user_id                          0
user_default_profile_image       0
user_description              1140
user_favourites_count            0
user_followers_count             0
user_friends_count               0
user_listed_count                0
user_location                 3463
user_name                        2
user_screen_name                 0
user_statuses_count              0
user_verified                    0
is_misinfo                       0
dtype: int64

### 1.4.1 Hashtags

In [10]:
# Peek at the first 20 hashtag entries to see what the values look like.
df['hashtags'].iloc[:20]

0                                          NaN
1                               417fightscovid
2                                          NaN
3                                  StandStrong
4                                          NaN
5                                  Idaho covid
6                    VaccinesWork sciencebitch
7                                          NaN
8                                          NaN
9                                          NaN
10                                       covid
11                                         NaN
12                           MGH stopthespread
13                                    lockdown
14                                         NaN
15    Covidvaccine CalltoArms BeGreatVaccinate
16                                         NaN
17                                         NaN
18                                         NaN
19                                         NaN
Name: hashtags, dtype: object

There are many missing values in the 'hashtags' column. We will replace the null values with the placeholder string 'NA'.

In [11]:
# Tweets without hashtags get the placeholder string 'NA' instead of NaN.
df['hashtags'] = df['hashtags'].fillna('NA')

### 1.4.2 Possibly_Sensitive
- `possibly_sensitive` is a field that only surfaces when a tweet contains a link. It does not pertain to the tweet content itself; instead, it indicates that the URL contained in the tweet may point to content or media identified as sensitive.

In [12]:
# Number of tweets with no possibly_sensitive value.
df['possibly_sensitive'].isna().sum()

7412

- Given that there are many missing values and the column itself may not be useful, we will drop it.

In [13]:
# Remove the possibly_sensitive column entirely.
df = df.drop('possibly_sensitive', axis='columns')

In [14]:
# Re-check the schema after filling hashtags and dropping possibly_sensitive.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12989 entries, 0 to 12988
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   created_at                  12989 non-null  object
 1   hashtags                    12989 non-null  object
 2   favorite_count              12989 non-null  int64 
 3   id                          12989 non-null  int64 
 4   lang                        12989 non-null  object
 5   retweet_count               12989 non-null  int64 
 6   source                      12989 non-null  object
 7   text                        12989 non-null  object
 8   tweet_url                   12989 non-null  object
 9   user_created_at             12989 non-null  object
 10  user_id                     12989 non-null  int64 
 11  user_default_profile_image  12989 non-null  bool  
 12  user_description            11849 non-null  object
 13  user_favourites_count       12989 non-null  in

### 1.4.3 User Description

In [15]:
# Labels of the columns that still contain at least one missing value.
# NOTE(review): `null_columns` is not referenced by any cell shown here.
has_missing = df.isna().any(axis=0)
null_columns = df.columns[has_missing]
del has_missing  # keep the notebook namespace identical to the one-line form

In [16]:
# Number of tweets whose author has no profile description.
df['user_description'].isna().sum()

1140

- We will replace null values with NA since some profiles may not have a user description

In [17]:
# Missing profile descriptions get the placeholder string 'NA' instead of NaN.
df['user_description'] = df['user_description'].fillna('NA')

### 1.4.4 User Location

In [18]:
# Peek at the first 30 free-text user locations.
df['user_location'].iloc[:30]

0                     Washington, DC
1                    Springfield, Mo
2                      Maryland, USA
3                                NaN
4                 Your mom's bedroom
5           United States Of America
6        Columbia University, NY, US
7                 Virgo Supercluster
8                                NaN
9                      Nashville, TN
10          United States Of America
11                               NaN
12                           Toronto
13            Wirral, United Kingdom
14                             Idaho
15                  Toronto, Ontario
16                    London,England
17                               NaN
18               South West, England
19                               NaN
20                  none ya business
21                     Greenwood, SC
22    Eno, Shakori & Skaruhreh Land 
23      Niagara-on-the-Lake, Ontario
24                               NaN
25          Poitou-Charentes, France
26                               NaN
2

- We will replace null values with NA since some profiles may not have a user location

In [19]:
# Missing user locations get the placeholder string 'NA' instead of NaN.
df['user_location'] = df['user_location'].fillna('NA')

### 1.4.5 User name

In [20]:
# Show the rows whose user_name is missing.
df.loc[df['user_name'].isna()]

Unnamed: 0,created_at,hashtags,favorite_count,id,lang,retweet_count,source,text,tweet_url,user_created_at,user_id,user_default_profile_image,user_description,user_favourites_count,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name,user_statuses_count,user_verified,is_misinfo
1266,Fri Dec 25 00:13:57 +0000 2020,,10,1342262267484794886,en,1,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",If you’ve ever held my hand you do not need to worry about what’s in the vaccine,https://twitter.com/TravisnBurns/status/1342262267484794886,Tue Apr 17 15:52:15 +0000 2012,556154139,False,finesser professor,3300,279,502,0,,,TravisnBurns,2060,False,0
12290,Fri Jul 09 19:33:39 +0000 2021,,0,1413582137383657474,en,1,"<a href=""https://mobile.twitter.com"" rel=""nofollow"">Twitter Web App</a>",@RecoveringLibe9's account is temporarily unavailable because it violates the Twitter Media Policy. Learn more.,https://twitter.com/RecoveringLibe9/status/1413582137383657474,Tue Jun 22 01:57:24 +0000 2021,1407155663063617537,False,@RecoveringLibe9's account is temporarily unavailable because it violates the Twitter Media Policy. Learn more.,971,102,519,0,,,RecoveringLibe9,2565,False,1


- The text of these two tweets does not seem relevant, so we will remove both entries.

In [21]:
# Discard the rows that have no user_name.
df = df.dropna(subset=['user_name'])

### 1.4.6 Check all fields

In [22]:
# Every column should now report zero missing values.
df.isna().sum()

created_at                    0
hashtags                      0
favorite_count                0
id                            0
lang                          0
retweet_count                 0
source                        0
text                          0
tweet_url                     0
user_created_at               0
user_id                       0
user_default_profile_image    0
user_description              0
user_favourites_count         0
user_followers_count          0
user_friends_count            0
user_listed_count             0
user_location                 0
user_name                     0
user_screen_name              0
user_statuses_count           0
user_verified                 0
is_misinfo                    0
dtype: int64

In [23]:
# Rebuild a contiguous 0..n-1 index after the row removal; the old index is discarded.
df = df.reset_index(drop=True)

# 1.5 Removing Columns

In [24]:
# Drop all tweet-level and author-level metadata; only the tweet text and its
# misinformation label are kept.
df = df.drop(
    labels=[
        # tweet-level metadata
        'created_at', 'hashtags', 'id', 'lang', 'source', 'tweet_url',
        'favorite_count', 'retweet_count',
        # author-level metadata
        'user_created_at', 'user_id', 'user_default_profile_image',
        'user_description', 'user_favourites_count', 'user_followers_count',
        'user_friends_count', 'user_listed_count', 'user_location',
        'user_name', 'user_screen_name', 'user_statuses_count', 'user_verified',
    ],
    axis='columns',
)

In [25]:
df.columns # only 2 columns ('text' and 'is_misinfo') are left after dropping all the unnecessary columns

Index(['text', 'is_misinfo'], dtype='object')

In [26]:
# Show the first five text/label pairs.
df.head(n=5)

Unnamed: 0,text,is_misinfo
0,"My 87 year old grandmother has been quarantining in her Florida house for close to a year. She was so excited to go to @CityDaytona to get her COVID vaccine next week, only to find out @CountyOfVolusia only has 2000 doses spread out over 2 days. For a county of 500,000+. 1/3",0
1,I’m so excited that with this vaccine I can see the day where we get back to the work of thanking and celebrating (in person and face to face) all the amazing work our healthcare heroes do every day. I’m very ready to throw our “Pandemic is over” party! #417fightscovid https://t.co/T7O6ypn8Ax,0
2,So excited that I was able to close out 2020 on a positive note with getting the first dose of the COVID-19 vaccine! https://t.co/Y1mSXVPBMD,0
3,"Well, it happened. My daughter used my grandchildren to try to guilt me in to taking the vaccine for the CCP bioweapon. Satan is disguised in so many forms. Be on your guard...even when it feels like your heart is being ripped out. #StandStrong 🙏🏾",1
4,"I’m getting my COVID-19 vaccine, baby! I’m so fucking excited!",0


# 1.7 Drop Duplicates

In [27]:
# Row count before removing duplicate tweets.
print(df.shape[0])

12987


In [28]:
# Keep one row per distinct tweet text (the last occurrence wins), then rebuild
# a contiguous 0..n-1 index. Without the reset, the removed rows leave gaps in
# the index, and that gapped index is what gets written out when the frame is
# saved with to_csv. This mirrors the index reset done after the earlier dropna.
df = df.drop_duplicates(subset='text', keep='last').reset_index(drop=True)

In [29]:
# Row count after removing duplicate tweets.
print(df.shape[0])

12877


# 1.8 Saving dataframe

In [None]:
# Persist the cleaned text/label frame. The DataFrame index is written as the
# first (unnamed) CSV column because `index` is left at its default.
df.to_csv(path_or_buf='../Dataset/cleaned_dataset.csv')