# 1.1 Import Libraries

In [None]:
# --- Standard library ---
import re
import string
import warnings

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Plot defaults: 17x13 figures drawn with seaborn's 'darkgrid' style.
plt.rcParams['figure.figsize'] = (17, 13)
sns.set_style('darkgrid')

# Raise pandas' display limits so wide/long frames and long text cells
# are shown without truncation (up to 999 rows / columns / characters).
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_colwidth', 999)

# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

print("Library Setup Complete.")

# 1.2 Merging Dataframes

Here, we will import two dataframes:
- dataframe consisting of rehydrated tweets and ID, but without misinformation labels 
- dataframe consisting only of ID and misinformation labels

In [None]:
# Load the rehydrated tweets (with their ids) that carry no misinformation labels yet.
df_withoutlabel = pd.read_excel(
    io='../Dataset/final_antivax.xlsx',
)

In [None]:
# Load the spreadsheet holding only ids and their misinformation labels.
labels = pd.read_excel(
    io='../Dataset/Untitled spreadsheet copy.xlsx',
)

In [None]:
# Summarise the unlabelled tweet frame: column names, non-null counts and dtypes.
df_withoutlabel.info()

In [None]:
# Summarise the labels frame: column names, non-null counts and dtypes.
labels.info()

In [None]:
# Join tweets and labels on the shared 'id' column to build the complete frame.
# An inner join (the pandas default) keeps only ids present in both inputs.
df = df_withoutlabel.merge(labels, how='inner', on='id')

In [None]:
# Inspect the merged frame's columns, non-null counts and dtypes to check the merge result.
df.info() #merge successful!

# 1.3 Saving new dataframe

In [None]:
# Persist the merged (tweets + labels) frame.
# The row index is written as an unnamed leading column (to_csv default).
# NOTE(review): pass index=False instead if downstream readers do not expect that column.
df.to_csv(
    '../Dataset/rehydrated_tweets_with_misinfo_label.csv',
    index=True,
)

# 1.4 Missing Values

In [None]:
# Count the missing values in every column.
df.isnull().sum()

### 1.4.1 Hashtags

In [None]:
# Preview the first 20 entries to see what kind of hashtags are present.
df.loc[:, 'hashtags'].head(n=20)

There are many missing values in the 'hashtags' column. We will replace the null values with the string 'NA'.

In [None]:
# Missing hashtag entries become the literal string 'NA'.
df['hashtags'] = df['hashtags'].replace(to_replace=np.nan, value='NA')

### 1.4.2 Possibly_Sensitive
- `possibly_sensitive` is a field that only surfaces when a tweet contains a link. It does not pertain to the tweet content itself; instead, it indicates that the URL contained in the tweet may point to content or media identified as sensitive.

In [None]:
# Number of rows where 'possibly_sensitive' is missing.
df['possibly_sensitive'].isna().sum()

- Given that there are many missing values and the column itself may not be useful, we will drop it.

In [None]:
# Remove the 'possibly_sensitive' column from the frame.
df = df.drop(labels=['possibly_sensitive'], axis='columns')

In [None]:
# Re-inspect columns, non-null counts and dtypes after dropping 'possibly_sensitive'.
df.info()

### 1.4.3 User Description

In [None]:
# Names of the columns that still contain at least one missing value.
has_missing = df.isna().any()
null_columns = df.columns[has_missing]

In [None]:
# Number of rows where 'user_description' is missing.
df['user_description'].isna().sum()

- We will replace null values with NA since some profiles may not have a user description

In [None]:
# Missing user descriptions become the literal string 'NA'.
df['user_description'] = df['user_description'].replace(to_replace=np.nan, value='NA')

### 1.4.4 User Location

In [None]:
# Preview the first 30 user locations.
df.loc[:, 'user_location'].head(n=30)

- We will replace null values with NA since some profiles may not have a user location.

In [None]:
# Missing user locations become the literal string 'NA'.
df['user_location'] = df['user_location'].replace(to_replace=np.nan, value='NA')

### 1.4.5 User name

In [None]:
# Show the rows whose 'user_name' is missing.
df.loc[df['user_name'].isna()]

- The text of these two entries does not seem relevant, so we will remove them.

In [None]:
# Discard every row that has no 'user_name'.
df = df.dropna(subset=["user_name"])

### 1.4.6 Check all fields

In [None]:
# Re-count missing values per column after the clean-up steps.
df.isna().sum()

In [None]:
# Rebuild a contiguous 0..n-1 index and discard the old row labels.
df = df.reset_index(drop=True)

# 1.5 Removing Columns

In [None]:
# Drop the columns that are not needed downstream.
df = df.drop(
    columns=[
        # columns without the user_ prefix
        'hashtags', 'id', 'lang', 'tweet_url',
        'source', 'created_at',
        'favorite_count', 'retweet_count',
        # user_* columns
        'user_created_at', 'user_id', 'user_default_profile_image',
        'user_description', 'user_location',
        'user_name', 'user_screen_name', 'user_verified',
        'user_favourites_count', 'user_listed_count', 'user_statuses_count',
        'user_followers_count', 'user_friends_count',
    ]
)

In [None]:
# List the columns that remain after the drop.
df.columns #left 8 columns after dropping all the unnecessary columns

In [None]:
# Preview the first five rows of the trimmed frame.
df.head()

# 1.7 Drop Duplicates

In [None]:
# Row count before de-duplication.
print(df.shape[0])

In [None]:
# Keep only the last occurrence of each distinct tweet text, then rebuild a
# contiguous 0..n-1 index so the removed rows leave no gaps in the row labels
# (the same reset is applied after the earlier dropna on 'user_name').
df = df.drop_duplicates(subset='text', keep='last').reset_index(drop=True)

In [None]:
# Row count after de-duplication.
print(df.shape[0])

# 1.8 Saving dataframe

In [None]:
# Persist the cleaned frame.
# The row index is written as an unnamed leading column (to_csv default).
# NOTE(review): pass index=False instead if downstream readers do not expect that column.
df.to_csv(
    '../Dataset/cleaned_dataset.csv',
    index=True,
)