In [86]:
import pandas as pd
import string
import re

df = pd.read_csv('data/fakeNews.csv')

print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)


#drop unwanted columns
df = df.drop(["Date Posted", "Link", "Region","Country","Explanation","Origin","Origin_URL","Fact_checked_by","Poynter_Label"], axis=1)
print("Rows and Columns after cleaning:", df.shape)

# Create translation table once
killpunctuation = str.maketrans('', '', string.punctuation + "‘’“”")

#clean text
def clean_text(text):
    text = text.lower() #convert to lower case
    text = re.sub(r"(@\S+|https?\S+|#\S+)", "", text) # remove @s, hashtags, URLS
    text = text.translate(killpunctuation)   # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
   
# Apply cleaning to 'text' column
df["Text"] = df["Text"].apply(clean_text)

#rename columns
df = df.rename(columns={'Text': 'text', 'Binary Label': 'fake'})

df["fake"] = 1 # change classifcation of fake to 1

df.drop_duplicates(inplace = True)

#save cleaned data
df.to_csv('data/fakeNews_CLEAN.csv', index=False)

#check duplicates
print("Duplicate? after cleaning:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)






Rows and Columns:  (3795, 11)
Duplicate?: True
Any missing values? False
Rows and Columns after cleaning: (3795, 2)
Duplicate? after cleaning: False
Any missing values? False


In [87]:

df = pd.read_csv('data/trueNews.csv')

print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)


#drop unwanted columns
df = df.drop(["Date Posted","Link","Region","Username","Publisher"], axis=1)
print("Rows and Columns after cleaning:", df.shape)

# Create translation table once
killpunctuation = str.maketrans('', '', string.punctuation + "‘’“”")

#clean text
def clean_text(text):
    text = text.lower() #convert to lower case
    text = re.sub(r"(@\S+|https?\S+|#\S+)", "", text) # remove @s, hashtags, URLS
    text = re.sub(r"\b(pictwitter\S*|pic\S*)\b", "", text) #remove pic twitter 
    text = text.translate(killpunctuation)   # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
   
# Apply cleaning to 'text' column
df["Text"] = df["Text"].apply(clean_text)

#rename columns
df = df.rename(columns={'Text': 'text', 'Label': 'fake'})

df["fake"] = 0 # change classifcation of fake to 1, real = 0

#save cleaned data
df.to_csv('data/trueNews_CLEAN.csv', index=False)


    

Rows and Columns:  (3793, 7)
Duplicate?: False
Any missing values? False
Date Posted    object
Link           object
Text           object
Region         object
Username       object
Publisher      object
Label           int64
dtype: object
Rows and Columns after cleaning: (3793, 2)


In [91]:
import string
import re

df = pd.read_csv('data/usable/covid19_Constraint_Val.csv')

print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)

# Create translation table once
killpunctuation = str.maketrans('', '', string.punctuation + "‘’“”")


def remove_emoji(txt): #remove emoji function from: https://appdividend.com/remove-emoji-from-the-text-in-python/
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001F900-\U0001F9FF"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', txt)

def clean_tweet(text):
    text = text.lower() #convert to lower case
    text = re.sub(r"(@\S+|https?\S+|#\S+)", "", text) # remove @s, hashtags, URLS
    text = text.translate(killpunctuation)   # remove punctuation
    text = remove_emoji(text)  # remove emojis
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
   
# Apply cleaning to 'tweet' column
df["tweet"] = df["tweet"].apply(clean_tweet)

df = df.rename(columns={'tweet': 'text'})


#convert label to binary
df["fake"] = df["label"].map({"fake": 1, "real": 0})
df = df.drop("label", axis=1)

df = df.drop("id", axis=1) #drop id column




#save cleaned data
df.to_csv('data/usable/covid19_Constraint_Val_CLEAN.csv', index=False)


print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)




Rows and Columns:  (2140, 3)
Duplicate?: False
Any missing values? False
id        int64
tweet    object
label    object
dtype: object
Rows and Columns:  (2140, 2)
Duplicate?: True
Any missing values? False
text    object
fake     int64
dtype: object


In [89]:
df = pd.read_csv('data/usable/covid19_english_test_with_labels.csv')

print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)


# Create translation table once
killpunctuation = str.maketrans('', '', string.punctuation + "‘’“”")


def remove_emoji(txt): #remove emoji function from: https://appdividend.com/remove-emoji-from-the-text-in-python/
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', txt)

def clean_tweet(text):
    text = text.lower() #convert to lower case
    text = re.sub(r"(@\S+|https?\S+|#\S+)", "", text) # remove @s, hashtags, URLS
    text = text.translate(killpunctuation)   # remove punctuation
    text = remove_emoji(text)  # remove emojis
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
   
# Apply cleaning to 'tweet' column
df["tweet"] = df["tweet"].apply(clean_tweet)
df = df.rename(columns={'tweet': 'text'})


#convert label to binary
df["fake"] = df["label"].map({"fake": 1, "real": 0})
df = df.drop("label", axis=1)

df = df.drop("id", axis=1) #drop id column



#save cleaned data
df.to_csv('data/usable/covid19_english_test_with_labels_CLEAN.csv', index=False)


print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)

Rows and Columns:  (2140, 3)
Duplicate?: False
Any missing values? False
id        int64
tweet    object
label    object
dtype: object
Rows and Columns:  (2140, 2)
Duplicate?: True
Any missing values? False
text    object
fake     int64
dtype: object


In [90]:
df = pd.read_csv('data/usable/covid19labeldataset.csv')

print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)


# Create translation table once
killpunctuation = str.maketrans('', '', string.punctuation + "‘’“”")


def remove_emoji(txt): #remove emoji function from: https://appdividend.com/remove-emoji-from-the-text-in-python/
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', txt)

def clean_tweet(text):
    text = text.lower() #convert to lower case
    text = re.sub(r"(@\S+|https?\S+|#\S+)", "", text) # remove @s, hashtags, URLS
    text = text.translate(killpunctuation)   # remove punctuation
    text = remove_emoji(text)  # remove emojis
    text = re.sub(r"\s+", " ", text).strip()
    return text
    
   
# Apply cleaning to 'tweet' column
df["tweet"] = df["tweet"].apply(clean_tweet)
df = df.rename(columns={'tweet': 'text'})

#convert label to binary
df["fake"] = df["label"].map({"fake": 1, "real": 0})
df = df.drop("label", axis=1)

df = df.drop("id", axis=1) #drop id column



#save cleaned data
df.to_csv('data/usable/covid19labeldataset_CLEAN.csv', index=False)


print("Rows and Columns: ", df.shape)

#check duplicates
print("Duplicate?:" , df.duplicated().any())

#check missing values
has_missing = df.isna().any().any()
print("Any missing values?", has_missing)

#check data types
print(df.dtypes)


Rows and Columns:  (6424, 3)
Duplicate?: False
Any missing values? False
id        int64
tweet    object
label    object
dtype: object
Rows and Columns:  (6424, 2)
Duplicate?: True
Any missing values? False
text    object
fake     int64
dtype: object
