In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [265]:
uncleaned_data = pd.read_csv("../data/train.csv")
uncleaned_data.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [266]:
uncleaned_data.shape

(7613, 5)

In [267]:
uncleaned_data.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [268]:
uncleaned_data.dtypes

id           int64
keyword     object
location    object
text        object
target       int64
dtype: object

In [269]:
uncleaned_data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [270]:
list(uncleaned_data.location.unique())[:5]

[nan,
 'Birmingham',
 'Est. September 2012 - Bristol',
 'AFRICA',
 'Philadelphia, PA']

We can conclude that the values in `Location` column are total garbage and we can discard them. We will also drop the `Keyword` column as we can get all the information from the `Text` column.

In [271]:
uncleaned_data.drop(columns=['location'], inplace=True)

In [272]:
uncleaned_data[-uncleaned_data.keyword.isna()].isna().sum()

id         0
keyword    0
text       0
target     0
dtype: int64

We can observe that less than 1% of the data dont have the keyword value associated to it. We can ignore them. Keyword might provide some valuable information about the tweets.

In [273]:
uncleaned_data = uncleaned_data[-uncleaned_data.keyword.isna()]
uncleaned_data.head(3)

Unnamed: 0,id,keyword,text,target
31,48,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1


In [281]:
allText = ' '.join(uncleaned_data.text.tolist())
print(sorted(list(set(allText))))
for c in sorted(list(set(allText))):
    print(f"{c} --> {allText[allText.find(c)-10:allText.find(c)+10]}")

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  --> 
a --> 
b -->  markets ablaze we a
c --> ze in aba crying out
d --> on plus side look at
e --> 
f --> s nigeria flag set a
g --> ry to bring the heav
h --> 
i -->  try to bring the he
j -->  barbados jamaica tw
k --> lesale markets ablaz
l --> 
m --> wholesale markets ab
n --> try to bring the hea
o --> 
p --> ablaze on plus side 
q --> und new acquisitions
r --> olesale markets abla
s --> 
t --> sale markets ablaze 
u --> a crying out for mor
v --> ng the heavy breakin
w --> 
x --> for the next year at
y --> ze we always try to 
z --> rkets ablaze we alwa


In [275]:
contractions = {
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "cannot",
    "couldn't": "could not",
    "they've": "they have",
    "you're": "you are",
    "it's": "it is",
    "i'm": "i am"
    # Add more contractions as needed
}

def expand_contractions(text):
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text

def remove_unwanted_chars(text):
    allowed_chars = [
        " ", "'", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", 
        "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "\n"
    ]
    pattern = f"[^{''.join(allowed_chars)}]"  # Create pattern from allowed characters
    text = re.sub(pattern, " ", text)
    return text

def clean_text(text):
    text = text.lower()
    text = re.sub(r"#\w+", "", text)
    text = ' '.join(text.split())
    text = re.sub(r"@\w+", "", text)
    text = ' '.join(text.split())
    text = re.sub(r"(http|https)://\S+", "", text)
    text = ' '.join(text.split())
    text = re.sub(r"\d+", "", text)
    text = ' '.join(text.split())
    text = remove_unwanted_chars(text)
    text = ' '.join(text.split())
    text = expand_contractions(text)
    text = ' '.join(text.split())
    text = text.replace(" ' ", " ")
    text = text.replace("' ", " ")
    text = text.replace(" '", " ")
    text = text.replace('\'', " ")
    text = ' '.join(text.split())
    return text

In [276]:
uncleaned_data.text = uncleaned_data.text.apply(clean_text)

In [277]:
uncleaned_data.drop(columns=['keyword'], inplace=True)

In [289]:
uncleaned_data.head(3)

Unnamed: 0,id,text,target
31,48,wholesale markets ablaze,1
32,49,we always try to bring the heavy,0
33,50,breaking news nigeria flag set ablaze in aba,1


In [293]:
allWords = (' '.join(uncleaned_data.text.tolist())).split()

In [296]:
np.unique(allWords)

array(['a', 'aa', 'aaaa', ..., 'zumiez', 'zurich', 'zzzz'], dtype='<U23')

We have completed the basic cleaning so we will now save the cleaned dataset to build a model.

In [299]:
uncleaned_data.to_csv("../data/cleaned_data.csv", index=False)