In [246]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [289]:
uncleaned_data = pd.read_csv("../data/Corona_NLP_train.csv", encoding='ISO-8859-1')
# uncleaned_data = pd.read_csv("../data/Corona_NLP_train.csv", encoding='latin-1')

In [290]:
uncleaned_data.shape

(41157, 6)

In [291]:
uncleaned_data.head(3)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive


In [292]:
uncleaned_data.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [293]:
uncleaned_data.dtypes

UserName          int64
ScreenName        int64
Location         object
TweetAt          object
OriginalTweet    object
Sentiment        object
dtype: object

In [294]:
uncleaned_data.isna().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [295]:
uncleaned_data.Location.unique()[:20]

array(['London', 'UK', 'Vagabonds', nan, 'Ã\x9cT: 36.319708,-82.363649',
       '35.926541,-78.753267', 'Austria', 'Atlanta, GA USA',
       'BHAVNAGAR,GUJRAT', 'Makati, Manila', 'Pitt Meadows, BC, Canada ',
       'Horningsea', 'Chicago, IL', 'Houston, Texas', 'Saudi Arabia',
       'Ontario, Canada', 'North America', 'Denver, CO',
       'southampton soxx xxx', 'Global'], dtype=object)

Based on initial observation, we can see that `Location` is a very abstract concept here and it can see multiple values. We are going to drop it as we dont see any significance to its use in our future model of text classification.

In [296]:
uncleaned_data.drop(columns=['Location'], inplace=True)

In [297]:
uncleaned_data.TweetAt.value_counts()[:10]

20-03-2020    3448
19-03-2020    3215
25-03-2020    2979
18-03-2020    2742
21-03-2020    2653
22-03-2020    2114
23-03-2020    2062
17-03-2020    1977
08-04-2020    1881
07-04-2020    1843
Name: TweetAt, dtype: int64

`Tweet at` date might have some significance, but in the grander scheme of things it might not hold any significant value. We can safely drop it. In any case, the tweets are collected for a very short duration of around 20 days. If we look back, the information regarding COVID constantly kept on evolving. All these observations can be taken into consideration and `Tweet at` date can be safely dropped.

In [298]:
uncleaned_data.drop(columns=['TweetAt'], inplace=True)

In [299]:
len(uncleaned_data.UserName.unique())

41157

We can see that the total unique `UserName` in the dataset is equal to the total data points. Since, we dont actually get any information from the `UserName`, we can safely drop it to simplify the dimensions of the dataset.

Same is the case with `ScreenName`. So we will drop it too.

In [300]:
uncleaned_data.drop(columns=['UserName', 'ScreenName'], inplace=True)

In [301]:
uncleaned_data.columns

Index(['OriginalTweet', 'Sentiment'], dtype='object')

In [302]:
sorted(uncleaned_data.Sentiment.unique())

['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']

The unique value in Sentiment is what we expected, it is clean and there is no need to clean it further. In the later sections we can convert it One-hot encoding or some other types based on the model we choose.

In [303]:
uncleaned_data.OriginalTweet[10], uncleaned_data.Sentiment[10]

("All month there hasn't been crowding in the supermarkets or restaurants, however reducing all the hours and closing the malls means everyone is now using the same entrance and dependent on a single supermarket. #manila #lockdown #covid2019 #Philippines https://t.co/HxWs9LAnF9",
 'Neutral')

In [304]:
allText = ' '.join(uncleaned_data.OriginalTweet.tolist())

In [305]:
print(sorted(list(set(allText))))

['\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x84', '\x85', '\x87', '\x89', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9e', '\x9f', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', '«', '\xad', '®', '¯', '°', '±', '²', '³', '´', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', 'Â', 'Ã']


As we can see there are too unwanted characters in the tweet. We are going to remove them and focus on the content of the text.

In [264]:
def clean_text(text):
    # remove digits
    text = re.sub(r'\d+',' ', text)
    
    # remove urls
    text = re.sub(r'http\S+', ' ', text)
    
    # remove html tags
    text = re.sub(r'<.*?>',' ', text)
    
    # remove digits
    text = re.sub(r'\d+',' ', text)
    
    # remove hashtags
    text = re.sub(r'#\w+',' ', text)
    
    # remove mentions
    text = re.sub(r'@\w+',' ', text)
    
    # Remove any extra spaces
    text = re.sub(' +', ' ', text.strip())
    
    # Remove everything that is not space and alphabets
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Converting to lower case
    cleaned_text = text.lower()

    return cleaned_text

In [265]:
uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(clean_text)

In [266]:
# import nltk
# from nltk.corpus import stopwords

# def remove_stopwords(text):
#     stop_words = set(stopwords.words('english'))
#     words = text.split()
#     filtered_words = [word for word in words if word.lower() not in stop_words]
#     return ' '.join(filtered_words)

In [267]:
# uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(remove_stopwords)

In [268]:
# def remove_words_with_less_than_2_characters(text):
#     list_of_words = text.split(' ')
#     list_of_words = [x for x in list_of_words if len(x) > 2]
#     return ' '.join(list_of_words)

In [269]:
# uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(remove_words_with_less_than_2_characters)

In [270]:
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize

# def stemming(text):
#     # Tokenize the text
#     words = word_tokenize(text)

#     # Create a Porter Stemmer
#     porter = PorterStemmer()

#     # Perform stemming
#     stemmed_words = [porter.stem(word) for word in words]
#     return ' '.join(stemmed_words)

In [271]:
# uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(stemming)

In [272]:
# we are just checking whether the word exist in the dictionary or not

import nltk
# Download the words dataset from NLTK (if not already downloaded)
nltk.download('words')
valid_words = set(nltk.corpus.words.words())

def allow_valid_words(text):
    words = [x for x in text.split(' ') if x in valid_words]
    return ' '.join(words)

[nltk_data] Downloading package words to
[nltk_data]     /Users/adityasingh/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [273]:
uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(allow_valid_words)

In [274]:
tmp = list(set(' '.join(uncleaned_data.OriginalTweet.tolist()).split(' ')))
print(f"Total unique words are: {len(tmp)}")

Total unique words are: 13026


We have removed any unwanted characters using the `clean_text` function above. We kept only the characters between a-z. Now that we have cleaned the text, we can save it for later use in different models 

In [275]:
uncleaned_data.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [276]:
uncleaned_data.to_csv('../data/cleaned_data.csv', header=True, index=False)

In [277]:
cleaned_data = pd.read_csv("../data/cleaned_data.csv")

We can see that after saving there are few entries that became Nan, that is because those entries were emply when they were saved. We can filter them out and rewrite over the existing saved file.

In [278]:
len(cleaned_data.OriginalTweet.iloc[[i for i, x in enumerate(cleaned_data.OriginalTweet.isna()) if x]])

53

In [279]:
cleaned_data.dropna(inplace=True)

In [280]:
cleaned_data.to_csv("../data/cleaned_data.csv", index=False, header=True)

In [281]:
cleaned_data = pd.read_csv("../data/cleaned_data.csv")
cleaned_data.head(3)

Unnamed: 0,OriginalTweet,Sentiment
0,fan pa and and,Neutral
1,advice talk to your family to exchange phone c...,Positive
2,to give elderly disabled shopping amid covid o...,Positive


In [282]:
texts = cleaned_data.OriginalTweet.tolist()
smallTweets = [x for x in texts if len(x.split())<15]
textLen = [len(x) for x in texts]
sorted(textLen)[:10]

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [283]:
moreThan8Words = [x for x in texts if len(x.split(' ')) > 7]
moreThan8Words = sorted(moreThan8Words, key=lambda x: len(x.split(' ')))
len(moreThan8Words)

37701

Based on a small experiment, we can see that there are around 39346 samples with total words of more than 7. We are going to go with that.

In [284]:
def count_words(x):
    return len(x.split(' '))

In [285]:
# super_cleaned_data = cleaned_data[cleaned_data['OriginalTweet'].apply(count_words) > 7]
super_cleaned_data = cleaned_data
super_cleaned_data.shape

(41104, 2)

In [286]:
super_cleaned_data.to_csv("../data/super_cleaned_data.csv", index=False)

In [288]:
tmp = list(set(' '.join(super_cleaned_data.OriginalTweet.tolist()).split(' ')))
print(f"Total unique words are: {len(tmp)}")

Total unique words are: 13025


In [287]:
super_cleaned_data.Sentiment.value_counts()

Positive              11409
Negative               9916
Neutral                7675
Extremely Positive     6623
Extremely Negative     5481
Name: Sentiment, dtype: int64

**Now we have the cleaned data and we can go ahead and start working on building model**