In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [258]:
uncleaned_data = pd.read_csv("../data/Corona_NLP_train.csv", encoding='iso-8859-1')

In [259]:
uncleaned_data.shape

(41157, 6)

In [260]:
uncleaned_data.head(3)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive


In [261]:
uncleaned_data.columns

Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')

In [262]:
uncleaned_data.dtypes

UserName          int64
ScreenName        int64
Location         object
TweetAt          object
OriginalTweet    object
Sentiment        object
dtype: object

In [263]:
uncleaned_data.isna().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [264]:
uncleaned_data.Location.unique()[:20]

array(['London', 'UK', 'Vagabonds', nan, 'Ã\x9cT: 36.319708,-82.363649',
       '35.926541,-78.753267', 'Austria', 'Atlanta, GA USA',
       'BHAVNAGAR,GUJRAT', 'Makati, Manila', 'Pitt Meadows, BC, Canada ',
       'Horningsea', 'Chicago, IL', 'Houston, Texas', 'Saudi Arabia',
       'Ontario, Canada', 'North America', 'Denver, CO',
       'southampton soxx xxx', 'Global'], dtype=object)

Based on initial observation, we can see that `Location` is a very abstract concept here and it can see multiple values. We are going to drop it as we dont see any significance to its use in our future model of text classification.

In [265]:
uncleaned_data.drop(columns=['Location'], inplace=True)

In [266]:
uncleaned_data.TweetAt.value_counts()[:10]

20-03-2020    3448
19-03-2020    3215
25-03-2020    2979
18-03-2020    2742
21-03-2020    2653
22-03-2020    2114
23-03-2020    2062
17-03-2020    1977
08-04-2020    1881
07-04-2020    1843
Name: TweetAt, dtype: int64

`Tweet at` date might have some significance, but in the grander scheme of things it might not hold any significant value. We can safely drop it. In any case, the tweets are collected for a very short duration of around 20 days. If we look back, the information regarding COVID constantly kept on evolving. All these observations can be taken into consideration and `Tweet at` date can be safely dropped.

In [267]:
uncleaned_data.drop(columns=['TweetAt'], inplace=True)

In [268]:
len(uncleaned_data.UserName.unique())

41157

We can see that the total unique `UserName` in the dataset is equal to the total data points. Since, we dont actually get any information from the `UserName`, we can safely drop it to simplify the dimensions of the dataset.

Same is the case with `ScreenName`. So we will drop it too.

In [269]:
uncleaned_data.drop(columns=['UserName', 'ScreenName'], inplace=True)

In [270]:
uncleaned_data.columns

Index(['OriginalTweet', 'Sentiment'], dtype='object')

In [271]:
sorted(uncleaned_data.Sentiment.unique())

['Extremely Negative', 'Extremely Positive', 'Negative', 'Neutral', 'Positive']

The unique value in Sentiment is what we expected, it is clean and there is no need to clean it further. In the later sections we can convert it One-hot encoding or some other types based on the model we choose.

In [272]:
uncleaned_data.OriginalTweet[10], uncleaned_data.Sentiment[10]

("All month there hasn't been crowding in the supermarkets or restaurants, however reducing all the hours and closing the malls means everyone is now using the same entrance and dependent on a single supermarket. #manila #lockdown #covid2019 #Philippines https://t.co/HxWs9LAnF9",
 'Neutral')

In [273]:
allText = ' '.join(uncleaned_data.OriginalTweet.tolist())

In [274]:
print(sorted(list(set(allText))))

['\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x80', '\x84', '\x85', '\x87', '\x89', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9e', '\x9f', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', '«', '\xad', '®', '¯', '°', '±', '²', '³', '´', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', 'Â', 'Ã']


As we can see there are too unwanted characters in the tweet. We are going to remove them and focus on the content of the text.

In [297]:
def clean_text(text):
    
    # Remove hashtags
    text_without_hashtags = re.sub(r'#\S+', '', text)
    
    # Remove URLs
    text_without_urls = re.sub(r'http\S+|www\S+|https\S+', '', text_without_hashtags)
    
    # Remove mentions
    text_without_mentions = re.sub(r'@\S+', '', text_without_urls)
    
    # Remove any extra spaces
    text_without_extra_spaces = ' '.join(text_without_mentions.split())
    # Converting to lower case
    lower_case_text = text_without_extra_spaces.lower()
    cleaned_text = re.sub(r'[^a-z0-9 ]', '', lower_case_text.lower())

    return cleaned_text

In [298]:
uncleaned_data.OriginalTweet = uncleaned_data.OriginalTweet.apply(clean_text)

We have removed any unwanted characters using the `clean_text` function above. We kept all the characters between 0-9 and a-z. Now that we have cleaned the text, we can save it for later use in different models 

In [305]:
uncleaned_data.isnull().sum()

OriginalTweet    0
Sentiment        0
dtype: int64

In [278]:
uncleaned_data.to_csv('../data/cleaned_data.csv', header=True, index=False)

In [279]:
cleaned_data = pd.read_csv("../data/cleaned_data.csv")

We can see that after saving there are few entries that became Nan, that is because those entries were emply when they were saved. We can filter them out and rewrite over the existing saved file.

In [325]:
cleaned_data.OriginalTweet.iloc[[i for i, x in enumerate(cleaned_data.OriginalTweet.isna()) if x]]

186      NaN
2190     NaN
5946     NaN
13777    NaN
22994    NaN
26007    NaN
28549    NaN
30345    NaN
30473    NaN
31293    NaN
31440    NaN
31627    NaN
31657    NaN
35563    NaN
35565    NaN
37646    NaN
Name: OriginalTweet, dtype: object

In [328]:
cleaned_data.dropna(inplace=True)

In [330]:
cleaned_data.to_csv("../data/cleaned_data.csv", index=False, header=True)

In [331]:
cleaned_data = pd.read_csv("../data/cleaned_data.csv")
cleaned_data

Unnamed: 0,OriginalTweet,Sentiment
0,and and,Neutral
1,advice talk to your neighbours family to excha...,Positive
2,coronavirus australia woolworths to give elder...,Positive
3,my food stock is not the only one which is emp...,Positive
4,me ready to go at supermarket during the outbr...,Extremely Negative
...,...,...
41136,airline pilots offering to stock supermarket s...,Neutral
41137,response to complaint not provided citing covi...,Extremely Negative
41138,you know its getting tough when is rationing t...,Positive
41139,is it wrong that the smell of hand sanitizer i...,Neutral


In [346]:
texts = cleaned_data.OriginalTweet.tolist()
smallTweets = [x for x in texts if len(x.split())<15]
textLen = [len(x) for x in texts]
sorted(textLen)[:10]

[2, 3, 4, 5, 6, 7, 7, 7, 7, 8]

In [353]:
moreThan8Words = [x for x in texts if len(x.split(' ')) > 7]
moreThan8Words = sorted(moreThan8Words, key=lambda x: len(x.split(' ')))
len(moreThan8Words)

39346

Based on a small experiment, we can see that there are around 39346 samples with total words of more than 7. We are going to go with that.

In [354]:
def count_words(x):
    return len(x.split(' '))

In [355]:
super_cleaned_data = cleaned_data[cleaned_data['OriginalTweet'].apply(count_words) > 7]
super_cleaned_data.shape

(39346, 2)

In [356]:
super_cleaned_data.to_csv("../data/super_cleaned_data.csv", index=False)

In [358]:
super_cleaned_data.Sentiment.value_counts()

Positive              11033
Negative               9625
Neutral                6711
Extremely Positive     6566
Extremely Negative     5411
Name: Sentiment, dtype: int64

**Now we have the cleaned data and we can go ahead and start working on building model**