# Coronavirus tweets NLP - Text Classification 

$1$. Remove URL from our raw text.

$2$. Remove HTML tags from our raw text.

$3.$ Tokenize our raw text.

$4.$ Find contractions in our raw text and expand them.

$5.$ Lemmatize our text based on POS tags.

$6$. Remove special characters from our text.

$7.$ Remove stopwords from our text.

$8.$ Remove unicode from our text.

In [1]:
import nltk
import string
import re
import numpy as np
import pandas as pd
from contractions import contractions_dict
from pattern.en import tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Shared WordNet lemmatizer instance; lemmatize_text calls wnl.lemmatize on each token.
wnl = WordNetLemmatizer()

In [3]:
# NLTK's English stopword list; removed_stopwords drops every token found in it.
stopwords_list = nltk.corpus.stopwords.words('english')

In [4]:
# Load the training tweets into a DataFrame; the CSV is decoded as latin1.
data = pd.read_csv('Corona_NLP_train.csv',encoding='latin1')

In [5]:
# Preview the first rows of the freshly loaded dataset.
data.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


Here we drop some unwanted columns:

`UserName`, `ScreenName`, `Location`, `TweetAt`.

In [6]:
# Remove the metadata columns in place, leaving the tweet text and its label.
data.drop(columns=['UserName', 'ScreenName', 'Location', 'TweetAt'], inplace=True)

In [7]:
# Preview the first rows again after the column drop.
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
# Preview the last rows of the dataset.
data.tail()

Unnamed: 0,OriginalTweet,Sentiment
41152,Airline pilots offering to stock supermarket s...,Neutral
41153,Response to complaint not provided citing COVI...,Extremely Negative
41154,You know itÂs getting tough when @KameronWild...,Positive
41155,Is it wrong that the smell of hand sanitizer i...,Neutral
41156,@TartiiCat Well new/used Rift S are going for ...,Negative


In [9]:
def removed_url(text):
    """Return ``text`` with every ``scheme://host[/path...]`` URL deleted.

    Each match of the URL pattern is replaced by the empty string; the
    surrounding whitespace is left as it was.
    """
    url_regex = re.compile(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*')
    return url_regex.sub('', text)

In [10]:
# Step 1: strip URLs from every raw tweet.
filter_url_text = data['OriginalTweet'].apply(removed_url)

In [11]:
def removed_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each span runs from a ``<`` to the nearest
    following ``>``.
    """
    tag_regex = re.compile(r'<.*?>')
    return tag_regex.sub('', text)

In [12]:
# Step 2: strip HTML tags from the URL-free text.
filter_html_text = filter_url_text.apply(removed_html)

In [13]:
def tokenized_text(text):
    """Split ``text`` into word tokens with NLTK and strip each token.

    Returns the list of tokens produced by ``nltk.word_tokenize``, each with
    leading and trailing whitespace removed.
    """
    stripped_tokens = []
    for raw_token in nltk.word_tokenize(text):
        stripped_tokens.append(raw_token.strip())
    return stripped_tokens

In [14]:
def expand_contraction(text, contractions_dict):
    """Expand the contractions found in ``text`` and drop leftover apostrophes.

    Args:
        text: The string to process.
        contractions_dict: Mapping from a contraction (e.g. ``"can't"``) to
            its expanded form (e.g. ``"cannot"``).

    Returns:
        ``text`` with every contraction (matched case-insensitively) replaced
        by its expansion, and with all remaining ``'`` characters removed.
    """
    # Longest keys first so that "can't've" wins over its prefix "can't".
    # Empty keys are skipped because they would match at every position.
    keys = sorted((key for key in contractions_dict if key), key=len, reverse=True)
    if keys:
        # Lower-cased view of the mapping so a key such as "I'm" still
        # resolves when the text contains "i'm" or "I'M".
        lowered = {key.lower(): value for key, value in contractions_dict.items()}
        # Keys are escaped so characters like "." are matched literally.
        contraction_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL)

        def expand_match(contraction):
            """Return the replacement text for a single regex match."""
            match = contraction.group(0)
            expansion = (contractions_dict.get(match)
                         or contractions_dict.get(match.lower())
                         or lowered.get(match.lower()))
            # Keep the matched text rather than deleting it when no
            # expansion can be resolved.
            return match if expansion is None else expansion

        text = contraction_pattern.sub(expand_match, text)
    return re.sub("'", "", text)

In [15]:
# Step 4: expand contractions in the HTML-free text.
expand_contraction_text = filter_html_text.apply(expand_contraction, args=(contractions_dict,))

In [16]:
"""Annotate text token with POS tags"""
def pos_tag_text(text):
    """Annotate the tokens of ``text`` with WordNet part-of-speech constants.

    ``text`` is tagged with ``tag`` (imported from ``pattern.en``). Each word
    is lower-cased and paired with the WordNet constant selected by the first
    letter of its tag (J, V, N or R), or with ``None`` for any other tag.

    Returns a list of ``(lowercased_word, wordnet_pos_or_None)`` tuples.
    """
    # Tag prefix -> name of the matching attribute on ``wn``, checked in order.
    prefix_table = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))

    def penn_to_wn_tags(pos_tag):
        """Convert Penn treebank tag to wordnet tag"""
        for prefix, attr_name in prefix_table:
            if pos_tag.startswith(prefix):
                return getattr(wn, attr_name)
        return None

    annotated = []
    for word, pos_tag in tag(text):
        annotated.append((word.lower(), penn_to_wn_tags(pos_tag)))
    return annotated

In [17]:
def lemmatize_text(text):
    """Lemmatize ``text`` token by token, guided by POS tags.

    Tokens that received a WordNet POS from ``pos_tag_text`` are lemmatized
    with that POS; tokens without one are kept unchanged. The resulting
    tokens are joined with single spaces.
    """
    lemmas = []
    for word, pos_tag in pos_tag_text(text):
        if pos_tag:
            lemmas.append(wnl.lemmatize(word, pos_tag))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)

In [20]:
# Step 5: lemmatize the contraction-free text.
lemmatized_text = expand_contraction_text.apply(lemmatize_text)

In [21]:
def removed_special_characters(text):
    """Delete every ``string.punctuation`` character from ``text``.

    The text is tokenized, punctuation is stripped from each token, tokens
    that end up empty are discarded, and the rest are joined with spaces.
    """
    tokens = tokenized_text(text)
    punctuation_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
    kept = []
    for token in tokens:
        cleaned = punctuation_regex.sub('', token)
        if cleaned:
            kept.append(cleaned)
    return ' '.join(kept)

In [22]:
# Step 6: strip punctuation from the lemmatized text.
removed_special_characters_text = lemmatized_text.apply(removed_special_characters)

In [23]:
def removed_stopwords(text):
    """Drop every token of ``text`` that appears in ``stopwords_list``.

    The text is tokenized, tokens found in the module-level stopword list
    are skipped, and the remaining tokens are joined with single spaces.
    """
    kept = []
    for token in tokenized_text(text):
        if token in stopwords_list:
            continue
        kept.append(token)
    return ' '.join(kept)

In [24]:
# Step 7: remove stopwords from the punctuation-free text.
removed_stopwords_text = removed_special_characters_text.apply(removed_stopwords)

In [25]:
def removed_unicode(text):
    """Remove every token of ``text`` that contains a non-ASCII character.

    A token here is a non-ASCII character together with any word characters
    directly attached to it on either side. The attached word characters are
    optional, so tokens that start or end with a non-ASCII character (and
    stand-alone non-ASCII characters) are removed as well.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each such token replaced by the empty string;
        surrounding whitespace is left untouched.
    """
    # ``\w*`` (not ``\w+``) on both sides: the non-ASCII character may sit at
    # the start or the end of the token, or stand alone.
    unicode_pattern = re.compile(r'\w*[^\x00-\x7F]\w*')
    return unicode_pattern.sub('', text)

In [26]:
# Step 8: remove tokens containing non-ASCII characters.
removed_unicode_text = removed_stopwords_text.apply(removed_unicode)

In [27]:
def removed_remain_unicode(text):
    """Delete every remaining ``â`` character from ``text``.

    Returns the input string with each occurrence replaced by the empty
    string.
    """
    leftover_regex = re.compile(r'â')
    return leftover_regex.sub('', text)

In [28]:
# Final clean-up pass over the text produced by the previous step.
removed_remain_unicode_text = removed_unicode_text.apply(removed_remain_unicode)

In [29]:
# Spot-check the fully cleaned text for the row labelled 3.
removed_remain_unicode_text[3]

'food stock one empty please panic enough food everyone take need stay calm stay safe covid19france covid19 covid19 coronavirus confinement confinementotal confinementgeneral'

In [30]:
def tfidf_extractor(text, ngram_range=(1, 1)):
    """Fit a TF-IDF vectorizer on ``text`` and return it with the features.

    Args:
        text: Iterable of documents passed to ``fit_transform``.
        ngram_range: ``(min_n, max_n)`` forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        and the result of its ``fit_transform`` call on ``text``.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        use_idf=True,
        smooth_idf=True,
        norm='l2',
        min_df=1,
    )
    tfidf_matrix = vectorizer.fit_transform(text)
    return vectorizer, tfidf_matrix

In [None]:
# Vectorize the fully cleaned tweets. The original cell referenced `corpus`,
# which is never defined in this notebook; the cleaned series is
# `removed_remain_unicode_text`.
tfidf_vectorizer, tfidf_features = tfidf_extractor(removed_remain_unicode_text)

In [None]:
# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

In [None]:
def display_features(features, feature_names):
    """Wrap a feature matrix in a DataFrame labelled with ``feature_names``.

    Args:
        features: 2-D array-like of feature values, one row per document.
        feature_names: Column labels, one per feature column.

    Returns:
        A ``pandas.DataFrame`` holding ``features`` with ``feature_names``
        as its columns.
    """
    frame = pd.DataFrame(features, columns=feature_names)
    return frame

In [None]:
# Show the TF-IDF weights, rounded to two decimals, as a table.
# NOTE(review): todense() materializes the whole matrix in memory; this may be
# very large if tfidf_features covers the full corpus — confirm before running.
display_features(np.round(tfidf_features.todense(),2),feature_names)