# Coronavirus tweets NLP - Text Classification

- (2020/12) I am a self-taught learner of data science and have just finished an online NLP course. In this project I try to apply what I have learnt.

### Corona Virus Tagged Data

Data from: https://www.kaggle.com/datatattle/covid-19-nlp-text-classification


Perform text classification on the data. The tweets were pulled from Twitter and then tagged manually.
The names and usernames have been given codes to avoid any privacy concerns.


Columns in Data:
- Location
- Tweet At
- Original Tweet
- Label

In [None]:
# import the tools 
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Read the train and test files.
# NOTE(review): TweetAt in this Kaggle dataset appears to be stored as DD-MM-YYYY
# (e.g. 16-03-2020) — confirm against the raw CSV. With dayfirst=True an ambiguous
# value such as 02-04-2020 is read as 2 April instead of 4 February, which keeps the
# `month` column derived from TweetAt correct.
DATA_DIR = '../input/covid-19-nlp-text-classification'
read_options = dict(encoding='ISO-8859-1', parse_dates=['TweetAt'], dayfirst=True)

train_df = pd.read_csv(f'{DATA_DIR}/Corona_NLP_train.csv', **read_options)
test_df = pd.read_csv(f'{DATA_DIR}/Corona_NLP_test.csv', **read_options)

In [None]:
# Check missing data in both the train and the test set
# (the original inspected train_df twice and never looked at test_df)
train_df.isnull().sum(), test_df.isnull().sum()

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the test data
test_df.head()

In [None]:
# (rows, columns) of the train and test frames
train_df.shape, test_df.shape

In [None]:
# Stack train and test into one frame so both go through identical preprocessing.
# The `is_test` flag (0 = train, 1 = test) lets us separate them again later.
train_df['is_test'] = 0
test_df['is_test'] = 1

# ignore_index=True renumbers the rows 0..n-1 in the combined frame
comp_df = pd.concat([train_df, test_df], ignore_index=True)

# Data EDA and formatting

### Grouping the labels to positive(2), negative(0) and neutral (1)

In [None]:
# Have a look at the target feature: number of tweets per sentiment label
comp_df.Sentiment.value_counts()

In [None]:
# Fold the "Extremely ..." labels into their base sentiment class
sentiment = comp_df['Sentiment']
for strong_label, base_label in (('Extremely Positive', 'Positive'),
                                 ('Extremely Negative', 'Negative')):
    sentiment = sentiment.str.replace(strong_label, base_label)
comp_df['Sentiment'] = sentiment

In [None]:
# Bar chart of the number of tweets per sentiment.
# The categories (sentiments) are on the x-axis and the counts on the y-axis;
# the original cell had the two axis labels swapped.
ax = comp_df.Sentiment.value_counts().plot.bar(figsize=(7, 4), rot=0)
ax.set_title('Number of tweets in different sentiments', fontsize=12)
ax.set_xlabel('Sentiment', fontsize=12)
ax.set_ylabel('Number of tweets', fontsize=12)
plt.show()

In [None]:
# Encode the sentiment as an integer: negative -> 0, neutral -> 1, positive -> 2
sentiment_codes = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
comp_df['Sentiment'] = comp_df['Sentiment'].map(sentiment_codes)

### Get the month of the tweets

In [None]:
# Month number of each tweet, taken from the TweetAt datetime column
comp_df['month'] = comp_df['TweetAt'].dt.month

In [None]:
# Visualize how the sentiment labels are distributed over the months
month_by_sentiment = pd.crosstab(comp_df['month'], comp_df['Sentiment'])
month_by_sentiment.plot.bar()
plt.ylabel('Number of tweets')
plt.xticks(rotation=None)
plt.show()

### Drop the other columns

In [None]:
# In this task we focus on the text data only, so we keep just the tweet text,
# the label and the train/test flag.
# .copy() makes the result an independent frame, so the column assignments in the
# following cells do not raise pandas' SettingWithCopyWarning.
comp_df = comp_df[['OriginalTweet', 'Sentiment', 'is_test']].copy()

### Modify the tweet contents

#### Have a look at the tweets' content

In [None]:
# Show the raw text of the first tweet (index label 0)
comp_df['OriginalTweet'][0]

### The elements we would like to remove from the tweet's content

- URL
- punctuations
- \# tags
- @ tags
- extra space

In [None]:
# Rename the columns (by position) to shorter names for easy access:
# the three columns become tweet, label and is_test, in that order
comp_df.columns =['tweet','label','is_test']

In [None]:
# Clean the tweet text step by step.
# Every pattern below is a regular expression, so regex=True is passed explicitly:
# since pandas 2.0 Series.str.replace defaults to regex=False and would otherwise
# search for the literal pattern text and leave the tweets unchanged.
tweet = comp_df['tweet']

# Remove @ tags
tweet = tweet.str.replace(r'@\w*', '', regex=True)

# Remove URLs
tweet = tweet.str.replace(r'http\S+', '', regex=True)

# Remove # tags
tweet = tweet.str.replace(r'#\w+', '', regex=True)

# Turn line breaks / tabs into plain spaces first; otherwise the next step deletes
# them outright and fuses the neighbouring words together ("stay\nhome" -> "stayhome")
tweet = tweet.str.replace(r'\s+', ' ', regex=True)

# Remove everything that is not a letter or a space
tweet = tweet.str.replace(r'[^a-zA-Z ]', '', regex=True)

# Collapse repeated spaces and trim both ends
tweet = tweet.str.replace(r' +', ' ', regex=True).str.strip()

# Change to lowercase
comp_df['tweet'] = tweet.str.lower()

In [None]:
# Spot-check one cleaned tweet (index label 60)
comp_df.tweet[60]

### Tokenize and Lemmatize the word in data

In [None]:
# Build the `corpus` column: tokenize every cleaned tweet, lemmatize each token
# and join the lemmas back into one space-separated string.
lemma = nltk.WordNetLemmatizer()


def _lemmatize_text(text):
    """Return `text` with every NLTK token replaced by its WordNet lemma."""
    tokens = nltk.word_tokenize(text)
    return " ".join(lemma.lemmatize(token) for token in tokens)


comp_df['corpus'] = [_lemmatize_text(text) for text in comp_df.tweet]

### Visualize the text data using wordcloud

In [None]:
# English stop-word list from NLTK, passed to the word cloud below
stop_words = stopwords.words('english')

In [None]:
# Build the word cloud from the whole corpus.
# The documents are joined explicitly: calling str() on a large NumPy array returns
# its abbreviated repr (first and last few items with "..." in between), so the
# original cloud was generated from only a handful of tweets.
text = " ".join(comp_df.corpus)
wordcloud = WordCloud(max_words=500, background_color='white', stopwords=set(stop_words),
                      colormap='rainbow', height=300)
wordcloud.generate(text)

In [None]:
# Render the word cloud on a 10 x 6 inch figure without axis ticks
fig = plt.figure(figsize=(10, 6))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Start modeling

In [None]:
# Import the tools we need
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [None]:
# Split the combined frame back into the train and test sets.
# drop(columns=...) returns a new frame; the original dropped the column in place on
# a slice of comp_df, which raises pandas' SettingWithCopyWarning.
train_df = comp_df.loc[comp_df['is_test'] == 0].drop(columns='is_test')
test_df = (
    comp_df.loc[comp_df['is_test'] == 1]
    .drop(columns='is_test')
    .reset_index(drop=True)   # test rows start again at index 0
)

In [None]:
# Separate the features (lemmatized text) from the target (sentiment label)
x_df, y_df = train_df['corpus'], train_df['label']
x_test, y_test = test_df['corpus'], test_df['label']

# Hold out 20% of the training data as a validation set (fixed seed for reproducibility)
x_train, x_val, y_train, y_val = train_test_split(
    x_df,
    y_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check the shapes of the train / validation features and labels
x_train.shape, x_val.shape, y_train.shape, y_val.shape

# Using CountVectorizer

In [None]:
# Create the bag-of-words vectorizer (unigrams + bigrams, terms seen in >= 5 documents).
# It is fitted on the training split only: fitting on comp_df.corpus would let the
# vocabulary and the min_df filter see the validation and test tweets (data leakage).
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=5).fit(x_train)

# Transform both train and validation data with the train-fitted vocabulary
x_train_vector = vectorizer.transform(x_train)
x_val_vector = vectorizer.transform(x_val)

### Start training models
- Logistic Regression
- Naive Bayes

In [None]:
# Mean 10-fold cross-validation score of logistic regression on the count features
cross_val_score(LogisticRegression(random_state=42), x_train_vector, y_train, cv=10, verbose=1, n_jobs=-1).mean()

In [None]:
# Mean 10-fold cross-validation score of multinomial Naive Bayes on the count features
cross_val_score(MultinomialNB(alpha=0.01), x_train_vector, y_train, cv=10, verbose=1, n_jobs=-1).mean()

In [None]:
# Fit logistic regression on the count features and report validation metrics
model = LogisticRegression(random_state=42)
model.fit(x_train_vector, y_train)

val_pred = model.predict(x_val_vector)
print(classification_report(y_val, val_pred))

### Logistic regression performs better, so now try to tune its hyperparameters.

In [None]:
# Grid search over the logistic-regression hyperparameters (count features).
# Disabled to keep the notebook fast; uncomment the whole block to run it.
# The closing brace of `params` is commented out as well: left bare, it made
# this cell fail with a SyntaxError.
#params = {
#    'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs'],
#    'C': [0.001, 0.01, 0.1, 1, 10, 100],
#    'penalty': ['l1', 'l2'],
#}

#lr_grid = GridSearchCV(LogisticRegression(random_state=42),params, cv=5, verbose=2, n_jobs=-1)
#lr_grid.fit(x_train_vector, y_train)

#print(classification_report(y_val, lr_grid.predict(x_val_vector)))

# Use tf-idf as vectorizer

In [None]:
# TF-IDF vectorizer (unigrams + bigrams, terms seen in >= 5 documents).
# It is fitted on the training split only: fitting on comp_df.corpus would let the
# vocabulary and the IDF weights see the validation and test tweets (data leakage).
vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 2), stop_words='english').fit(x_train)

x_train_tf = vectorizer.transform(x_train)
x_val_tf = vectorizer.transform(x_val)

In [None]:
# Show the 30 terms with the largest total TF-IDF weight over the training split
feature_weight = np.asarray(x_train_tf.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method only where the new one does not exist.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
features = pd.DataFrame({'weight': feature_weight}, index=list(get_names()))

ax = features.sort_values(by='weight', ascending=False).head(30).plot.barh(figsize=(20, 10))
ax.set_xlabel('Weight')
plt.show()

### Model training
- Logistic regression

In [None]:
# Fit logistic regression on the TF-IDF features and report validation metrics
model = LogisticRegression(random_state=42)
model.fit(x_train_tf, y_train)

val_pred_tf = model.predict(x_val_tf)
print(classification_report(y_val, val_pred_tf))

### Hyperparameters tuning using gridsearch

In [None]:
# Grid search over the logistic-regression hyperparameters (TF-IDF features).
# Disabled to keep the notebook fast; uncomment the whole block to run it.
# The closing brace of `params` is commented out as well: left bare, it made
# this cell fail with a SyntaxError.
#params = {
#    'solver': ['liblinear', 'saga', 'newton-cg', 'lbfgs'],
#    'C': [0.001, 0.01, 0.1, 1, 10, 100],
#    'penalty': ['l1', 'l2'],
#}

#lr_grid02 = GridSearchCV(LogisticRegression(random_state=42),params, cv=10, verbose=2, n_jobs=-1)
#lr_grid02.fit(x_train_tf, y_train)

#print(classification_report(y_val, lr_grid02.predict(x_val_tf)))

In [None]:
# Disabled along with the grid-search cells: would display the best estimator of `lr_grid`.
# NOTE(review): the TF-IDF search object is named `lr_grid02` — confirm which one is meant.
#lr_grid.best_estimator_

# Prediction on test data

In [None]:
# To skip the grid-search time on Kaggle, the best hyperparameters found in my own
# notebook run are plugged in directly.
best_params = dict(C=1, penalty='l1', solver='saga')
best_model = LogisticRegression(random_state=42, **best_params)
best_model.fit(x_train_tf, y_train)

In [None]:
# Performance of the best model on the validation set (per-class precision / recall / F1)
print(classification_report(y_val, best_model.predict(x_val_tf)))

### Now do prediction on the test data

In [None]:
# Vectorize the test tweets with the already-fitted TF-IDF vectorizer (transform only, no refit)
x_test_tf = vectorizer.transform(x_test)

In [None]:
# Predict the test-set labels with the best model and print the per-class metrics
y_pred = best_model.predict(x_test_tf)
test_report = classification_report(y_test, y_pred)
print(test_report)

### Heat map of the prediction

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test predictions: rows are true labels, columns are predictions
test_confusion = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(test_confusion, annot=True, fmt='d', annot_kws={'size': 17}, cmap='Reds', ax=ax)
ax.set_ylabel('True')
ax.set_xlabel('Predicted')

## Thank you very much