In [None]:
import pandas as pd
import numpy as np

# EDA

In [None]:
# Locations of the train/test CSV splits inside the Kaggle input directory.
train_path, test_path = (
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
)

In [None]:
# Read both CSV splits, decoding each as Latin-1, then report their shapes.
train, test = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in (train_path, test_path)
)
print(f'training set size: {train.shape}, test set size: {test.shape}')

In [None]:
# Tag each row with the split it came from so the combined frame can be
# separated again after the shared preprocessing steps.
train['category'] = 'train'
test['category'] = 'test'
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of `test` repeat those of `train` (both were read with a
# default 0-based index), which makes label-based lookups and alignments on
# `data` ambiguous.
data = pd.concat([train, test], ignore_index=True)
data.shape

In [None]:
# Preview the first rows of the combined train + test frame.
data.head()

In [None]:
# Print the column names, dtypes and non-null counts of the combined frame.
data.info()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import wordcloud

In [None]:
# Bar chart of how many tweets carry each sentiment label.
sentiment_counts = data['Sentiment'].value_counts()
sentiment_ax = sentiment_counts.plot(kind='bar', figsize=(15,7))
sentiment_ax.set_xlabel('Sentiment')
sentiment_ax.set_ylabel('Number of tweets');

In [None]:
import nltk
# The corpora used below must already be available locally; uncomment these
# downloads on an environment where the NLTK data has not been fetched yet.
#nltk.download('stopwords')
#nltk.download('wordnet')
stopwords = nltk.corpus.stopwords.words('english')  # English stop-word list
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')  # a token is a run of word characters
lemmatizer = nltk.stem.WordNetLemmatizer()  # WordNet-based lemmatizer

In [None]:
import re

def clean_tweet(text):
    """Normalise a raw tweet into a space-joined string of lemmatized tokens.

    URLs, @mentions, #hashtags, digit runs and HTML-like tags are replaced by
    spaces; the remainder is lower-cased, tokenized, stripped of stop words
    and lemmatized.

    Relies on the module-level ``tokenizer``, ``lemmatizer`` and
    ``stopwords`` objects.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet as a single string of tokens separated by spaces.
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    # The raw-string prefix belongs outside the quotes. Written as 'r<.*?>'
    # the pattern only matched tags preceded by a literal "r" (and removed
    # that letter too), so ordinary tags such as <br> were never stripped.
    text = re.sub(r'<.*?>', ' ', text)
    text = ' '.join(text.split()) #remove duplicate space
    tokens = tokenizer.tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(w) for w in tokens if w not in stopwords)

In [None]:
# Replace every raw tweet with its cleaned, lemmatized form.
data['OriginalTweet'] = data['OriginalTweet'].map(clean_tweet)

In [None]:
# Concatenate all cleaned tweets into one space-separated string for the word cloud.
tweets_str = ' '.join(data['OriginalTweet'])

In [None]:
# Build the word cloud from the combined tweet text and render it inline.
wc = wordcloud.WordCloud(stopwords=stopwords).generate(tweets_str)
display(wc.to_image())

In [None]:
# Count, per sentiment label, the tweets that mention "corona" or "covid".
# A plain substring search for either word selects the same tweets as
# searching for \w*corona\w*|\w*covid\w*, because \w* may match the empty string.
covid_mask = data['OriginalTweet'].astype(str).str.contains(r'corona|covid', regex=True)
# Index with a plain boolean array so the selection is purely positional and
# does not depend on the frame's row labels.
covid_counts = data.loc[covid_mask.to_numpy(), 'Sentiment'].value_counts()
# .get(..., 0) keeps a count of zero for any label that never co-occurs.
covid_pos = int(covid_counts.get('Positive', 0))
covid_neg = int(covid_counts.get('Negative', 0))
covid_expos = int(covid_counts.get('Extremely Positive', 0))
covid_exneg = int(covid_counts.get('Extremely Negative', 0))
covid_neu = int(covid_counts.get('Neutral', 0))
covid_pos, covid_neg, covid_expos, covid_exneg, covid_neu

In [None]:
fig = plt.figure(figsize=(10,7))
# Keep each label next to its own count. With two parallel lists the
# "Extremely Positive" and "Extremely Negative" heights had been swapped,
# so those two bars were drawn under the wrong label.
covid_sentiment_counts = {
    'Positive': covid_pos,
    'Negative': covid_neg,
    'Neutral': covid_neu,
    'Extremely Positive': covid_expos,
    'Extremely Negative': covid_exneg,
}
plt.bar(
    list(covid_sentiment_counts.keys()),
    height=list(covid_sentiment_counts.values())
)
plt.title('Frequencies of tweets sentiments that contains the word corona or covid');

#### From the chart above, tweets containing the word covid or corona are fairly evenly distributed across the sentiment classes, so I won't bother adding these words to the stopwords. In some NLP tasks the frequency of certain key words is not evenly distributed, which in turn affects the prediction of the sentiments: the prediction becomes biased towards the sentiment with the highest frequency.

In [None]:
# Drop the metadata columns that the models do not use. Rebinding the name
# (instead of inplace=True) leaves any earlier reference to the frame intact.
data = data.drop(['UserName', 'ScreenName', 'Location', 'TweetAt'], axis=1)
# Take a real copy: a plain `n_data = data` is only a second name for the
# same frame, so the label mapping below would also overwrite the original
# sentiment strings in the data set aside for the LSTM model.
n_data = data.copy() #keep data for lstm model
# Collapse the five sentiment labels into three classes: -1, 0 and 1.
d = {'Neutral':0, 'Extremely Positive':1, 'Extremely Negative':-1, 'Positive':1,
     'Negative':-1
    }
data['Sentiment'] = data['Sentiment'].map(d)
data.head()

In [None]:
# Count missing values per column; a sentiment label absent from the mapping
# dictionary would show up as NaN in 'Sentiment' here.
data.isna().sum()

In [None]:
# Split the combined frame back into its train/test parts using the
# 'category' tag, then separate the tweet text from the sentiment label.
split_tag = data['category']
lr_train = data[split_tag == 'train'].drop(columns='category')
lr_test = data[split_tag == 'test'].drop(columns='category')
X_train, y_train = lr_train['OriginalTweet'], lr_train['Sentiment']
X_test, y_test = lr_test['OriginalTweet'], lr_test['Sentiment']

# Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Bag-of-words vectorizer configured to discard scikit-learn's built-in English stop words.
count_vector = CountVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary from the training tweets only and build their
# document-term count matrix.
x_train = count_vector.fit_transform(X_train)
# `vocabulary_` maps every learned term to its column index, so its length is
# the number of unique words. It replaces get_feature_names(), which was
# removed in scikit-learn 1.2, and works on older releases as well.
print(f'number of unique words: {len(count_vector.vocabulary_)}')
print(x_train.shape)

In [None]:
# Largest and smallest term count stored in the training matrix.
x_train.max(), x_train.min()

In [None]:
# Vectorize the test tweets with the vocabulary already fitted on the training set.
x_test = count_vector.transform(X_test)

In [None]:
# Logistic-regression classifier with max_iter set to 1000 and a fixed random_state.
lreg = LogisticRegression(C=1, max_iter=1000, random_state=3)

In [None]:
# Fit the classifier on the training count matrix and its sentiment labels.
lreg.fit(x_train, y_train)

In [None]:
# Score the fitted model on each split, then report both accuracies.
train_accuracy = lreg.score(x_train, y_train)
test_accuracy = lreg.score(x_test, y_test)
print(f'Accuracy on training set: {train_accuracy}')
print(f'Accuracy on test set: {test_accuracy}')

In [None]:
# Print the per-class classification report for the test-set predictions.
print(classification_report(y_test, lreg.predict(x_test)))

#### Having issues using the fastai LSTM learner...
### My whole work can be viewed here: https://colab.research.google.com/drive/1MXntVq8_B-d76mLc550PxsTL5P4WlhNe?usp=sharing