In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 

from wordcloud import WordCloud, STOPWORDS
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Data importing and EDA

In [None]:

# Both splits are stored with the same non-default encoding, so name it once.
CSV_ENCODING = 'ISO-8859-1'

data_test = pd.read_csv(
    "../input/covid-19-nlp-text-classification/Corona_NLP_test.csv",
    encoding=CSV_ENCODING,
)
data_train = pd.read_csv(
    "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
    encoding=CSV_ENCODING,
)

In [None]:
# Preview the first rows of the test split.
data_test.head()

In [None]:
# Preview the first rows of the training split.
data_train.head()

In [None]:
# (rows, columns) of the training split.
data_train.shape

In [None]:
# (rows, columns) of the test split.
data_test.shape

In [None]:
# Number of missing values in each column of the training split.
data_train.isnull().sum()

In [None]:
# Number of training rows that exactly repeat an earlier row.
data_train.duplicated().sum()

In [None]:
# Number of distinct values in each column of the training split.
data_train.nunique()

In [None]:
# Class balance of the target, with the bars ordered from most negative to most positive.
sentiment_order = ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(data=data_train, x='Sentiment', order=sentiment_order, ax=ax)

In [None]:
# Distinct values of the TweetAt column (presumably posting dates - confirm against the data).
data_train['TweetAt'].unique()

In [None]:
# value_counts() sorts by frequency (descending), so the first ten entries are the
# ten most common locations.
location_counts = data_train['Location'].value_counts()
location_top10 = location_counts.head(10)
location_top10

In [None]:
# Tweet counts for the ten most frequent locations only.
in_top10 = data_train['Location'].isin(location_top10.index)
fig, ax = plt.subplots(figsize=(16, 6))
sns.countplot(data=data_train[in_top10], x='Location', ax=ax)

In [None]:
# Distribution of tweet lengths, measured in characters.
tweet_lengths = data_train['OriginalTweet'].str.len()

fig, ax = plt.subplots(figsize=(16, 6))
ax.grid()
ax.hist(tweet_lengths)

In [None]:
# Show the full raw text of one training tweet (index label 10).
data_train.OriginalTweet[10]

In [None]:
def wordCloud(sentiment):
    """Plot a word cloud of the training tweets labelled `sentiment`.

    Tweets that contain 'COVID', 'Covid' or 'https' anywhere in their text are
    left out entirely (the whole tweet, not just the matching word); the
    remaining tweets are joined with commas and passed to WordCloud.

    Parameters
    ----------
    sentiment : str
        A value of ``data_train['Sentiment']``, e.g. ``"Negative"``.

    Raises
    ------
    ValueError
        If no tweet text is left for `sentiment` after filtering.
    """
    excluded_markers = ('COVID', 'https', 'Covid')
    tweets = data_train.loc[data_train['Sentiment'] == sentiment, 'OriginalTweet']
    text = ",".join(
        tweet for tweet in tweets
        if not any(marker in tweet for marker in excluded_markers)
    )
    if not text:
        raise ValueError("No tweets left to plot for sentiment " + repr(sentiment))

    wordcloud = WordCloud(max_words=200, colormap='Set2', background_color="black").generate(text)

    # Draw and title the same, explicitly created Axes. Re-selecting a figure by
    # number only hits the right one when the new figure happens to be number 1.
    fig, ax = plt.subplots(figsize=(15, 10))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title('Prevalent words in ' + sentiment + ' tweets', fontsize=19)
    plt.show()

In [None]:
# One word cloud per sentiment, ordered from most negative to most positive.
for sentiment in ["Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]:
    wordCloud(sentiment)

We see that, for all sentiments, words like 'coronavirus', 'people', 'store', 'supermarket' and 'price' are used often.

# Data preprocessing

In [None]:
# Raw text of the training tweet with index label 8, shown before any cleaning.
data_train.OriginalTweet[8]

In [None]:
# Strip tweet-specific noise before vectorising.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace` matches
# literally by default, which would leave every URL, hashtag, mention and
# punctuation mark in place.
X_train = data_train['OriginalTweet'].str.replace(r'http\S+', "", regex=True)  # URLs
X_train = X_train.str.replace(r"#\S+", "", regex=True)  # hashtags
X_train = X_train.str.replace(r"@\S+", "", regex=True)  # mentions
# Line breaks become a space so the words on either side are not glued together.
X_train = X_train.str.replace(r"[\r\n]+", " ", regex=True)
X_train = X_train.str.replace(r"[^\w\s]", "", regex=True)  # punctuation
X_train = X_train.str.lower()

In [None]:
# The tweet with index label 8 after the clean-up steps of the previous cell.
X_train[8]

In [None]:
# Cache for NLTK's English stop-word list. It is filled on first use so the
# corpus is read once, instead of once per token.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stop_words=None):
    """Return `text` with stop words dropped and the remaining tokens joined by single spaces.

    Parameters
    ----------
    text : str
        Text to filter. It is split on whitespace and tokens are compared
        exactly (case-sensitively) against `stop_words`.
    stop_words : collection of str, optional
        Tokens to remove. When omitted, NLTK's English stop-word list is used
        and cached after the first call.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    # str.split() with no argument never yields tokens with surrounding
    # whitespace, so no extra strip() is needed before the membership test.
    return " ".join(token for token in text.split() if token not in stop_words)

In [None]:
# Drop stop words from every cleaned tweet; the function is passed directly to apply.
X_train = X_train.apply(remove_stopwords)

In [None]:
# Inspect the tweets after cleaning and stop-word removal.
X_train

In [None]:
def _clean_tweets(tweets):
    """Clean a Series of raw tweets the way the training text is cleaned.

    Removes URLs, hashtags and mentions, turns line breaks into spaces, strips
    punctuation, lower-cases, and finally drops stop words via `remove_stopwords`.
    """
    cleaned = tweets.str.replace(r'http\S+', "", regex=True)
    cleaned = cleaned.str.replace(r"#\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"@\S+", "", regex=True)
    cleaned = cleaned.str.replace(r"[\r\n]+", " ", regex=True)
    cleaned = cleaned.str.replace(r"[^\w\s]", "", regex=True)
    return cleaned.str.lower().apply(remove_stopwords)


# TF-IDF vocabulary and IDF weights are learned from the cleaned training tweets only.
count_vectorizer = TfidfVectorizer()
X_train = count_vectorizer.fit_transform(X_train)
# The test tweets must get the same clean-up before vectorising. Otherwise hashtag
# and mention words that were stripped from the training text still contribute
# features at test time.
X_test = count_vectorizer.transform(_clean_tweets(data_test['OriginalTweet']))

# Encode the sentiment strings as integers; the encoder is fitted on the training labels.
le = preprocessing.LabelEncoder()
le.fit(data_train['Sentiment'])
y_train = le.transform(data_train['Sentiment'])
y_test = le.transform(data_test['Sentiment'])

In [None]:
# Labels known to the encoder; the label at position i is encoded as integer i.
le.classes_

In [None]:
# Baseline classifier on the five original sentiment labels; the seed is fixed.
# fit() returns the estimator itself, so construction and fitting can be chained.
clf = LogisticRegression(random_state=1).fit(X_train, y_train)
clf

In [None]:
# 5-fold cross-validated accuracy on the training features. cross_val_score fits
# fresh copies of clf for each fold, so the fit from the previous cell is not reused.
# NOTE(review): the TF-IDF weights were fitted on the whole training set, so each
# validation fold has already influenced the features - scores may be slightly optimistic.
cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)

In [None]:
y_pred = clf.predict(X_test)
# Label the report rows with the sentiment names instead of the encoder's integer
# codes. `labels` pins the row order to the encoder's order so names and rows
# cannot get misaligned.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
))

In [None]:
# Rows are the true labels and columns the predicted ones, in sorted label order.
confusion_5classes = confusion_matrix(y_test, y_pred)
print(confusion_5classes)

It looks like the classifier doesn't distinguish Extremely Negative and Extremely Positive from Negative and Positive. Let's merge the five sentiment labels into three classes: negative, neutral and positive.

In [None]:
def classes_def(x):
    """Map a five-level sentiment label to a three-class integer code.

    Returns 2 for "Extremely Positive" or "Positive", 0 for "Extremely Negative"
    or "Negative", and 1 for any other value.
    """
    if x in ("Extremely Positive", "Positive"):
        return 2
    if x in ("Extremely Negative", "Negative"):
        return 0
    return 1
    
# Collapse the five sentiment labels into three integer classes for both splits.
y_train_3classes = data_train['Sentiment'].map(classes_def)
y_test_3classes = data_test['Sentiment'].map(classes_def)

In [None]:
# Refit the same estimator on the three-class targets, then report 5-fold
# cross-validated accuracy for that setup.
clf.fit(X_train, y_train_3classes)
cv_accuracy_3classes = cross_val_score(clf, X_train, y_train_3classes, scoring='accuracy', cv=5)
cv_accuracy_3classes

In [None]:
y_pred = clf.predict(X_test)
# Name the report rows after the merged classes; the codes follow classes_def:
# 0 = negative, 1 = neutral, 2 = positive.
print(classification_report(
    y_test_3classes,
    y_pred,
    labels=[0, 1, 2],
    target_names=["Negative", "Neutral", "Positive"],
))

Let's look at the most important words for negative classification.

In [None]:
# Invert the vectorizer's word -> column mapping so coefficients can be labelled.
index_to_word = {column: word for word, column in count_vectorizer.vocabulary_.items()}

# Row 0 of coef_ belongs to class 0, which is the merged negative class after the
# three-class refit above.
negative_weights = clf.coef_[0]
words_coef = {(index_to_word[column], weight) for column, weight in enumerate(negative_weights)}

# Ten words with the largest weights for that class.
sorted(words_coef, key=lambda pair: pair[1], reverse=True)[:10]

Words look correct.