In [None]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used by later cells can be checked by eye.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load the training tweets and collapse the five textual sentiment labels into
# three numeric classes: -1 (negative), 0 (neutral), 1 (positive).
# Series.map leaves any label missing from the dict as NaN.
train_label_codes = {
    'Extremely Negative': -1,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
    'Extremely Positive': 1,
}
train = pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding="ISO-8859-1",
)
train['Sentiment'] = train['Sentiment'].map(train_label_codes)
train.head()

In [None]:
# Load the held-out test tweets and apply the same 5-to-3 label collapse used
# for the training set: -1 (negative), 0 (neutral), 1 (positive).
# Series.map leaves any label missing from the dict as NaN.
test_label_codes = {
    'Extremely Negative': -1,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
    'Extremely Positive': 1,
}
test = pd.read_csv(
    '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    encoding="ISO-8859-1",
)
test['Sentiment'] = test['Sentiment'].map(test_label_codes)
test.head()

In [None]:
# Draw one word cloud per sentiment class present in the training labels.
#
# STOPWORDS is copied into a new set so the library-level constant is not
# mutated, then extended with corpus-specific tokens. Presumably these are URL
# residue ("https", "t", "co"), HTML-entity residue ("amp") and terms common to
# every class -- confirm against the data if the list is tuned.
stopwords = set(STOPWORDS)
stopwords.update(["https", "amp", "coronavirus", "covid", "covid19", "t", "co", "people", "will"])

# dropna(): a NaN label never compares equal to itself, so it would select no
# tweets and hand WordCloud.generate an empty string.
for target in train['Sentiment'].dropna().unique():
    tweets = train.loc[train['Sentiment'] == target, 'OriginalTweet']
    # Lower-case first, then replace missing tweets with '' so join() only sees strings.
    text = " ".join(tweets.str.lower().fillna(''))
    wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)

    # Explicit figure/axes instead of the implicit pyplot "current axes" state.
    fig, ax = plt.subplots(figsize=(20, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title("OriginalTweet where Sentiment == " + str(target))
    ax.axis("off")
    plt.show()
    # Release the figure so repeated runs of the loop do not accumulate open figures.
    plt.close(fig)

In [None]:
# Build TF-IDF features from the training tweets.
#
# Tokens are runs of ASCII letters only (digits and punctuation act as
# separators), English stop words are removed, and the vocabulary is capped at
# 5000 terms via max_features.
pattern = '[a-zA-Z]+'
vectorizer = TfidfVectorizer(stop_words='english',
                             token_pattern=pattern,
                             min_df=1,
                             max_features=5000)

# fit_transform learns the vocabulary and vectorises in a single pass instead of
# tokenising the training corpus twice with fit() followed by transform().
train_tfidf = vectorizer.fit_transform(train['OriginalTweet'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases older than 1.0.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense frame with one column per vocabulary term.
X_train = pd.DataFrame(train_tfidf.toarray(), columns=feature_names)
X_train.head()

In [None]:
# Vectorise the test tweets with the vocabulary already learned from the
# training set (transform only -- the vectorizer is not refitted here).
# Column labels are taken from X_train: both matrices come from the same fitted
# vectorizer, so the feature order is identical, and this avoids calling
# get_feature_names(), which was removed in scikit-learn 1.2.
test_tfidf = vectorizer.transform(test['OriginalTweet'])
X_test = pd.DataFrame(test_tfidf.toarray(), columns=X_train.columns)
X_test.head()

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training features and
# predict the sentiment class of every test tweet.
clf = LogisticRegression(max_iter=400, random_state=42).fit(X_train, train['Sentiment'])
y_pred = clf.predict(X_test)

# Confusion matrix on the test set. Rows and columns follow the order in which
# the classes first appear in the training labels (Series.unique() order).
class_order = train['Sentiment'].unique()
cf = confusion_matrix(y_true=test['Sentiment'], y_pred=y_pred, labels=class_order)

In [None]:
# Plot the test-set confusion matrix as an annotated heatmap.
#
# cf was built with labels=train['Sentiment'].unique(), so the same sequence is
# used for the tick labels; without them the first-appearance class order is
# impossible to read off the plot.
class_labels = train['Sentiment'].unique()

fig, ax = plt.subplots()
sns.heatmap(cf,
            annot=True,
            fmt='d',  # integer counts; the default '.2g' shows large counts in scientific notation
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)
# confusion_matrix puts true classes on rows and predicted classes on columns.
ax.set(xlabel='Predicted sentiment', ylabel='True sentiment', title='Confusion matrix (test set)')
plt.show()

In [None]:
# Collect the true label, the model's prediction and the tweet date for every
# test row, then write them to results.csv without the index column.
results = pd.DataFrame({
    'Sentiment': test['Sentiment'],
    'Prediction': y_pred,
    'Date': test['TweetAt'],
})
results.to_csv('results.csv', index=False)
