In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Packages

In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import pickle

# Train and Test data

In [None]:
# Load the train and test CSVs of the covid-19-nlp-text-classification dataset.
# The train file is decoded as ISO-8859-1; the test file uses pandas' default encoding.
# NOTE(review): confirm the two files really are stored in different encodings.
train = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_train.csv", encoding = "ISO-8859-1")
test = pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv")

# Description of the data

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Number of training rows per sentiment label.
train["Sentiment"].value_counts()

# Data Wrangling

In [None]:
# Build the cleaned training corpus: one normalised, stemmed string per tweet.
corpus = []
stemmer = PorterStemmer()
# Build the stop-word set once. Recreating it for every token makes this cell
# orders of magnitude slower without changing the result.
stop_words = set(stopwords.words('english'))
for tweet in train["OriginalTweet"]:
    # Replace every non-letter with a space, lower-case, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', " ", tweet).lower().split()
    # Drop English stop words and reduce the remaining tokens to their Porter stems.
    review = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)
    corpus.append(review)

In [None]:
# Inspect a few cleaned tweets (elements 1 through 9; element 0 is not shown).
corpus[1:10]

In [None]:
# Fit a TF-IDF vectorizer on the cleaned corpus, capped at 5000 features, and
# convert the resulting document-term matrix to a dense array.
# NOTE(review): `.toarray()` allocates len(corpus) x (up to 5000) values; the matrix
# returned by `fit_transform` may be usable directly by the models below — confirm.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(corpus).toarray()

In [None]:
def ordinal_encoding(df, col, mapping):
    """Encode one column of ``df`` using an explicit category-to-code mapping.

    Parameters
    ----------
    df
        Frame handed to ``ce.OrdinalEncoder.fit_transform``.
    col
        Name of the column to encode.
    mapping
        Category -> code lookup applied to ``col``.

    Returns
    -------
    The output of ``fit_transform``, requested as a DataFrame (``return_df=True``).
    """
    column_spec = {'col': col, 'mapping': mapping}
    encoder = ce.OrdinalEncoder(cols=[col], return_df=True, mapping=[column_spec])
    return encoder.fit_transform(df)

# Data Visualization

In [None]:
# Bar order and tick labels are driven by the same list. Overwriting the tick
# labels with a hard-coded list while seaborn picks the bar order from the data
# would mislabel the bars whenever the two orders differ.
sentiment_order = ['Neutral', 'Positive', 'Extremely Negative', 'Negative', 'Extremely Positive']
plot = sns.countplot(x='Sentiment', data=train, order=sentiment_order).set_xticklabels(labels=sentiment_order, rotation=20)

In [None]:
# Ordinal code for each of the five sentiment labels:
# 1 = Extremely Positive, 2 = Positive, 3 = Neutral, 4 = Negative, 5 = Extremely Negative.
# NOTE(review): this name shadows the built-in `dict` for the rest of the notebook.
# It is left as-is because the encoding cell passes it by name — rename both together.
dict = {'Neutral':3, 'Positive':2, 'Extremely Negative':5, 'Negative':4,
       'Extremely Positive':1}

In [None]:
# Encode the "Sentiment" column with the label -> code mapping; keep the result as train_final.
train_final = ordinal_encoding(train,"Sentiment",dict)

In [None]:
def minimize_categories(val):
    """Collapse a five-level ordinal sentiment code into one of three classes.

    Codes 1 and 2 map to 1, code 3 maps to 2, and codes 4 and 5 map to 3.
    Any other value yields ``None``.
    """
    if val in (1, 2):
        return 1
    if val == 3:
        return 2
    if val in (4, 5):
        return 3
    return None

In [None]:
# Collapse the five ordinal codes into three classes (1, 2 or 3) with minimize_categories.
# NOTE(review): this cell overwrites its own input, so running it a second time
# re-maps the already-collapsed codes (2 -> 1, 3 -> 2). Re-run the encoding cell first.
train_final["Sentiment"] = train_final["Sentiment"].apply(minimize_categories)

In [None]:
# Class distribution of the "Sentiment" column in train_final.
sns.countplot(x='Sentiment', data=train_final)

# Common Keywords Visualizer

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as npy
from PIL import Image
import requests
import io
# Download the image used as the word-cloud mask.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as an image-decoding
# failure when the body is opened later.
response = requests.get("https://res.cloudinary.com/maxie/image/upload/v1617197755/TEMP/covid_ywd7ph.jpg", timeout=30)
response.raise_for_status()
image_bytes = io.BytesIO(response.content)
# Concatenate all cleaned tweets into one string for the word cloud.
dataset = " ".join(corpus)
def create_word_cloud(string):
    """Generate a word cloud for ``string`` using the downloaded mask image.

    The mask is decoded from the module-level ``image_bytes`` buffer. The cloud
    is configured with a black background, at most 150 words, the ``wordcloud``
    package's ``STOPWORDS`` as stop words, and contour width 1 in colour ``#333``.

    Returns the ``WordCloud`` object after ``generate(string)`` has been called;
    it can be saved with its ``to_file`` method, e.g. ``to_file("wordCloud.png")``.
    """
    mask_pixels = npy.array(Image.open(image_bytes))
    cloud_options = {
        "background_color": "black",
        "max_words": 150,
        "mask": mask_pixels,
        "stopwords": set(STOPWORDS),
        "contour_width": 1,
        "contour_color": '#333',
    }
    cloud = WordCloud(**cloud_options)
    cloud.generate(string)
    return cloud
# Lower-case the joined corpus and build its word cloud.
dataset = dataset.lower()
wordcloud = create_word_cloud(dataset)

# Draw the cloud on a large canvas with the x/y axes hidden.
fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

# Training Models

In [None]:
# Target vector: the "Sentiment" column of train_final.
Y = train_final["Sentiment"]

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
multinb = MultinomialNB()
multinb.fit(X_train,Y_train)

In [None]:
# Naive Bayes predictions for the held-out split.
Y_pred_multinb = multinb.predict(X_test)

In [None]:
# Fit a logistic-regression classifier with default settings on the training split,
# then predict the held-out split.
# NOTE(review): if scikit-learn emits a ConvergenceWarning here, consider raising `max_iter`.
log = LogisticRegression()
log.fit(X_train,Y_train)
Y_pred_log = log.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature.
accuracy_score(Y_test, Y_pred_multinb)

In [None]:
# Ground truth goes first. With the arguments swapped, the precision and recall
# columns trade places and `support` counts predictions instead of true labels.
print(classification_report(Y_test, Y_pred_multinb))

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the order now matches the documented signature.
accuracy_score(Y_test, Y_pred_log)

In [None]:
# Ground truth goes first. With the arguments swapped, the precision and recall
# columns trade places and `support` counts predictions instead of true labels.
print(classification_report(Y_test, Y_pred_log))

# Testing the prediction model

In [None]:
# Sample tweet used to exercise the trained classifier in the cells below.
text = "T 3590 -I have tested CoviD positive ..  shifted to Hospital  .. hospital informing  authorities .. family and staff undergone tests , results awaited ..All that have been in close proximity to me in the last 10 days are requested to please get themselves tested !"
# The text is cleaned with preprocess_text and classified in the following cells.

In [None]:
def preprocess_text(text):
    """Clean a single raw tweet for vectorisation.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are removed, and the remaining
    tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        A one-element list holding the cleaned, space-joined tweet, suitable
        for passing to a fitted vectorizer's ``transform``.
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words('english'))
    tokens = re.sub('[^a-zA-Z]', " ", text).lower().split()
    review = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)
    return [review]

In [None]:
# Clean the sample tweet and vectorise it with the already-fitted TF-IDF vocabulary.
# Dedicated names are used so the training `corpus` and feature matrix `X` are not
# overwritten, which keeps the earlier cells re-runnable.
tweet_corpus = preprocess_text(text)
tweet_features = tfidf.transform(tweet_corpus).toarray()
prediction = log.predict(tweet_features)[0]

# Class codes follow the sentiment mapping and minimize_categories:
# 1 = Positive / Extremely Positive, 2 = Neutral, 3 = Negative / Extremely Negative.
if prediction == 1:
    print("your Tweet is Positive")
elif prediction == 2:
    print("Your Tweet is Neutral")
elif prediction == 3:
    print("Your Tweet is Negative")