In [None]:
import os
import re

import nltk
import pandas as pd
import wordcloud
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import EnglishStemmer
from numpy import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Read the training split; the file is decoded as latin1.
dfTrain = pd.read_csv(
    "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
    encoding='latin1',
)
dfTrain

In [None]:
# Session-wide seed, drawn once, so every preview that uses it samples the same positions.
seed=random.randint(0,1000,1)[0]
def randomTweetsSelection(series,randomSeedSelection=True,count=10):
    """Print randomly chosen entries of ``series`` for a quick visual check.

    Parameters
    ----------
    series : pandas.Series or sequence
        Items to sample from. Entries are picked by *position*, so the
        index labels of a Series do not matter.
    randomSeedSelection : bool, default True
        When true, positions are drawn from a private generator seeded with
        the module-level ``seed``, so repeated calls show the same positions
        and the global NumPy random state is left untouched. When false,
        positions are drawn from the global NumPy random state.
    count : int, default 10
        Number of entries to print (drawn with replacement).

    Returns
    -------
    None
        Output is printed as ``"<position>)\\t<value>"`` followed by a blank line.
    """
    total=len(series)
    if total==0:
        # Nothing to sample; randint(0, 0, ...) would raise ValueError.
        return
    # A private RandomState yields the same draws as reseeding the global
    # generator would, without the hidden side effect on later random calls.
    rng=random.RandomState(seed) if randomSeedSelection else random
    # Positional access for pandas objects; plain sequences are indexed directly.
    values=getattr(series,"iloc",series)
    for i in rng.randint(0,total,count):
        print(f"{i})\t{values[i]}\n")
# Preview a handful of raw training tweets.
randomTweetsSelection(dfTrain["OriginalTweet"])

In [None]:
# Number of missing values in each column of the training frame.
dfTrain.isna().sum()

In [None]:
# Lower-case every tweet and break it into tokens on back-tick, space or apostrophe.
spaceSplitData = dfTrain["OriginalTweet"].map(
    lambda tweet: re.split("`| |'", tweet.lower())
)
# Preview the token lists produced by the split.
randomTweetsSelection(spaceSplitData)

In [None]:
# Drop English stop words, then scrub the surviving tokens.
stop = stopwords.words('english')
stopWordSet = set(stop)  # set membership avoids a linear scan of the list per token
preprocessedData=spaceSplitData.apply(lambda tokens:[t for t in tokens if t not in stopWordSet])
# Patterns are applied to each token in this order: @mentions, http(s) URLs,
# #hashtags, control characters, then any remaining non-word/non-space character.
# Raw strings keep the regex escapes (\w, \s) out of Python's own string-escape
# processing; inside a character class \b still means backspace, so the
# control-character pattern matches the same characters as before.
for pattern in [r'@\w+',r'(http)s?:[/\w.]+',r'#\w+',r'[\n\r\t\b\f]',r'[^\w\s]']:
    compiled = re.compile(pattern)
    preprocessedData=preprocessedData.apply(lambda tokens:[compiled.sub("", t) for t in tokens])
# Tokens that were entirely noise are now empty strings; discard them.
preprocessedData=preprocessedData.apply(lambda tokens:[t for t in tokens if t!=""])
randomTweetsSelection(preprocessedData)

In [None]:
# Compare how two stemmers and the WordNet lemmatizer (verb mode) treat one sample word.
# Each tool gets its own name instead of rebinding a single variable to three
# unrelated object types; the classes are imported at the top of the notebook.
word='us'
porterStemmer=PorterStemmer()
print(f"Porter Stemmer: {porterStemmer.stem(word)}")
snowballStemmer=EnglishStemmer()
print(f"Snowball Stemmer: {snowballStemmer.stem(word)}")
wordNetLemmatizer=WordNetLemmatizer()
print(f"Word Net Lemmatizer: {wordNetLemmatizer.lemmatize(word,'v')}")

In [None]:
def POSTags(x):
    """Lemmatise a list of tokens using their part-of-speech tags.

    Parameters
    ----------
    x : list of str
        Tokens of one tweet, passed as-is to ``nltk.pos_tag``.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.

    Notes
    -----
    Tags starting with ``J``, ``V``, ``N`` or ``R`` are lemmatised as
    adjective, verb, noun or adverb respectively. Every other tag falls back
    to the verb setting. The fallback is fixed rather than random so that
    repeated runs, and the separate train and test passes, produce identical
    lemmas for identical input.
    """
    # First letter of the tagger's tag -> WordNet part-of-speech constant.
    posByTagInitial={
        'J':wordnet.ADJ,
        'V':wordnet.VERB,
        'N':wordnet.NOUN,
        'R':wordnet.ADV,
    }
    fallbackPos=wordnet.VERB
    lemmatizer=WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word,posByTagInitial.get(tag[:1],fallbackPos))
        for word,tag in nltk.pos_tag(x)
    ]
# Replace each token list with its lemmatised counterpart, then preview the result.
# NOTE: this rebinds preprocessedData, so re-running the cell lemmatises the lemmas again.
preprocessedData = preprocessedData.map(POSTags)
randomTweetsSelection(preprocessedData)

In [None]:
# Flatten the per-tweet token lists into one long list, then into a single string.
wordCloudList=[token for tokens in preprocessedData for token in tokens]
wordCloudText=" ".join(wordCloudList)
covidWordCloud = wordcloud.WordCloud(
    width = 800,
    height = 800,
    background_color ='white',
    stopwords = wordcloud.STOPWORDS,
    min_font_size = 10,
).generate(wordCloudText)

# Draw the word cloud on a square figure with the axes hidden and no padding.
fig, ax = plt.subplots(figsize = (8, 8), facecolor = None)
ax.imshow(covidWordCloud)
ax.axis("off")
fig.tight_layout(pad = 0)
plt.show()

In [None]:
# Join each token list back into one space-separated string per tweet.
sentenceFromTokens=preprocessedData.apply(" ".join)
cv=CountVectorizer()
cv.fit(sentenceFromTokens)
def cvTransformer(sentences=None):
    """Vectorise sentences with the fitted ``cv`` vocabulary.

    Parameters
    ----------
    sentences : iterable of str, optional
        Sentences to transform. When omitted, the *current* value of the
        global ``sentenceFromTokens`` is used, so a bare ``cvTransformer()``
        call follows whatever that name is bound to at call time.

    Returns
    -------
    The document-term matrix returned by ``cv.transform``. It is left sparse
    because a dense copy of a documents-by-vocabulary matrix can exhaust
    memory; call ``.toarray()`` on the result if a dense array is required.
    """
    if sentences is None:
        sentences=sentenceFromTokens
    return cv.transform(sentences)
mnb=MultinomialNB()
# Train on the sparse counts against the Sentiment labels.
mnb.fit(cvTransformer(),dfTrain.Sentiment)
dfTest=pd.read_csv("../input/covid-19-nlp-text-classification/Corona_NLP_test.csv",encoding='latin1')
dfTest

In [None]:
# Number of missing values in each column of the test frame.
dfTest.isna().sum()

In [None]:
# Run the test tweets through the same steps as the training tweets:
# lower-case and split, drop stop words, scrub noise, lemmatise, re-join.
spaceSplitData=dfTest.OriginalTweet.apply(lambda x: re.split("`| |'",x.lower()))
stop = stopwords.words('english')
stopWordSet = set(stop)  # set membership avoids a linear scan of the list per token
preprocessedData=spaceSplitData.apply(lambda tokens:[t for t in tokens if t not in stopWordSet])
# Raw strings keep the regex escapes (\w, \s) out of Python's own string-escape
# processing; inside a character class \b still means backspace, so the
# control-character pattern matches the same characters as before.
for pattern in [r'@\w+',r'(http)s?:[/\w.]+',r'#\w+',r'[\n\r\t\b\f]',r'[^\w\s]']:
    compiled = re.compile(pattern)
    preprocessedData=preprocessedData.apply(lambda tokens:[compiled.sub("", t) for t in tokens])
preprocessedData=preprocessedData.apply(lambda tokens:[t for t in tokens if t!=""])
preprocessedData=preprocessedData.apply(POSTags)
sentenceFromTokens=preprocessedData.apply(lambda x:" ".join(x))
# Vectorise the test sentences explicitly with the vocabulary fitted on the
# training data, and keep the matrix sparse for prediction.
pred=mnb.predict(cv.transform(sentenceFromTokens))
pred

In [None]:
# Fraction of test tweets whose predicted sentiment matches the labelled one.
accuracy_score(y_true=dfTest["Sentiment"], y_pred=pred)

In [None]:
# Confusion matrix of labelled versus predicted sentiment on the test set.
confusion_matrix(y_true=dfTest["Sentiment"], y_pred=pred)