In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import string

In [None]:
# Load the train and test splits with the same reader settings (latin1 decoding).
train,test=(
    pd.read_csv(csv_path,encoding='latin1')
    for csv_path in (
        "../input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
        "../input/covid-19-nlp-text-classification/Corona_NLP_test.csv",
    )
)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Tweet text is what the models are fitted on; the Sentiment column is the label.
data_train,sentiment_train=train['OriginalTweet'],train['Sentiment']
data_test,sentiment_test=test['OriginalTweet'],test['Sentiment']

In [None]:
# Stack the test tweets on top of the train tweets in one Series; it is only
# used for the word cloud at the end. Row labels are kept as-is
# (ignore_index is not set), so labels from the two frames can repeat.
full_data=pd.concat([data_test,data_train])

# Sample Testing

In [None]:
# Small slices for a quick dry run: the first 50 training tweets with their
# labels, and the first 10 test tweets.
sample,sample_target=data_train.iloc[:50],sentiment_train.iloc[:50]
sample_test=data_test.iloc[:10]

In [None]:
from nltk.stem import WordNetLemmatizer
WNL=WordNetLemmatizer()

Removing punctuation and stopwords, then lemmatizing the remaining words

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def text_process(data):
    """Turn one piece of raw text into a list of cleaned tokens.

    Used both directly (``Series.apply``) and as the ``analyzer`` callable of
    ``CountVectorizer``.

    Steps, in order:
      1. drop every character found in ``string.punctuation``;
      2. split the remaining text on whitespace;
      3. discard tokens whose lower-cased form is an English stopword;
      4. lemmatize each surviving token with ``WNL``.

    Parameters
    ----------
    data : str
        Raw text of a single tweet.

    Returns
    -------
    list of str
        The lemmatized tokens. Tokens keep their original casing; only the
        stopword comparison is case-insensitive.
    """
    no_punct=''.join(ch for ch in data if ch not in string.punctuation)
    # The stopword set is fetched once per call from a cached helper instead of
    # calling stopwords.words('english') for every token and scanning the
    # returned list each time.
    stop_words=_english_stopwords()
    return [WNL.lemmatize(word) for word in no_punct.split()
            if word.lower() not in stop_words]

In [None]:
sample

In [None]:
sample.apply(text_process)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [None]:
CV=CountVectorizer(analyzer=text_process)

In [None]:
CV.fit(sample)

In [None]:
CV.vocabulary_

In [None]:
# Turn the sample tweets into a document-term count matrix.
# NOTE: this rebinds `sample` from the raw tweets to the transformed matrix, so
# the cell is not safe to re-run on its own -- re-run the cell that defines
# `sample` first.
sample=CV.transform(sample)

In [None]:
# Number of stored (non-zero) entries in the sample count matrix
sample.nnz

In [None]:
# Fit the TF-IDF weighting on the sample count matrix and weight that same
# matrix in one step.
Tfidf=TfidfTransformer()
Tfidf_val=Tfidf.fit_transform(sample)

In [None]:
# Push the sample test tweets through the already-fitted vectorizer and TF-IDF
# transformer. `sample_test` is rebound from raw tweets to the weighted matrix.
sample_test=Tfidf.transform(CV.transform(sample_test))

### a) Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Train multinomial Naive Bayes on the TF-IDF sample, then label the sample test tweets.
sentiment=MultinomialNB().fit(Tfidf_val,sample_target)
result1=sentiment.predict(sample_test)

In [None]:
result1

### b) Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forests are randomised; pin the seed so result2 is the same on every
# Restart & Run All.
RFC=RandomForestClassifier(random_state=42)
RFC.fit(Tfidf_val,sample_target)
result2=RFC.predict(sample_test)

In [None]:
result2

In [None]:
sentiment_test.head(10)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
from sklearn.pipeline import Pipeline

# 1) With IDF

### a) Naive Bayes

In [None]:
# Pipeline steps: token counts -> TF-IDF weighting (IDF on) -> multinomial Naive Bayes.
estimator1=[
    ('CV',CountVectorizer(analyzer=text_process)),
    ('Tfidf',TfidfTransformer()),
    ('Final Analysis',MultinomialNB()),
]

In [None]:
pipe1=Pipeline(estimator1)

In [None]:
pipe1.fit(data_train,sentiment_train)

In [None]:
predict1=pipe1.predict(data_test)

In [None]:
print(classification_report(sentiment_test,predict1))

In [None]:
print(confusion_matrix(sentiment_test,predict1))

In [None]:
# True test labels next to the predictions of pipe1 (IDF + Naive Bayes).
df1=pd.DataFrame({'actual':sentiment_test,'predicted':predict1})

In [None]:
df1.head(15)

### b) Random Forest Classifier

In [None]:
# Pipeline steps: token counts -> TF-IDF weighting (IDF on) -> random forest.
# The forest is seeded so the reported scores are reproducible across runs.
estimator2=[('CV',CountVectorizer(analyzer=text_process)),('Tfidf',TfidfTransformer()),
            ('Final Analysis',RandomForestClassifier(random_state=42))]

In [None]:
pipe2=Pipeline(estimator2)

In [None]:
pipe2.fit(data_train,sentiment_train)

In [None]:
predict2=pipe2.predict(data_test)

In [None]:
print(classification_report(sentiment_test,predict2))

In [None]:
print(confusion_matrix(sentiment_test,predict2))

In [None]:
# True test labels next to the predictions of pipe2 (IDF + Random Forest).
df2=pd.DataFrame({'actual':sentiment_test,'predicted':predict2})

In [None]:
df2.head(15)

# 2) Without IDF

### a) Naive Bayes

In [None]:
# Pipeline steps: token counts -> term-frequency weighting only (use_idf=False)
# -> multinomial Naive Bayes.
estimator3=[
    ('CV',CountVectorizer(analyzer=text_process)),
    ('Tfidf',TfidfTransformer(use_idf=False)),
    ('Final Analysis',MultinomialNB()),
]

In [None]:
pipe3=Pipeline(estimator3)

In [None]:
pipe3.fit(data_train,sentiment_train)

In [None]:
predict3=pipe3.predict(data_test)

In [None]:
print(classification_report(sentiment_test,predict3))

In [None]:
print(confusion_matrix(sentiment_test,predict3))

In [None]:
# True test labels next to the predictions of pipe3 (no IDF + Naive Bayes).
df3=pd.DataFrame({'actual':sentiment_test,'predicted':predict3})

In [None]:
df3.head(15)

### b) Random Forest Classifier

In [None]:
# Pipeline steps: token counts -> term-frequency weighting only (use_idf=False)
# -> random forest. The forest is seeded so the reported scores are
# reproducible across runs.
estimator4=[('CV',CountVectorizer(analyzer=text_process)),('Tfidf',TfidfTransformer(use_idf=False)),
            ('Final Analysis',RandomForestClassifier(random_state=42))]

In [None]:
pipe4=Pipeline(estimator4)

In [None]:
pipe4.fit(data_train,sentiment_train)

In [None]:
predict4=pipe4.predict(data_test)

In [None]:
print(classification_report(sentiment_test,predict4))

In [None]:
print(confusion_matrix(sentiment_test,predict4))

In [None]:
# True test labels next to the predictions of pipe4 (no IDF + Random Forest).
df4=pd.DataFrame({'actual':sentiment_test,'predicted':predict4})

In [None]:
df4.head(15)

# Conclusion

### Best prediction by: Random Forest Classifier

# Word Cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
def one_sentence(data):
    """Glue a sequence of tokens back into one space-separated string."""
    separator=' '
    return separator.join(data)
def join_all(data):
    """Merge an iterable of strings into one string for the word cloud.

    Items are separated by a single space. Joining them with no separator
    would fuse the last word of each item with the first word of the next
    one and distort the word frequencies.

    Parameters
    ----------
    data : iterable of str
        The pieces of text to merge (e.g. a Series of cleaned tweets).

    Returns
    -------
    str
        All items joined with single spaces; an empty string for empty input.
    """
    return " ".join(data)

In [None]:
# Clean every tweet and glue its tokens back into one string, in a single pass.
# `full_data` is rebound from raw tweets to the cleaned strings.
full_data=full_data.apply(lambda tweet: one_sentence(text_process(tweet)))

In [None]:
whole=join_all(full_data)

In [None]:
# Preview only the beginning: `whole` holds every cleaned tweet from both
# splits in one string, so echoing it in full floods the notebook output.
whole[:500]

In [None]:
# Build the cloud from the concatenated corpus, then draw it on an explicit
# figure/axes pair with the axes hidden.
wordcloud=WordCloud(background_color='white').generate(whole)

fig,ax=plt.subplots(figsize=(18,10))
ax.imshow(wordcloud)
ax.axis('off')
# Render the figure without echoing the axis-limits tuple returned by axis('off').
plt.show()