<h1 style="background-color:red;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Corona Tweet Analysis</h1>


<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Importing Library and Data</h1>


In [None]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
import keras
from keras.layers import Dense,LSTM,Embedding,Input,GlobalMaxPool1D
from keras.models import Sequential
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
warnings.filterwarnings('ignore')


In [None]:
# Load the training tweets and preview the first rows.
# NOTE(review): TweetAt appears to be stored day-first (DD-MM-YYYY, e.g. 16-03-2020)
# in this dataset - confirm. Without dayfirst=True, ambiguous dates such as
# 03-04-2020 are read month-first, which scatters tweets across the whole year
# in the weekly plot further down.
data=pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
                 encoding='latin1',parse_dates=['TweetAt'],dayfirst=True)
data.head()

In [None]:
# Summary statistics (count / unique / top / freq) for the object-typed columns only.
data.describe(include=['O'])

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
data.info()

In [None]:
# Class balance of the raw sentiment labels, before the extreme classes are merged.
data['Sentiment'].value_counts()

## There is a very high probability that 'Extremely Positive' or 'Extremely Negative' tweets would be mispredicted as 'Positive' or 'Negative', or vice versa, because the words used in those tweets are very similar to each other. To avoid this kind of confusion, we merge each extreme class into its base class.

In [None]:
# Fold the two "Extremely ..." classes into their base class, leaving three labels.
extreme_to_base={
    'Extremely Positive':'Positive',
    'Extremely Negative':'Negative',
}
data['Sentiment']=data['Sentiment'].replace(extreme_to_base)

In [None]:
# Hold out 15% of the tweets for validation; the seed is fixed so the split is reproducible.
train_text,val_text,train_label,val_label=train_test_split(
    data['OriginalTweet'],
    data['Sentiment'],
    random_state=42,
    test_size=0.15,
)

In [None]:
# Fit the encoder on the training labels only, then map those labels to integer codes.
lbl_encoder=LabelEncoder().fit(train_label)
train_label_codes=lbl_encoder.transform(train_label)

In [None]:
# Show the label strings in code order (position i is the label encoded as integer i).
lbl_encoder.classes_

In [None]:
class Lemmatizer(object):
    """Callable tokenizer: lowercases, strips URLs and non-alphanumerics, then lemmatizes.

    Instances are meant to be passed as ``tokenizer=`` to ``CountVectorizer``.
    Calling an instance with a sentence returns the list of lemmatized words
    that are longer than one character.
    """

    # Raw-string patterns (no invalid escape sequences), compiled once per class.
    # The URL tail deliberately excludes the space character: allowing it made a
    # single URL match swallow every ordinary word that followed it.
    _URL_PATTERN = re.compile(r'(https?://)?([\da-z.-]+)\.([a-z.]{2,6})([/\w.-]*)')
    _NON_ALNUM_PATTERN = re.compile(r'[^0-9a-z]')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, sentence):
        """Tokenize ``sentence`` (a string) into a list of lemmatized words."""
        # Both patterns only know lowercase letters, so normalise case first;
        # otherwise uppercase characters would be blanked out as "non-alphanumeric".
        sentence = sentence.lower()
        sentence = self._URL_PATTERN.sub(' ', sentence)
        sentence = self._NON_ALNUM_PATTERN.sub(' ', sentence)
        return [self.lemmatizer.lemmatize(word) for word in sentence.split() if len(word) > 1]

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Two Processes to predict our required classes</h1>


<h1 style="background-color:lightgreen;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Process 1</h1>


In [None]:
# Bag-of-words vectorizer limited to 5000 features, using the custom
# Lemmatizer callable as its tokenizer.
tokenizer=CountVectorizer(
    tokenizer=Lemmatizer(),
    lowercase=True,
    stop_words='english',
    max_features=5000,
)


In [None]:
# Learn the vocabulary from the training tweets, then densify the count matrix.
train_counts=tokenizer.fit_transform(train_text)
train_x=train_counts.toarray()

In [None]:
# Inspect the full configuration of the fitted vectorizer.
tokenizer.get_params()

In [None]:
# Vocabulary terms in column order of the count matrix.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older installations.
if hasattr(tokenizer,'get_feature_names_out'):
    feature_names=list(tokenizer.get_feature_names_out())
else:
    feature_names=tokenizer.get_feature_names()

In [None]:
# Encode the validation split with the already-fitted encoder and vectorizer
# (transform only - nothing is re-fitted on validation data).
val_label_codes=lbl_encoder.transform(val_label)
val_counts=tokenizer.transform(val_text)
val_x=val_counts.toarray()

In [None]:
# LogisticRegression is already imported in the notebook's import cell, so the
# redundant mid-notebook import is dropped.
# max_iter is raised above the default of 100: with 5000 count features the solver
# may stop before converging, and the resulting ConvergenceWarning is hidden by
# the global warnings filter.
model_p1=LogisticRegression(max_iter=1000)
model_p1.fit(train_x,train_label_codes)

In [None]:
# Print each title on its own line so the report's column header stays aligned,
# and show the class names instead of the bare integer codes.
print('Validation classification report',
      classification_report(val_label_codes,model_p1.predict(val_x),
                            target_names=lbl_encoder.classes_),
      sep='\n')
print('Training classification report',
      classification_report(train_label_codes,model_p1.predict(train_x),
                            target_names=lbl_encoder.classes_),
      sep='\n')

<h1 style="background-color:skyblue;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Data Visualization</h1>

In [None]:
# Replace the sentiment strings with their integer codes.
# Guarded so the cell can be re-run: once the column holds integers, calling
# transform() again would raise on the (now unseen) numeric labels.
if not pd.api.types.is_numeric_dtype(data.Sentiment):
    data.Sentiment=lbl_encoder.transform(data.Sentiment)

In [None]:
# Weekly non-null counts per column, bucketed on the tweet timestamp.
tweets_by_time=data.set_index('TweetAt')
df=tweets_by_time.resample('W').count()

In [None]:
# Line chart of the weekly tweet volume.
plt.figure(figsize=(15,10))
df['OriginalTweet'].plot()
plt.title('Number of Tweet on Weekly basis in year 2020',fontdict={'size':'20'})
plt.ylabel('Number of tweets')
# plt.show() renders the figure; the original plt.plot() with no arguments
# draws nothing and only echoes an empty list.
plt.show()

In [None]:
# Pie chart of the class balance in the training split.
# Labels and colours are derived from the value_counts() index, so a slice can
# never be mislabelled if the frequency order of the classes changes.
sentiment_counts=train_label.value_counts()
color_by_sentiment={'Positive':'green','Negative':'red','Neutral':'blue'}
plt.figure(figsize=(15,10))
plt.pie(sentiment_counts,explode=[0.01,0.01,0.001],
        colors=[color_by_sentiment[name] for name in sentiment_counts.index],
        labels=sentiment_counts.index,autopct='%0.2f%%',radius=1,startangle=45)
plt.title('Sentiments',fontdict={'size':'20'})
plt.show()

In [None]:
# Word cloud of vocabulary terms 1500-3499, drawn inside the Twitter-logo mask.
twitter_mask=np.array(Image.open('../input/corona-virus/twitter.png'))
twitter_cloud_text=' '.join(feature_names[1500:3500])
wc=WordCloud(
    background_color='white',
    mask=twitter_mask,
    max_words=300,
)
wc.generate(twitter_cloud_text)
plt.figure(figsize=(20,15))
plt.axis('off')
plt.imshow(wc)

In [None]:
# Word cloud of vocabulary terms from index 3500 onwards, drawn inside the virus-shaped mask.
corona_mask=np.array(Image.open('../input/corona-virus/coronav.jpg'))
corona_cloud_text=' '.join(feature_names[3500:])
wc_corona=WordCloud(
    background_color='white',
    mask=corona_mask,
    max_words=300,
)
wc_corona.generate(corona_cloud_text)
plt.figure(figsize=(20,15))
plt.axis('off')
plt.imshow(wc_corona)

<h1 style="background-color:lightgreen;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Process 2</h1>


In [None]:
# Training callbacks, both watching validation accuracy.
# Stop training after 3 epochs without improvement.
early_stop=EarlyStopping(monitor='val_accuracy',patience=3)
# Reduce the learning rate after 2 epochs without improvement.
# The monitor name was misspelled ('val_accuarcy'); Keras cannot find that metric,
# so the callback never reduced the learning rate.
reduceLR=ReduceLROnPlateau(monitor='val_accuracy',patience=2)

In [None]:
# Keras tokenizer restricted to the 5000 most frequent words.
# oov_token must be a string placeholder for out-of-vocabulary words; the original
# passed a Lemmatizer instance, which only got stored as an odd word_index key
# and performed no lemmatization.
token=Tokenizer(num_words=5000,oov_token='<OOV>')
token.fit_on_texts(train_text)
# Integer-encode the tweets, then pad/truncate every sequence to 60 tokens at the end.
train_x_2=token.texts_to_sequences(train_text)
train_x_2=pad_sequences(train_x_2,maxlen=60,padding='post',truncating='post')

In [None]:
# Validation tweets: integer-encode with the fitted tokenizer, then pad/truncate to 60 tokens.
val_x_2=pad_sequences(
    token.texts_to_sequences(val_text),
    maxlen=60,
    padding='post',
    truncating='post',
)

In [None]:
# LSTM classifier over padded sequences of 60 token ids, with 3 output classes.
embedding_dimension=32
v=len(token.word_index)
# texts_to_sequences only emits indices below num_words, so the embedding table
# does not need a row for every word the tokenizer has ever seen.
vocab_size=min(v+1,token.num_words) if token.num_words else v+1
model=Sequential()
model.add(Input(shape=(60,)))
model.add(Embedding(vocab_size,embedding_dimension))
model.add(LSTM(64,return_sequences=True))
# Keep the strongest activation of each LSTM feature across the time steps.
model.add(GlobalMaxPool1D())
# A Dense layer without activation directly before another Dense layer is just a
# second linear map; ReLU makes the hidden layer contribute.
model.add(Dense(64,activation='relu'))
model.add(Dense(3,activation='softmax'))

In [None]:
# Compile for integer-coded labels and train, validating on the held-out split.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
fit_options=dict(
    validation_data=(val_x_2,val_label_codes),
    epochs=20,
    batch_size=50,
    callbacks=[reduceLR,early_stop],
)
r=model.fit(train_x_2,train_label_codes,**fit_options)

In [None]:
# Training vs. validation loss per epoch.
plt.plot(r.history['loss'],label='loss')
plt.plot(r.history['val_loss'],label='val_loss')
plt.title('LOSS',fontdict={'size':'22'})
plt.xlabel('Epoch')
# The legend tells the two curves apart; plt.show() replaces the no-op plt.plot().
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(r.history['accuracy'],label='accuracy')
plt.plot(r.history['val_accuracy'],label='val_accuracy')
plt.title('Accuracy',fontdict={'size':'22'})
plt.xlabel('Epoch')
# The legend tells the two curves apart; plt.show() replaces the no-op plt.plot().
plt.legend()
plt.show()

<h1 style="background-color:lightblue;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Predicting on test dataset</h1>


In [None]:
# Load the held-out test tweets (same latin1 encoding as the training file).
test_data=pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    encoding='latin1',
)

In [None]:
# Apply the same class merge that was applied to the training labels.
test_extreme_to_base={
    'Extremely Positive':'Positive',
    'Extremely Negative':'Negative',
}
test_data['Sentiment']=test_data['Sentiment'].replace(test_extreme_to_base)

In [None]:
# Integer codes for the test labels, using the encoder fitted on the training labels.
test_label=lbl_encoder.transform(test_data.Sentiment)

In [None]:
# Raw tweet text of the test set.
test_x=test_data['OriginalTweet']

<h1 style="background-color:lightgreen;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">By Process 1</h1>


In [None]:
# Bag-of-words counts for the test tweets, using the vocabulary learned on the training split.
test_counts=tokenizer.transform(test_x)
test_x_1=test_counts.toarray()

In [None]:
# Process 1 (logistic regression) evaluated on the test set.
test_pred_1=model_p1.predict(test_x_1)
print(classification_report(test_label,test_pred_1))

<h1 style="background-color:lightgreen;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">By Process 2</h1>


In [None]:
# Test tweets: integer-encode with the fitted Keras tokenizer, then pad/truncate to 60 tokens.
test_x_2=pad_sequences(
    token.texts_to_sequences(test_data['OriginalTweet']),
    maxlen=60,
    padding='post',
    truncating='post',
)

In [None]:
# Process 2 (LSTM) evaluated on the test set.
# Sequential.predict_classes() was removed from Keras (TensorFlow 2.6+); taking the
# argmax over the softmax outputs gives the same predicted class ids.
test_pred_2=np.argmax(model.predict(test_x_2),axis=-1)
print(classification_report(test_label,test_pred_2))

<h1 style="background-color:lightblue;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 50px;padding: 5px">Conclusion</h1>


## Both processes worked quite well, though the LSTM model worked better than the linear model: a sequential model reads the text in order and so captures more of its structure, whereas the linear model only looks at which common words occur and predicts its output accordingly. For everyday NLP practice, sequential models are therefore mostly preferred.

<h1 style="background-color:pink;font-family:newtimeroman;font-size:550%;text-align:center;border-radius: 15px 10px;padding: 5px">If you liked this notebook, please upvote!</h1>
