In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt


In [None]:
# Both CSV files are read with the same encoding and the same date column parsed.
_csv_options = dict(encoding='ISO-8859-1', parse_dates=['TweetAt'])

train_df = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    **_csv_options,
)
test_df = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',
    **_csv_options,
)

In [None]:
# Preview only the first rows of the test split instead of dumping the whole frame.
test_df.head()

In [None]:
# Preview the first five rows of the training split.
train_df.head(n=5)

In [None]:
# Train and test tweets are preprocessed together, so both splits are stacked
# into a single frame. The `is_test` flag records where each row came from
# (0 = train, 1 = test) so the splits can be separated again afterwards.
train_df['is_test'] = 0
test_df['is_test'] = 1

# Stack train on top of test and give the result a fresh 0..n-1 index.
comp_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Class balance of the sentiment labels over the combined frame.
# value_counts() is indexed by sentiment, so the bar chart shows the sentiment
# classes along the x axis and the number of tweets along the y axis.
ax = comp_df.Sentiment.value_counts().plot.bar(figsize=(7, 4), rot=0)  # rot=0 keeps tick labels horizontal
ax.set_title('Number of tweets in different sentiments', fontsize=12)
ax.set_xlabel('Sentiment', fontsize=12)
ax.set_ylabel('Number of tweets', fontsize=12)

In [None]:
# Only the tweet text, its sentiment and the train/test flag are needed from
# here on; the remaining columns are dropped and the kept ones get short names.
comp_df = comp_df[['OriginalTweet', 'Sentiment', 'is_test']].rename(
    columns={'OriginalTweet': 'tweet', 'Sentiment': 'label'}
)

In [None]:
def _clean_tweet_text(tweets):
    """Return the tweets reduced to lower-case ASCII letters and single spaces.

    Every pattern is applied with ``regex=True`` spelled out. Without it,
    pandas versions whose ``Series.str.replace`` defaults to literal matching
    would look for the pattern text itself and leave the tweets uncleaned.

    Args:
        tweets: pandas Series of raw tweet strings.

    Returns:
        pandas Series with the same index holding the cleaned strings.
    """
    return (
        tweets
        .str.replace(r'(@\w*)', '', regex=True)        # remove @ tags
        .str.replace(r"http\S+", "", regex=True)       # remove URLs
        .str.replace(r'#\w+', "", regex=True)          # remove # tags
        .str.replace(r"[^a-zA-Z ]", "", regex=True)    # remove everything that is not a letter or a space
        .str.replace(r'( +)', " ", regex=True)         # collapse runs of spaces
        .str.strip()                                   # trim leading/trailing whitespace
        .str.lower()                                   # change to lowercase
    )


comp_df['tweet'] = _clean_tweet_text(comp_df['tweet'])

In [None]:
# Encode the five sentiment classes as integers ordered from most negative (0)
# to most positive (4). Values that are not listed are left unchanged.
SENTIMENT_CODES = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}
comp_df['label'] = comp_df['label'].replace(SENTIMENT_CODES)

In [None]:
# Number of tweets per encoded sentiment label.
comp_df['label'].value_counts()

In [None]:
# Word tokenization
from spacy.lang.en import English
# English() instantiates the English language class directly: a blank pipeline
# with the tokenizer and language data only. It does not load a trained tagger,
# parser, NER component or word vectors; those come from a packaged model
# loaded through spacy.load().
nlp = English()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [None]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters filtered out by spacy_tokenizer.
# This is a plain str, so `token in punctuations` is a substring test.
punctuations = string.punctuation

# Trained English pipeline, loaded by its package name. The 'en' shortcut link
# is only available in spaCy 2.x, whereas the package name works on 2.x and 3.x.
# NOTE(review): spacy_tokenizer calls `parser`, not `nlp`; confirm this model
# is still needed before paying its load time.
nlp = spacy.load('en_core_web_sm')

# spaCy's English stop-word set (the STOP_WORDS name imported above).
stop_words = STOP_WORDS

# Blank English pipeline (tokenizer and language data only) used by spacy_tokenizer.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with ``parser`` and return the normalized tokens.

    Each token is replaced by its lemma, lower-cased and stripped. Two cases
    fall back to the token's own text:

    * the lemma is the "-PRON-" placeholder, in which case the lower-cased
      token text is used;
    * the lemma is empty, which happens when the pipeline sets no lemma. The
      original code turned every such token into '' and discarded it, leaving
      the vectorizer with nothing to count.

    Stop words, punctuation and empty strings are removed from the result.

    Args:
        sentence: text to tokenize.

    Returns:
        list of str: the remaining normalized tokens, in document order.
    """
    doc = parser(sentence)

    normalized = []
    for token in doc:
        lemma = token.lemma_
        if lemma == "-PRON-":
            normalized.append(token.lower_)
        else:
            # `lemma or token.text` keeps the word when no lemma is available.
            normalized.append((lemma or token.text).lower().strip())

    # Whitespace-only tokens are '' after strip(); drop them explicitly
    # together with stop words and punctuation.
    return [
        tok for tok in normalized
        if tok and tok not in stop_words and tok not in punctuations
    ]

In [None]:
# Custom transformer that normalizes raw text before vectorization
class predictors(TransformerMixin):
    """Stateless pipeline step that applies clean_text to every input text."""

    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding clean_text(text) for each text in X."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this step has no parameters."""
        return {}

# Basic text normalization used by the `predictors` transformer
def clean_text(text):
    """Return `text` without surrounding whitespace and in lower case."""
    trimmed = text.strip()
    return trimmed.lower()

In [None]:
# Bag-of-words vectorizer over single tokens, tokenized by spacy_tokenizer.
bow_vector = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=spacy_tokenizer,
)

In [None]:
# TF-IDF vectorizer that tokenizes with spacy_tokenizer.
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
)

In [None]:
# Separate the combined frame back into train and test rows via the is_test flag.
train_rows = comp_df['is_test'] == 0
test_rows = comp_df['is_test'] == 1

x_train = comp_df.loc[train_rows, 'tweet']
y_train = comp_df.loc[train_rows, 'label']
x_test = comp_df.loc[test_rows, 'tweet']
y_test = comp_df.loc[test_rows, 'label']

In [None]:
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Logistic-regression model; defined here but not used by the pipeline below.
classifier = LogisticRegression(max_iter=500)

# Random forest used by the pipeline. Only non-default settings are spelled out:
# - `min_impurity_split` is no longer passed; it was None (the default) and the
#   argument has been removed from current scikit-learn releases.
# - max_features='sqrt' is what 'auto' meant for a classifier; 'auto' is no
#   longer accepted by current releases.
# - random_state is fixed so repeated runs build the same forest.
RandomFoest_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    max_features='sqrt',
    class_weight='balanced',
    n_jobs=1,
    random_state=42,
)

# Pipeline: text clean-up -> TF-IDF features -> random forest
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', RandomFoest_model)])

# model generation
pipe.fit(x_train, y_train)

In [None]:
# Predict a sentiment label for every tweet in the test split.
y_predict = pipe.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test-split labels against the pipeline's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)


In [None]:

# Draw the confusion matrix as an annotated heat map.
sns.set(font_scale=1.4, color_codes=True, palette="deep")
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    annot_kws={"size": 16},
    cmap="YlGnBu",
)
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted Value")
heatmap_ax.set_ylabel("True Value")