In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings(action="ignore")
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

In [None]:
# Load the training tweets; the CSV is decoded as Latin-1.
df = pd.read_csv(
    "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv",
    encoding="Latin-1",
)
df.head()

In [None]:
# shape of dataset
df.shape

In [None]:
# checking for null values
df.isna().sum()

In [None]:
# Datatypes of each column
df.info()

In [None]:
df.head().T

In [None]:
df.Sentiment.unique()

In [None]:
# Collapse the original sentiment labels into two classes ("Negative" / "Positive").
# The mapping is applied to the Sentiment column only: a frame-wide df.replace would
# also rewrite any other cell (e.g. a location) whose value happens to equal one of
# these strings. "Neutral" is deliberately folded into "Negative".
df["Sentiment"] = df["Sentiment"].replace(
    {
        "Extremely Negative": "Negative",
        "Extremely Positive": "Positive",
        "Neutral": "Negative",
    }
)

In [None]:
df.Sentiment.unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the sentiment strings into integer codes.
# NOTE(review): presumably "Negative" -> 0 and "Positive" -> 1; confirm with le.classes_.
le = LabelEncoder()
df["Sentiment"] = le.fit_transform(df["Sentiment"])

In [None]:
df.head()

In [None]:
df.Sentiment.value_counts()

In [None]:
# Class distribution of the full dataset. The column is passed as `x=` because
# seaborn only accepts `data` positionally (0.12+); a positional Series would be
# interpreted as wide-form data instead of the variable to count.
sns.countplot(x=df["Sentiment"])

In [None]:
# Balance the classes: keep at most the first 18000 rows of each encoded label.
df1 = df.loc[df["Sentiment"] == 1].head(18000)
df2 = df.loc[df["Sentiment"] == 0].head(18000)

In [None]:
# Stack the two balanced class samples into one training frame, keeping the
# original row labels. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in pandas 2.0.
df_train = pd.concat([df1, df2])
df_train.head()

In [None]:
# Keep only the tweet text and its encoded sentiment label.
df_train = df_train.loc[:, ["OriginalTweet", "Sentiment"]]
df_train.head()

In [None]:
df_train.shape

In [None]:
# Class distribution of the balanced training frame. The column is passed as `x=`
# because seaborn only accepts `data` positionally (0.12+).
sns.countplot(x=df_train["Sentiment"])

In [None]:
# Tokenization: load spaCy's small English pipeline and pick the tweet stored
# under row label 23 as a worked example.
spacy_tok = spacy.load('en_core_web_sm')
sample_tweet = df_train["OriginalTweet"].loc[23]
sample_tweet

In [None]:
# Run the sample tweet through the loaded spaCy pipeline; the result is reused by
# the token table, sentence and dependency cells further down.
parsed_tweet = spacy_tok(sample_tweet)
parsed_tweet

In [None]:
# Download the explacy helper module into the working directory so it can be imported.
# NOTE(review): this fetches unpinned code from the repository's master branch at run
# time and requires network access.
!wget https://raw.githubusercontent.com/tylerneylon/explacy/master/explacy.py

In [None]:
import explacy
explacy.print_parse_info(spacy_tok,'Covid-19 has various Symptoms') # text for example

In [None]:
explacy.print_parse_info(spacy_tok,df_train.OriginalTweet[23])

In [None]:
# Build a per-token attribute table for the parsed sample tweet.
# The frame is created once from a list of records instead of being grown cell by
# cell with .loc, and the lemma is stored as a plain string (the previous trailing
# comma after token.lemma_ turned the value into a 1-tuple).
tokenized_text = pd.DataFrame(
    [
        {
            'text': token.text,
            'lemma': token.lemma_,
            'pos': token.pos_,
            'tag': token.tag_,
            'dep': token.dep_,
            'shape': token.shape_,
            'is_alpha': token.is_alpha,
            'is_stop': token.is_stop,
            'is_punctuation': token.is_punct,
        }
        for token in parsed_tweet
    ]
)

tokenized_text[:20]

In [None]:
spacy.explain('GPE')

**Dependency Parsing**:
* Syntactic parsing, or dependency parsing, is the process of identifying sentences and assigning a syntactic structure to them; for example, a subject combined with an object makes up a sentence. spaCy provides a parse tree that can be used to generate this structure.

**Sentence Boundary Detection**:
* Figuring out where sentences start and end is important in NLP.

In [None]:
# Sentence boundary detection: collect the sentence spans of the parsed tweet.
# Iterating the Doc directly yields individual tokens, not sentences, so the
# `.sents` iterator is used instead.
sentence_spans = list(parsed_tweet.sents)
sentence_spans

In [None]:
displacy.render(parsed_tweet, style='dep',jupyter=True, options={"distance":140})

In [None]:
!pip install scattertext
import scattertext as st
nlp = spacy.load('en',disable_pipes=["tagger","ner"])

In [None]:
df_train.head()

In [None]:
# Parse every tweet with the spaCy pipeline and store the results in a new column.
df_train['parsed'] = df_train['OriginalTweet'].apply(nlp)
# Build a scattertext corpus from the parsed tweets, categorised by sentiment label.
corpus = st.CorpusFromParsedDocuments(
    df_train,
    category_col="Sentiment",
    parsed_col="parsed",
).build()

In [None]:
df_train.head()

**SpaCy Text Categorizer**:
* We will train a convolutional neural network text classifier on our coronavirus tweets using spaCy's new TextCategorizer component.
* spaCy provides a classification model that supports multiple, non-mutually-exclusive labels. The TextCategorizer uses its own CNN to balance weights and other pipeline components.

In [None]:
# Pair each tweet with its label as a (text, sentiment) tuple, aligned on the frame's index.
df_train['tuples'] = pd.Series(
    list(zip(df_train["OriginalTweet"], df_train["Sentiment"])),
    index=df_train.index,
)
# Plain Python list of the pairs, consumed by load_data().
train = list(df_train["tuples"])
train[:6]

In [None]:
#functions from spaCy documentation
def load_data(limit=0, split=0.8):
    """Shuffle the (text, label) pairs in ``train`` and split them for training.

    Args:
        limit: Number of examples to keep after shuffling (the last ``limit``
            items are taken). ``0`` keeps every example, because ``[-0:]``
            slices the whole list.
        split: Fraction of the kept examples assigned to the training portion;
            the remainder becomes the evaluation portion.

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the texts
        are tuples of strings and the cats are lists of ``{'POSITIVE': bool}``
        dicts.

    Raises:
        ValueError: If ``train`` contains no examples.
    """
    # Shuffle a copy so the module-level `train` list keeps its order and
    # re-running earlier cells still shows the same data.
    train_data = list(train)
    if not train_data:
        raise ValueError("no training examples available: `train` is empty")
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(y)} for y in labels]
    # Index of the first evaluation example.
    n_train = int(len(train_data) * split)
    return (texts[:n_train], cats[:n_train]), (texts[n_train:], cats[n_train:])

def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of the text classifier.

    Args:
        tokenizer: Callable applied to each text; its result is fed to
            ``textcat.pipe``.
        textcat: Object whose ``pipe`` method yields documents exposing a
            ``cats`` mapping of label -> score.
        texts: Iterable of raw texts to score.
        cats: Sequence of gold-standard dicts (label -> truth value), indexed
            in the same order as ``texts``.

    Returns:
        Dict with keys ``'textcat_p'``, ``'textcat_r'`` and ``'textcat_f'``.
    """
    # Every counter starts at a tiny epsilon so the ratios below never divide by zero.
    tp = fp = fn = tn = 1e-8
    scored_docs = textcat.pipe(tokenizer(text) for text in texts)
    for index, doc in enumerate(scored_docs):
        expected = cats[index]
        for label, score in doc.cats.items():
            # Labels without a gold annotation are ignored.
            if label in expected:
                truth = expected[label]
                if score >= 0.5:
                    if truth >= 0.5:
                        tp += 1.
                    elif truth < 0.5:
                        fp += 1.
                elif score < 0.5:
                    if truth < 0.5:
                        tn += 1  # tracked but not part of the returned metrics
                    elif truth >= 0.5:
                        fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

# Number of texts to train from ("t", int); passed to load_data() as `limit`.
# Increase it if more computational power is available.
n_texts = 30000

# Number of training iterations ("n", int).
n_iter = 10

In [None]:
# Load a fresh English pipeline for classifier training; this rebinds `nlp`,
# replacing the pipeline that was loaded earlier for the scattertext corpus.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Reuse the pipeline's text classifier when one is already registered; otherwise
# create the built-in component and append it as the last pipeline step.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

# Register the single label the classifier will predict.
textcat.add_label('POSITIVE')

# Shuffle the tweets and split them into training and evaluation portions.
print("Loading Covid Tweets data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print(
    "Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)
    )
)
# Pair every training text with its annotation dict in the {'cats': ...} form.
train_data = [
    (text, {'cats': cats}) for text, cats in zip(train_texts, train_cats)
]

In [None]:
# Train the text classifier for n_iter passes over train_data, printing the
# accumulated loss and the dev-set precision / recall / F-score after each pass.
# NOTE(review): begin_training() and update(texts, annotations, ...) look like the
# spaCy v2 training API; confirm the installed spaCy version before running.
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        # losses is reset every pass, so the printed value covers one pass only
        losses = {}
        # batch up the examples using spaCy's minibatch
        # NOTE(review): train_data is not reshuffled between passes; it is only
        # shuffled once inside load_data(). Presumably compounding(4., 32., 1.001)
        # grows the batch size from 4 towards 32; confirm against the spaCy docs.
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            # split the batch of (text, annotation) pairs into two parallel tuples
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # score the dev set with the optimizer's averaged parameters swapped in
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

In [None]:
# Score a hand-written sentence with the trained pipeline; doc.cats holds the
# text classifier's label scores.
# NOTE(review): test_text2 is defined here but not scored anywhere in the visible notebook.
test_text1 = "Mercedes is going to launch its new Car this weekend."
test_text2 = "Coronavirus is grown to mutate itself."
doc = nlp(test_text1)
test_text1, doc.cats

In [None]:
df_train["OriginalTweet"][2900]

In [None]:
# Run the trained pipeline on the tweet stored under index 2900 of the training frame.
doc3 = nlp(df_train["OriginalTweet"][2900])

In [None]:
df_train["OriginalTweet"][2900], doc3.cats

In [None]:
df_train["OriginalTweet"][26770]

In [None]:
# Run the trained pipeline on the tweet stored under index 26770 and show the
# tweet next to its label scores.
doc4 = nlp(df_train["OriginalTweet"][26770])
df_train["OriginalTweet"][26770], doc4.cats

In [None]:
# Run the trained pipeline on the tweet stored under index 12500 and show the
# tweet next to its label scores.
doc5 = nlp(df_train["OriginalTweet"][12500])
df_train["OriginalTweet"][12500], doc5.cats

* Now we can apply this model to our test dataset and obtain, for each tweet, its predicted sentiment (the score of the `POSITIVE` label).

Author: Purvit Vashishtha