In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Before starting with the approach, I would like to thank Ms. Poonam, who previously worked on Amazon's Food Reviews dataset and wrote a beautiful tutorial on spaCy!

#### I have used her [notebook](https://www.kaggle.com/poonaml/text-classification-using-spacy) as a reference for my approach to Covid-19 text classification, dealing with sentiments ranging from "Extremely Negative" (Score: 0) to "Extremely Positive" (Score: 5).

* We are going to tackle an interesting natural language processing problem, i.e. text classification. We will explore textual data using the amazing spaCy library and build a text classification model.

In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the training CSV of the Covid-19 tweets dataset into `df`
# and display its (rows, columns) shape.
df = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv')
df.shape 

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Turn the string sentiment classes into integer codes.
from sklearn import preprocessing

# Fit first, then transform (equivalent to fit_transform); the fitted encoder
# stays available as `label_encoder` so its classes_ mapping can be inspected.
# NOTE: LabelEncoder numbers the classes in sorted order of their names, so the
# resulting codes are identifiers rather than an ordinal sentiment scale.
label_encoder = preprocessing.LabelEncoder().fit(df['Sentiment'])
df['Sentiment'] = label_encoder.transform(df['Sentiment'])

df['Sentiment'].unique()

In [None]:
# Preview the data again, now with the encoded Sentiment column.
df.head()

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
# Bar chart of how many tweets carry each encoded sentiment value.
sentiment_counts = df['Sentiment'].value_counts()
ax = sentiment_counts.plot.bar()
fig = ax.get_figure()

Here, since we are only concerned with the Sentiment and the Tweet, we will extract just those two columns into our dataframe and also drop null values (if any!).

In [None]:
# Keep only the tweet text and its label, and drop rows where either is missing.
# NOTE: the row index is not reset, so label-based lookups further down
# (e.g. df.OriginalTweet[32]) still refer to the original row labels.
df = df.loc[:, ['OriginalTweet', 'Sentiment']].dropna()
df.head()

In [None]:
# Shape after keeping two columns and dropping nulls.
df.shape

#### Tokenization:
* The first step in any NLP pipeline is tokenizing text, i.e. breaking down paragraphs into sentences and then sentences into words, punctuation and so on.

* we will load english language model to tokenize our english text.

* Every language is different and has different rules. spaCy offers 8 different language models.

In [None]:
# Load spaCy's small English model; it is used below to tokenize and parse text.
spacy_tok = spacy.load('en_core_web_sm')
# Take the tweet whose row label is 32 as the running example.
sample_review = df.loc[32, 'OriginalTweet']
sample_review

In [None]:
# Run the sample tweet through the spaCy pipeline; the result is reused by
# the token table, entity, sentence and dependency cells further down.
parsed_review = spacy_tok(sample_review)
parsed_review

There is not much difference between parsed review and original one. But we will see ahead what has actually happened. We can see how parsing has been done visually through explacy

In [None]:
# Download explacy.py into the working directory so the next cell can import it.
# NOTE(review): this fetches from the repository's master branch, so the code
# that gets imported is not pinned to a specific revision.
!wget https://raw.githubusercontent.com/tylerneylon/explacy/master/explacy.py

In [None]:
# explacy.py is the helper file downloaded by the wget cell above.
import explacy
# Print parse information for a short demonstration sentence.
explacy.print_parse_info(spacy_tok, 'Covid-19 has various symtoms') #Text for demonstration

In [None]:
# Print parse information for a second demonstration sentence.
explacy.print_parse_info(spacy_tok, 'India has help various countries with Covid-19 Resources') #Text for demonstration

In [None]:
# Print parse information for a real tweet from the dataset (row label 2).
explacy.print_parse_info(spacy_tok,df.OriginalTweet[2])

Part-of-speech tagging
After tokenization we can parse and tag variety of parts of speech to paragraph text. SpaCy uses statistical models in background to predict which tag will go for each word(s) based on the context.

Lemmatization
It is the process of extracting the uninflected/base form of a word, known as its lemma. For example:

Adjectives: best, better → good Adverbs: worse, worst → badly Nouns: ducks, children → duck, child Verbs: standing,stood → stand

In [None]:
# Tabulate the per-token linguistic attributes of the parsed sample tweet.
# Rows are collected as dicts and the DataFrame is built once, instead of
# growing an empty frame cell-by-cell with .loc.
token_columns = ['text', 'lemma', 'pos', 'tag', 'dep', 'shape',
                 'is_alpha', 'is_stop', 'is_punctuation']
token_rows = [
    {
        'text': token.text,
        # Stored as a plain string; a stray trailing comma previously turned
        # this value into a 1-tuple.
        'lemma': token.lemma_,
        'pos': token.pos_,
        'tag': token.tag_,
        'dep': token.dep_,
        'shape': token.shape_,
        'is_alpha': token.is_alpha,
        'is_stop': token.is_stop,
        'is_punctuation': token.is_punct,
    }
    for token in parsed_review
]
# Passing `columns` keeps the column order fixed and yields a well-formed
# (empty) table even when the parsed text has no tokens.
tokenized_text = pd.DataFrame(token_rows, columns=token_columns)

tokenized_text[:20]

Named Entity Recognition (NER)
Named entity Recognition automatically identifies named entities in a text and classifies them into predefined categories. Entities can be names of people, organizations, locations, times, quantities, monetary values, percentages, and more.


Spacy figures out below entities automatically:

In [None]:
# Display a reference image loaded from a remote URL.
# NOTE(review): presumably an illustration of spaCy's entity types, going by
# the surrounding text - confirm the link still resolves.
from IPython.display import Image
Image("https://d33wubrfki0l68.cloudfront.net/00d54115351b0e18776433853e794b76b59ee97c/eab3d/static/d0575562cdedb47340c00662c5c1b859/80132/example.png")

In [None]:
# Highlight the named entities spaCy found in the sample tweet.
displacy.render(parsed_review, jupyter=True, style='ent')

In [None]:
spacy.explain('GPE') # to explain the 'GPE' named-entity label

#### Dependency parsing:
Syntactic parsing, or dependency parsing, is the process of identifying sentences and assigning a syntactic structure to them, e.g. how a subject combines with an object to make a sentence. spaCy provides a parse tree which can be used to generate this structure.

#### Sentence Boundary Detection:
Figuring out where a sentence starts and ends is a very important part of NLP.

In [None]:
# Materialise the sentence spans spaCy detected in the sample tweet.
sentence_spans = [sentence for sentence in parsed_review.sents]
sentence_spans

In [None]:
# Draw the dependency parse of the sample tweet; 'distance' is passed as a rendering option.
displacy.render(parsed_review, style='dep', jupyter=True,options={'distance': 140})

In [None]:
# Same dependency visualisation, rendered with compact mode and custom
# background, colour and font options.
options = dict(
    compact=True,
    bg='white',
    distance=140,
    color='blue',
    font='Trebuchet MS',
)
displacy.render(parsed_review, style='dep', jupyter=True, options=options)

In [None]:
# Look up the human-readable meaning of a few tags and dependency labels.
tuple(spacy.explain(label) for label in ("ADJ", "det", "ADP", "prep"))

In [None]:
# Tabulate the noun chunks of the parsed sample tweet together with the root
# token of each chunk. Rows are collected first and the DataFrame is built
# once, instead of growing an empty frame cell-by-cell with .loc.
noun_chunk_columns = ['text', 'root', 'root.text', 'root.dep_', 'root.head.text']
noun_chunk_rows = [
    {
        'text': chunk.text,
        # Stray trailing commas previously wrapped the next two values in
        # 1-tuples; they are now stored as the token / string themselves.
        'root': chunk.root,
        'root.text': chunk.root.text,
        'root.dep_': chunk.root.dep_,
        'root.head.text': chunk.root.head.text,
    }
    for chunk in parsed_review.noun_chunks
]
# Passing `columns` keeps the column order fixed and yields a well-formed
# (empty) table when the text contains no noun chunks.
noun_chunks_df = pd.DataFrame(noun_chunk_rows, columns=noun_chunk_columns)

noun_chunks_df[:20]

In [None]:
!pip install scattertext
import scattertext as st
nlp = spacy.load('en',disable_pipes=["tagger","ner"])

In [None]:
# Print the current (rows, columns) shape of the working dataframe.
print(df.shape)

In [None]:
# Preview the two-column dataframe before the parsed documents are added.
df.head()

In [None]:
# Load the English model without the tagger and NER components. `disable` is
# the keyword spacy.load() recognises; `disable_pipes=` was silently ignored.
# NOTE(review): with the tagger disabled, tokens carry no POS tags - confirm
# the scattertext features built below do not rely on them.
nlp = spacy.load('en', disable=["tagger", "ner"])
# nlp.pipe processes the tweets as a stream in batches, which is more
# efficient than calling nlp() on each tweet separately; results come back in
# input order, so they line up with the dataframe rows.
df['parsed'] = list(nlp.pipe(df.OriginalTweet))
# Build a scattertext corpus from the parsed documents, grouped by sentiment code.
corpus = st.CorpusFromParsedDocuments(df,
                             category_col='Sentiment',
                             parsed_col='parsed').build()

In [None]:
# Preview the dataframe, which now also holds the 'parsed' column.
df.head()

#### Sense2vec
The idea is get something better than word2vec model.

It assigns part-of-speech tags such as verb, noun and adjective to words, which are in turn used to make sense of the context.

Please book [VERB] my ticket.
Read the book [NOUN].

In [None]:
# Install a pinned pre-release of sense2vec.
# NOTE(review): no later cell in this notebook imports sense2vec - confirm the install is still needed.
!pip install sense2vec==1.0.0a0

#### SpaCy Text Categorizer
We will train a multi-label convolutional neural network text classifier on our Covid-19 tweets, using spaCy's new TextCategorizer component.

SpaCy provides classification model with multiple, non-mutually exclusive labels. You can change the model architecture rather easily, but by default, the TextCategorizer class uses a convolutional neural network to assign position-sensitive vectors to each word in the document. The TextCategorizer uses its own CNN model, to avoid sharing weights with the other pipeline components

In [None]:
#Prepare data:
#Let's prepare the data as SpaCy would like it. It accepts list of tuples of text and labels.

In [None]:
# Pair every tweet with its encoded sentiment, keep the pairs as a dataframe
# column, and expose them as a plain list for the training helpers.
df['tuples'] = list(zip(df['OriginalTweet'], df['Sentiment']))
train = df['tuples'].tolist()
train[:5]

In [None]:
#functions from spacy documentation
def load_data(limit=0, split=0.8, data=None, positive_labels=None):
    """Shuffle the labelled tweets and split them into training and evaluation parts.

    Args:
        limit: Number of (shuffled) examples to keep, taken from the end of
            the shuffled list; 0 keeps everything.
        split: Fraction of the kept examples that goes into the training part.
        data: Iterable of ``(text, label_code)`` pairs. Defaults to the
            notebook-level ``train`` list.
        positive_labels: Collection of label codes that count as positive.
            Defaults to every code whose class name in
            ``label_encoder.classes_`` contains "positive".

    Returns:
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where each
        ``cats`` entry is a dict ``{'POSITIVE': bool}``.

    Raises:
        ValueError: if ``positive_labels`` is omitted and no class name of the
            fitted encoder contains "positive".
    """
    if data is None:
        data = train
    if positive_labels is None:
        # LabelEncoder numbers classes in sorted order of their names, so the
        # codes are not an ordinal scale. The old `bool(y)` test treated every
        # non-zero code as positive, which mislabels any non-positive class
        # that does not happen to receive code 0. Derive the positive codes
        # from the class names instead.
        positive_labels = [code for code, name in enumerate(label_encoder.classes_)
                           if 'positive' in str(name).lower()]
        if not positive_labels:
            raise ValueError(
                "No class name in label_encoder.classes_ contains 'positive'; "
                "pass positive_labels explicitly.")
    positive_labels = set(positive_labels)

    # Shuffle a copy so the caller's list keeps its original order.
    train_data = list(data)
    np.random.shuffle(train_data)
    if limit:
        train_data = train_data[-limit:]
    if not train_data:
        # zip(*[]) cannot be unpacked into two names; return empty parts instead.
        return ((), []), ((), [])

    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': y in positive_labels} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F-score of the text categorizer.

    Each text is tokenized, piped through ``textcat``, and every predicted
    label that also appears in the matching gold dict is compared against it
    using a 0.5 threshold on both the score and the gold value.

    Args:
        tokenizer: Callable turning a text into a document.
        textcat: Object whose ``pipe(docs)`` yields documents with a ``cats`` mapping.
        texts: Sequence of raw texts.
        cats: Sequence of gold ``{label: value}`` dicts, aligned with ``texts``.

    Returns:
        Dict with keys ``'textcat_p'``, ``'textcat_r'`` and ``'textcat_f'``.
    """
    # Every counter starts at a tiny epsilon so the ratios never divide by zero.
    tally = {'tp': 1e-8, 'fp': 1e-8, 'fn': 1e-8, 'tn': 1e-8}
    tokenized = (tokenizer(text) for text in texts)

    for position, doc in enumerate(textcat.pipe(tokenized)):
        truth = cats[position]
        for label, score in doc.cats.items():
            # Labels without a gold annotation are ignored.
            if label in truth:
                expected = truth[label]
                if score >= 0.5 and expected >= 0.5:
                    tally['tp'] += 1.
                elif score >= 0.5 and expected < 0.5:
                    tally['fp'] += 1.
                elif score < 0.5 and expected < 0.5:
                    tally['tn'] += 1
                elif score < 0.5 and expected >= 0.5:
                    tally['fn'] += 1

    precision = tally['tp'] / (tally['tp'] + tally['fp'])
    recall = tally['tp'] / (tally['tp'] + tally['fn'])
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

# Number of tweets used for training + evaluation.
# Raise it if you have more computational power available.
n_texts = 30_000

# Number of training iterations (passes over the training data).
n_iter = 10

In [None]:
# Reload the small English model; this rebinds `nlp`, replacing the pipeline
# object created for the scattertext cells.
nlp = spacy.load('en_core_web_sm')  # create english Language class

In [None]:
# Make sure the pipeline has a text categorizer: create and append one if it
# is missing (nlp.create_pipe works for built-ins registered with spaCy),
# otherwise fetch the existing component so a label can be added to it.
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe('textcat')

# The classifier predicts a single label.
textcat.add_label('POSITIVE')

# Load, shuffle and split the dataset.
print("Loading Covid Tweets data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
# Report the number of examples actually loaded: n_texts is only a cap, so
# printing it directly would be wrong whenever fewer examples are available.
n_loaded = len(train_texts) + len(dev_texts)
print("Using {} examples ({} training, {} evaluation)"
      .format(n_loaded, len(train_texts), len(dev_texts)))
# nlp.update expects each annotation wrapped as {'cats': {...}}.
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

In [None]:
# Train only the text categorizer: every other pipeline component is disabled
# for the duration of the `with` block.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        losses = {}
        # Reshuffle every epoch so the batches are not presented in the same
        # order on each pass over the data.
        np.random.shuffle(train_data)
        # Batch up the examples using spaCy's minibatch; the batch size
        # compounds from 4 towards 32.
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        # Evaluate with the optimizer's averaged parameters swapped in.
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

In [None]:
# test the trained model
test_text1 = "Life is worth living than just existing."
test_text2="A strip about Covid that is funny, smart and has good art."
doc = nlp(test_text1)
test_text1, doc.cats

In [None]:
# Score the second hand-written sentence and show it next to its category scores.
doc2 = nlp(test_text2)
test_text2, doc2.cats

* As you can see, the predicted scores range from 0 to 1 (i.e. 0-100%); the text above gets a positivity score of about 0.99 (99%), which suggests that the tweet is quite positive!

### Thank you! Hope this Notebook was helpful, Kindly Up-Vote if it helped you in any ways!