In [None]:
try:
    import pycaret
except:
    !pip install pycaret-nightly

try:
    import missingno
except:
    !pip install missingno
    
try:
    import interpret
except:
    !pip install interpret

<hr style="border: solid 3px blue;">

# Introduction

![](https://miro.medium.com/max/1400/1*fTPhu7PqgIbnngbWG5zFWA.gif)

Picture Credit: https://miro.medium.com

**What is Natural language processing?**
> Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
> 
> Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation.

Ref: https://en.wikipedia.org/wiki/Natural_language_processing

In [None]:
import numpy as np 
import pandas as pd 
from fastai.text.all import *

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams

import warnings
warnings.filterwarnings(action='ignore')

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Global plotting theme: seaborn "ticks" style with the larger "talk" context,
# then matplotlib's dark background style applied on top.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_path = "/kaggle/input/nlp-getting-started/train.csv"
test_path = "/kaggle/input/nlp-getting-started/test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

---------------------------
# EDA

In [None]:
# Show the first rows with a dark table theme (white text and borders on black).
dark_cell_css = {
    'background-color': 'black',
    'color': 'white',
    'border-color': 'white',
}
train_df.head().style.set_properties(**dark_cell_css)

In [None]:
# Summarize the training frame: columns, dtypes and non-null counts.
train_df.info()

---------------------------------------
## Checking Missing Values

In [None]:
import missingno as msno
# Draw missingno's matrix plot for the training frame to show where values are missing.
msno.matrix(df=train_df)

In [None]:
# Count missing values per column.
train_df.isnull().sum()

In [None]:
# Drop the columns that are not used for modelling.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps this
# cell idempotent: re-running it no longer raises KeyError for columns already removed.
unused_cols = ['keyword', 'location', 'id']
train_df = train_df.drop(columns=unused_cols, errors='ignore')
test_df = test_df.drop(columns=unused_cols, errors='ignore')

------------------------------------
## Checking Target Imbalance

In [None]:
# Pie chart of the class balance of the `target` column.
colors = ['gold', 'mediumturquoise']
labels = ['Non-Disaster', 'Disaster']
# `value_counts()` orders by frequency, not by class value, so the labels above
# would be swapped whenever class 1 is the majority. Pin the order to [0, 1] so
# each share is always paired with its own label.
values = train_df['target'].value_counts(normalize=True).reindex([0, 1], fill_value=0)

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='#000000', width=2)))
fig.update_layout(
    title_text="Target Balance",
    title_font_color="white",
    legend_title_font_color="yellow",
    paper_bgcolor="black",
    plot_bgcolor='black',
    font_color="white",
)
fig.show()

## Seeing Disaster Tweets

In [None]:
# First ten rows labelled target == 1.
is_disaster = train_df['target'].eq(1)
train_df.loc[is_disaster].head(10)

## Seeing Non-disaster Tweets

In [None]:
# First ten rows labelled target == 0.
is_non_disaster = train_df['target'].eq(0)
train_df.loc[is_non_disaster].head(10)

<span style="color:Blue"> Observation:    
    
Non-disaster tweets seem shorter.

--------------------------
# Visualizing using Topic Modeling

![](https://miro.medium.com/max/1400/1*cDwKSHmfp5awjqjobV707g.png)

Picture Credit: https://miro.medium.com

**What is Topic Modeling?**
> In machine learning and natural language processing, a topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear approximately equally in both. A document typically concerns multiple topics in different proportions; thus, in a document that is 10% about cats and 90% about dogs, there would probably be about 9 times more dog words than cat words. The "topics" produced by topic modeling techniques are clusters of similar words. A topic model captures this intuition in a mathematical framework, which allows examining a set of documents and discovering, based on the statistics of the words in each, what the topics might be and what each document's balance of topics is.

Ref: https://en.wikipedia.org/wiki/Topic_model

In [None]:
# Download the small English spaCy pipeline that is loaded by name (`en_core_web_sm`) further down.
!python -m spacy download en_core_web_sm
# Download TextBlob's corpora.
# NOTE(review): presumably required by pycaret.nlp preprocessing — confirm.
!python -m textblob.download_corpora

In [None]:
import spacy
# NOTE(review): this is a second star import (after `fastai.text.all`); names
# exported by both packages will shadow each other — verify nothing needed collides.
from pycaret.nlp import *
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced again in the visible cells.
nlp = spacy.load('en_core_web_sm')

In [None]:
%time 
disaster_nlp = setup(data = train_df, 
                     target = 'text',
                     html = False,
                     custom_stopwords = ['CO','.co','co','https','http'],
                     session_id = 123)

<span style="color:Blue"> Observation:
    
* A vocabulary of 9233 words was created from a total of 7613 documents.

----------------------
# LDA (Latent Dirichlet Allocation)

![](https://ars.els-cdn.com/content/image/1-s2.0-S0164121218302103-gr6.jpg)

Picture Credit: https://ars.els-cdn.com

> In natural language processing, the latent Dirichlet allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar. For example, if observations are words collected into documents, it posits that each document is a mixture of a small number of topics and that each word's presence is attributable to one of the document's topics. LDA is an example of a topic model.

Ref: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

In [None]:
# Create the topic model using PyCaret's 'lda' model id, with multi_core enabled.
lda = create_model('lda',multi_core=True)

In [None]:
# Apply the LDA model via assign_model and preview the first rows of the result.
# NOTE(review): presumably the input data with topic assignments added — confirm in the PyCaret docs.
lda_df = assign_model(lda, verbose=True)
lda_df.head()

In [None]:
# PyCaret 'topic_distribution' plot for the LDA model.
plot_model(lda, plot = 'topic_distribution')

In [None]:
# PyCaret 'topic_model' plot for the LDA model.
plot_model(lda, plot = 'topic_model')

----------------------------------
## Plotting Wordcloud

In [None]:
# PyCaret 'wordcloud' plot for the LDA model.
plot_model(lda, plot = 'wordcloud')

--------------------------------------
## Checking N-grams

> In the fields of computational linguistics and probability, an n-gram (sometimes also called Q-gram) is a contiguous sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words or base pairs according to the application. The n-grams typically are collected from a text or speech corpus. When the items are words, n-grams may also be called shingles.

Ref: https://en.wikipedia.org/wiki/N-gram

In [None]:
# PyCaret 'bigram' plot (two-word sequences) for the LDA model.
plot_model(lda, plot = 'bigram')

In [None]:
# PyCaret 'trigram' plot (three-word sequences) for the LDA model.
plot_model(lda, plot = 'trigram')

------------------------------------------------------
## Checking Frequency

In [None]:
# PyCaret 'frequency' plot for the LDA model.
plot_model(lda, plot = 'frequency')

-----------------------------------
## Checking Part of Speech Frequency (POS)

![](https://cdn.analyticsvidhya.com/wp-content/uploads/2021/03/tree.png)

Picture Credit: https://cdn.analyticsvidhya.com

> In traditional grammar, a part of speech or part-of-speech (abbreviated as POS or PoS) is a category of words (or, more generally, of lexical items) that have similar grammatical properties. Words that are assigned to the same part of speech generally display similar syntactic behavior (they play similar roles within the grammatical structure of sentences), sometimes similar morphology in that they undergo inflection for similar properties and even similar semantic behavior.
> 
Commonly listed English parts of speech are noun, verb, adjective, adverb, pronoun, preposition, conjunction, interjection, numeral, article, or determiner. 

Ref: https://en.wikipedia.org/wiki/Part_of_speech

In [None]:
# PyCaret 'pos' (part-of-speech) plot for the LDA model.
plot_model(lda, plot = 'pos')

---------------------------
## Visualizing after Dimensionality Reduction

Let's show the 7613 documents by dimensionality reduction in 2D and 3D.

In [None]:
# PyCaret 'umap' dimensionality-reduction plot for the LDA model.
plot_model(lda, plot = 'umap')

In [None]:
# PyCaret 'tsne' dimensionality-reduction plot for the LDA model.
plot_model(lda, plot = 'tsne')

<hr style="border: solid 3px blue;">

# Preprocessing, Modeling and Training

In [None]:
# Re-apply the same seaborn/matplotlib theme that was set after the imports.
# NOTE(review): presumably needed because the plotting calls in between may have
# changed the global style — confirm; otherwise this cell is redundant.
sns.set(style="ticks", context="talk")
plt.style.use("dark_background")

------------------------------------------
# Making Pipeline and Dataloaders

The following is the process of tokenizing text and batching it through dataloader.

In [None]:
# Pipeline: tokenize/numericalize the `text` column (sequence length 36) as the
# input and use the `target` column as a categorical label.
# The splitter is made explicit and seeded: without it the validation rows are
# drawn at random on every run, so the monitored accuracy, early stopping and the
# confusion matrix are not reproducible. valid_pct=0.2 keeps fastai's default ratio.
tweet_datablock = DataBlock(
    blocks=(TextBlock.from_df('text', seq_len=36), CategoryBlock),
    get_x=ColReader('text'), get_y=ColReader('target'),
    splitter=RandomSplitter(valid_pct=0.2, seed=42))

# Build the train/validation dataloaders with a batch size of 32.
dls = tweet_datablock.dataloaders(train_df, bs=32)

## Showing Batch

In [None]:
# Preview up to 5 samples from a batch to sanity-check the processed text and labels.
dls.show_batch(max_n=5)

A few special tokens are described below.
* xxbos: marks the beginning of the text.
* xxmaj: Indicates that the next word starts with a capital letter.
* xxunk: Indicates that the current word is not in the list.

## Showing Basic processing Rules

In [None]:
# Display the text pre-processing rules stored in fastai's `defaults`.
defaults.text_proc_rules

---------------------------------
# Modeling

In [None]:
# Callbacks: stop when accuracy has not improved by at least 0.001 for 3 epochs,
# and record activation statistics (with histograms) for later inspection.
training_callbacks = [
    EarlyStoppingCallback(monitor='accuracy', min_delta=0.001, patience=3),
    ActivationStats(with_hist=True),
]

# AWD-LSTM text classifier on the tweet dataloaders, tracking accuracy.
learn = text_classifier_learner(
    dls,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=accuracy,
    cbs=training_callbacks,
)

learn.model

-------------------------------
# Training

In [None]:
# Run the learning-rate finder; its `valley` suggestion is displayed here and
# passed to `fit_one_cycle` in the training cell.
sr = learn.lr_find()
sr.valley

In [None]:
# Train with the one-cycle policy for up to 100 epochs at the suggested learning rate.
learn.fit_one_cycle(n_epoch=100, lr_max=sr.valley)

In [None]:
# Plot the losses recorded by the learner's recorder during training.
learn.recorder.plot_loss()

------------------------------------------------
# Interpreting

We can confirm that the training was successful with the activation distribution.

In [None]:
def plot_layer_stats(self, idx):
    """Plot the three activation statistics of one hooked layer side by side.

    Args:
        self: object exposing ``layer_stats(idx)``; the notebook passes
            ``learn.activation_stats``.
        idx: index of the layer to plot (negative values count from the end).

    The three series returned by ``self.layer_stats(idx)`` are drawn in order
    under the titles 'mean', 'std' and '% near zero'.
    """
    # Name the figure `fig` rather than `plt`, so pyplot is not shadowed.
    fig, axs = subplots(1, 3, figsize=(15,3))
    fig.subplots_adjust(wspace=0.5)
    for o,ax,title in zip(self.layer_stats(idx),axs,('mean','std','% near zero')):
        ax.plot(o)
        # Use the `idx` argument: the title previously read the global loop
        # variable `layer`, which fails or mislabels outside that exact loop.
        ax.set_title(f"{idx}th layer {title}")

In [None]:
# Plot statistics for the last three hooked layers (indices -1, -2, -3).
for layer in range(1,4):
    plot_layer_stats(learn.activation_stats,-1*layer)

In [None]:
def color_dim(self, idx):
    """Render the activation histogram of layer ``idx`` as an image.

    ``self`` must expose ``hist(idx)``; the notebook passes
    ``learn.activation_stats``. The figure is drawn in a temporary rc context
    (10x40 figure size, 600 dpi) with the axes hidden.
    """
    rc_overrides = {"figure.figsize": (10,40), "figure.dpi": 600}
    with plt.rc_context(rc_overrides):
        hist_img = self.hist(idx)
        ax = plt.figure().add_subplot(111)
        ax.imshow(hist_img, origin='lower')
        ax.set_title(f"{idx}th activation histogram")
        ax.axis('off')

In [None]:
# Use the explicitly imported `rcParams` (`from matplotlib import rcParams`):
# the bare name `matplotlib` is never imported by this notebook and only resolves
# if one of the star imports happens to export it.
rcParams['image.cmap'] = 'rainbow_r'
# Activation histograms for the last three hooked layers (indices -1, -2, -3).
for layer in range(1,4):
    color_dim(learn.activation_stats,-1*layer)

In [None]:
# Build an interpretation object from the trained learner and plot its confusion matrix.
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(3,3),dpi=200)

----------------------------------------
# Predicting

In [None]:
# Build a test dataloader that reuses the training pre-processing pipeline.
test_dl = learn.dls.test_dl(test_df)
# The keyword is `max_n` (as in the `dls.show_batch` call earlier); the previous
# `n_max=2` was a misspelling and did not limit the preview to two samples.
test_dl.show_batch(max_n=2)

In [None]:
# Predict on the test dataloader and take, for each row, the index of the
# highest-scoring class as a plain Python list.
preds = learn.get_preds(dl=test_dl)
class_scores = preds[0]
results = class_scores.argmax(axis=1).tolist()

In [None]:
# Load the submission template. Use the same absolute `/kaggle/input` root as the
# train/test files, so the read does not depend on the current working directory.
submission_data = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [None]:
# Overwrite the template's `target` column with the predicted class indices.
# NOTE(review): assumes the template rows are in the same order as test.csv,
# since the `id` column was dropped from test_df — confirm.
submission_data['target'] = results

In [None]:
# Write the submission file without the DataFrame index column.
submission_data.to_csv('submission.csv', index = False)

<hr style="border: solid 3px blue;">