In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## What's in this Kernel?
- Data Exploration
- Data (Text) Cleaning
- BERT Model using ktrain

## install "Ktrain" using pip

In [None]:
!pip install ktrain

## What is K-Train

ktrain is a lightweight wrapper for the deep learning library TensorFlow Keras (and other libraries) to help build, train, and deploy neural networks and other machine learning models. Inspired by ML framework extensions like fastai and ludwig, ktrain is designed to make deep learning and AI more accessible and easier to apply for both newcomers and experienced practitioners.

Learn more about it:
- https://github.com/amaiya/ktrain



### Importing required libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
plt.style.use('ggplot')
stop=set(stopwords.words('english'))
import re
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
import ktrain
from ktrain import text


import matplotlib as mpl
from cycler import cycler
plt.style.use('ggplot')

### load the dataset check head

In [None]:
# Read the train and test CSV files of the "nlp-getting-started" dataset
# from the notebook's input directory.
train = pd.read_csv("../input/nlp-getting-started/train.csv")
test = pd.read_csv("../input/nlp-getting-started/test.csv")

# Preview the first three training rows.
train.head(3)

## check the shape of the data

In [None]:
# (rows, columns) of the train frame and of the test frame
train.shape, test.shape

## Data Exploration (EDA)

In [None]:
# check for class distribution
# The column is passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a bare Series is no longer interpreted as the x variable.
sns.countplot(x='target', data=train);

# class(0) :- No Disaster
# class(1) :- Disaster

## No of character in tweet

In [None]:
# Distribution of tweet length in characters, one panel per class.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this
# installation provides.
dark_style = 'seaborn-v0_8-dark' if 'seaborn-v0_8-dark' in plt.style.available else 'seaborn-dark'
plt.style.use(dark_style)
fig, ax = plt.subplots(1,2,figsize=(10,5))

# for target 0 
ax[0].hist(train[train['target']==0]['text'].str.len(),color='b',bins=20)
ax[0].set_title("Not Disaster Tweets len");

# for target 1
ax[1].hist(train[train['target']==1]['text'].str.len(),color='c',bins=20)
ax[1].set_title("Disaster Tweets len");


## No of words in tweet

In [None]:
# Distribution of tweet length in whitespace-separated words, one panel per class.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# (class label, bar colour, panel title) for each of the two panels
panels = [
    (0, 'b', "Not Disaster Tweets len"),
    (1, 'c', "Disaster Tweets len"),
]
for axis, (label, colour, title) in zip(ax, panels):
    words_per_tweet = train.loc[train['target'] == label, 'text'].str.split().map(len)
    axis.hist(words_per_tweet, color=colour, bins=20)
    axis.set_title(title)

plt.show()


## Stopwords counts

In [None]:
def create_corpus(target):
    """Return all whitespace-separated tokens of the tweets in one class.

    Reads the notebook-level ``train`` frame, keeps the rows whose
    ``target`` column equals ``target``, splits each ``text`` value on
    whitespace and returns the tokens as one flat list, in row order.
    """
    tweets = train.loc[train['target'] == target, 'text']
    return [token for tokens in tweets.str.split() for token in tokens]

# Ten most frequent stop words in class 0 tweets
corpus = create_corpus(0)

# Count only tokens that are stop words. most_common(10) orders by count,
# with ties kept in first-seen order, matching a stable descending sort.
top = Counter(word for word in corpus if word in stop).most_common(10)

words, counts = zip(*top)
b = pd.DataFrame({
    'value': words,
    'count': counts,
})

sns.barplot(x='value', y='count', data=b);


In [None]:
# Ten most frequent stop words in class 1 tweets
corpus = create_corpus(1)

# Count only tokens that are stop words. most_common(10) orders by count,
# with ties kept in first-seen order, matching a stable descending sort.
top = Counter(word for word in corpus if word in stop).most_common(10)

words, counts = zip(*top)
c = pd.DataFrame({
    'value': words,
    'count': counts,
})

sns.barplot(x='value', y='count', data=c)

## Check for null values

In [None]:
# Heatmap of the boolean missing-value mask of the training frame
sns.heatmap(train.isna());

# the location column has a large number of null values

In [None]:
# Heatmap of the boolean missing-value mask of the test frame
sns.heatmap(test.isna());

# the location column has a large number of null values

## Data(Text) Cleaning

In [None]:
import nltk 
from nltk.corpus import stopwords
# Keep the stop words in a set: clean() tests every word for membership, and
# a set lookup is constant-time where a list lookup scans the whole list.
stop_words = set(stopwords.words('english'))

## Most Important Step (Text cleaning)
- Remove URLs
- Remove Mentions
- Remove Hashtags
- Remove Digits
- Remove HTML tags
- Remove Punctuation
- Remove Stop Words

In [None]:
def clean(text, stopwords_to_drop=None):
    """Strip tweet noise and stop words from ``text``.

    Steps, in order: URLs, @mentions, #hashtags (the whole tag), digit runs
    and HTML tags are each replaced by a space; ASCII punctuation is deleted;
    the result is split on whitespace, stop words are dropped (the comparison
    is case-sensitive) and the remaining words are joined with single spaces.

    Args:
        text: Raw tweet text as a ``str``.
        stopwords_to_drop: Optional collection of words to remove. When
            omitted, the notebook-level ``stop_words`` is used.

    Returns:
        The cleaned text as one space-separated string (may be empty).
    """
    if stopwords_to_drop is None:
        stopwords_to_drop = stop_words
    # Membership is tested once per word, so make sure it is a hashed lookup.
    if not isinstance(stopwords_to_drop, (set, frozenset)):
        stopwords_to_drop = frozenset(stopwords_to_drop)

    # remove urls
    text = re.sub(r'https?://\S+|www\.\S+', " ", text)

    # remove mentions
    text = re.sub(r'@\w+', ' ', text)

    # remove hashtags
    text = re.sub(r'#\w+', ' ', text)

    # remove digits
    text = re.sub(r'\d+', ' ', text)

    # remove html tags -- the raw-string prefix belongs outside the quotes;
    # the previous pattern 'r<.*?>' only matched tags preceded by a literal "r"
    text = re.sub(r'<.*?>', ' ', text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # remove stop words
    return " ".join(word for word in text.split() if word not in stopwords_to_drop)

## Removing Emojis

In [None]:
# Compiled once instead of on every call.
# The previous character class contained the single range U+24C2..U+1F251,
# which spans nearly every script above U+24C2 (CJK, Hangul, ...) and so
# deleted ordinary text. The symbol blocks are now listed explicitly.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U000024C2-\U00002BFF"  # enclosed alphanumerics .. misc symbols & arrows (includes dingbats)
    u"\U00003030\U0000303D\U00003297\U00003299"  # individual CJK-block symbols
    u"\U0000FE0F"             # variation selector-16
    u"\U0001F000-\U0001F251"  # mahjong/domino/card tiles, enclosed supplements
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Every run of characters from the code-point ranges in ``_EMOJI_PATTERN``
    is deleted; all other characters, including non-Latin letters, are kept.

    Args:
        text: Input string.

    Returns:
        The string with the matched characters deleted.
    """
    return _EMOJI_PATTERN.sub('', text)

### Apply text cleaning on datasets

In [None]:
# Run both cleaning passes (clean, then remove_emoji) over the tweet text of
# each split, overwriting the 'text' column.
for frame in (train, test):
    frame['text'] = frame['text'].apply(clean).apply(remove_emoji)

In [None]:
train.head()

### Looking at the dataset, we only need 2 columns (text and target)
- So we take only these columns

and make a copy

In [None]:
# Select the needed columns by name rather than by position, so the result
# does not depend on the column order of the CSV files. The test frame is
# unlabeled, so only its text is kept.
df_train = train[['text', 'target']].copy()
df_test = test[['text']].copy()

In [None]:
df_train.head()

In [None]:
df_test.head()

### What is bert
- BERT (Bidirectional Encoder Representations from Transformers) is a deep learning model developed by Google. It represented one of the major machine learning breakthroughs of the year, as it achieved state-of-the-art results across 11 different Natural Language Processing (NLP) tasks. Moreover, Google open-sourced the code and made pretrained models available for download similar to computer vision models pretrained on ImageNet. For these reasons, there continues to be a great deal of interest in BERT (even as other models slightly overtake it).

- We will first use the "texts_from_df" function to load the data from the data frames
- then pass train data set along with the text and target columns
- BERT can handle sequences of at most 512 tokens, but we use only 400 to reduce memory use and improve speed.
- We need to process text a specific way for use with BERT. for that use preprocess_mode = 'bert'


In [None]:
# df_test holds only the tweet text (the test split is unlabeled), so it
# cannot be used as validation data: validation needs the 'target' labels.
# Hold out 10% of the labelled training tweets instead. X_test / y_test
# therefore contain this validation split; the unlabeled test tweets are
# only used later for prediction.
(X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
    train_df=df_train,
    text_column='text',
    label_columns='target',
    val_pct=0.1,        # fraction of df_train held out for validation
    random_state=42,    # make the train/validation split reproducible
    maxlen=400,
    preprocess_mode='bert',
)

- for text classification we are using BERT, so we need to pass name = 'bert'
- passing our train_data 
- then pass preprocess

In [None]:
# Build a ktrain text classifier named 'bert' from the preprocessed training
# data and the preprocessor returned by texts_from_df.
model = text.text_classifier(name='bert',
                            train_data = (X_train,y_train),
                            preproc=preprocess)

- create a ktrain learner from the model, train_data, val_data and batch_size

In [None]:
# Wrap the model together with its training and validation data in a ktrain
# learner, using a batch size of 6. No learning rate is computed here; that
# happens in the lr_find cell.
learner = ktrain.get_learner(model=model,
                            train_data = (X_train,y_train),
                             val_data=(X_test,y_test),
                             batch_size = 6
                            )

- Finding the best learning rate for the model

#### Note: this takes a long time even on a GPU (in my case it took more than 2 hours)

In [None]:
# Run the learner's learning-rate search for at most 2 epochs, then plot the
# result to help pick a learning rate.
learner.lr_find(max_epochs=2)
learner.lr_plot()

In [None]:
# Train for 2 epochs with fit_onecycle at lr=2e-5.
# NOTE(review): presumably chosen from the learning-rate plot -- confirm.
learner.fit_onecycle(lr=2e-5,epochs=2)

In [None]:
learner

- Now time for prediction

In [None]:
# Pair the trained model with the preprocessor so predictions can be made
# directly from text.
predictor = ktrain.get_predictor(learner.model, preprocess)

In [None]:
# Integer (0/1) predictions for the test tweets; filled in by the prediction cell.
y_list = []

- `predict` returns a class name for each tweet (either the target class or not),
- so we map these class names to 1 and 0

In [None]:
# Predict every test tweet in one batched call instead of one call per row.
# The tweets are bound to `test_tweets`, not `text`: `text` is the imported
# ktrain.text module, and rebinding it breaks any re-run of the cells that
# call text.texts_from_df / text.text_classifier.
test_tweets = df_test['text'].tolist()
predicted_labels = predictor.predict(test_tweets)

# Map class names to integers: the label 'target' becomes 1, anything else 0.
# y_list is rebuilt from scratch so re-running this cell cannot duplicate rows.
y_list = [1 if label == 'target' else 0 for label in predicted_labels]

In [None]:
len(y_list)


In [None]:
# Distribution of the predicted classes. The values are passed by keyword:
# seaborn >= 0.12 accepts only `data` positionally.
sns.countplot(x=y_list);

- Reading the submission file

In [None]:
sample_sub=pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
# Build the submission from the sample file's ids and the 0/1 predictions.
# The predictions are stored in `y_list`; the previously referenced name
# `y_pre` is never defined and raised a NameError.
sub = pd.DataFrame({'id': sample_sub['id'].values.tolist(), 'target': y_list})
sub.to_csv('submission.csv', index=False)

In [None]:
sub.head()

# If you like the kernel, please upvote it