> Import Libraries

In [None]:
!pip install ktrain # for BERT model

In [None]:
import pandas as pd
import numpy as np
import spacy
import ktrain
import re
import string
import os
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
from wordcloud import WordCloud
from textblob import TextBlob

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [None]:
# Load the labelled training tweets and preview a random handful of rows.
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')
train_df.sample(5)

In [None]:
# Load the test tweets (the rows to predict) and preview a random handful.
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')
test_df.sample(5)

In [None]:
# Class balance: number of training rows per value of `target`.
g = sns.countplot(data=train_df, x='target')

> Convert to lowercase

In [None]:
# Show the first two rows before the text column is lowercased, for comparison.
train_df.head(2)

In [None]:
# Coerce every entry to str and lowercase it, for both splits, so the
# lowercase-keyed lookups used later in the notebook can match.
train_df['text'] = train_df['text'].apply(lambda tweet: str(tweet).lower())
test_df['text'] = test_df['text'].apply(lambda tweet: str(tweet).lower())
train_df.head(2)

> Contraction to Expansion

In [None]:
# Lookup table: contraction / slang token -> expanded form (all lowercase).
#
# Keys padded with spaces (" u ", " dis ", ...) are short slang tokens that must
# only match when they stand alone; without the padding a plain substring
# replacement would also rewrite the inside of longer words
# (e.g. "dis" inside "disaster").
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how does",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    " u ": " you ",
    " ur ": " your ",
    " n ": " and ",
    # "won't" contracts "will not" (it was previously mapped to "would not").
    "won't": "will not",
    # Slang tokens are space-padded like " u " above so they never match inside
    # a longer word such as "disaster" or "baked".
    " dis ": " this ",
    " bak ": " back ",
    " brng ": " bring ",
}

> Expansion 

In [None]:
def cont_to_exp(x, mapping=None):
    """Expand contractions / slang tokens in ``x`` using whole-token matching.

    Plain ``str.replace`` rewrites substrings, so a key such as ``"dis"`` would
    turn "disaster" into "thisaster", and "can't" would be expanded before the
    longer "can't've" gets a chance to match. This implementation instead:

    * matches each key only as a complete token,
    * tries longer keys before shorter ones,
    * performs all replacements in a single pass, so an expansion is never
      re-scanned and expanded again.

    Parameters
    ----------
    x : object
        Text to expand. Anything that is not a ``str`` is returned unchanged.
    mapping : dict, optional
        ``token -> expansion`` table. When omitted, the notebook-level
        ``contractions`` dict is used. A key padded with a space on one side
        (e.g. ``" u "``) must be delimited by whitespace (or the start/end of
        the string) on that side; an unpadded side only needs a non-word
        character there, so ``"can't,"`` still matches. The padding itself is
        not part of the match or of the replacement.

    Returns
    -------
    object
        The expanded string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    if mapping is None:
        mapping = contractions

    expansions = {}
    alternatives = []
    # Longest token first: regex alternation takes the first alternative that
    # matches at a position, so "can't've" must be listed before "can't".
    for key in sorted(mapping, key=lambda k: len(k.strip()), reverse=True):
        token = key.strip()
        if not token or token in expansions:
            continue
        expansions[token] = mapping[key].strip()
        left = r'(?<!\S)' if key[0].isspace() else r'(?<!\w)'
        right = r'(?!\S)' if key[-1].isspace() else r'(?!\w)'
        alternatives.append(left + re.escape(token) + right)

    if not alternatives:
        return x

    # A callable replacement avoids backslash/group interpretation of the value.
    return re.sub('|'.join(alternatives), lambda match: expansions[match.group(0)], x)

In [None]:
# Expand contractions in both splits; the function is passed directly since
# it already takes the cell value as its only required argument.
train_df['text'] = train_df['text'].apply(cont_to_exp)
test_df['text'] = test_df['text'].apply(cont_to_exp)

> Count number of words, characters, hashtags and mentions

In [None]:
def wordcount(x):
    """Return the number of whitespace-separated tokens in ``str(x)``."""
    return len(str(x).split())

In [None]:
def charcount(x):
    """Return the number of non-whitespace characters in the string ``x``."""
    return sum(len(piece) for piece in x.split())

def hashtag_count(x):
    """Count the whitespace-separated tokens of ``x`` that begin with '#'."""
    return sum(1 for token in x.split() if token.startswith('#'))

def mentions_count(x):
    """Count the whitespace-separated tokens of ``x`` that begin with '@'."""
    return sum(1 for token in x.split() if token.startswith('@'))

In [None]:
# Per-tweet count features for the training split, computed from the
# lowercased, contraction-expanded text.
train_df['char_count'] = train_df['text'].apply(charcount)
train_df['word_count'] = train_df['text'].apply(wordcount)
train_df['hashtag_count'] = train_df['text'].apply(hashtag_count)
train_df['mention_count'] = train_df['text'].apply(mentions_count)
train_df.head(2)

In [None]:
# One wide figure with three side-by-side panels, each a bar plot of a count
# feature against `target`. `mention_count` is computed above but not plotted.
plt.figure(figsize=(20,4))
# Panel 1: character count per target value.
plt.subplot(1,3,1)
sns.barplot(y='char_count',x='target',data=train_df)
# Panel 2: word count per target value.
plt.subplot(1,3,2)
sns.barplot(y='word_count',x='target',data=train_df)
# Panel 3: hashtag count per target value; assigning the result keeps the
# axes repr from being echoed as the cell output.
plt.subplot(1,3,3)
g=sns.barplot(y='hashtag_count',x='target',data=train_df)

In [None]:
# Same per-tweet count features for the test split.
test_df['char_count'] = test_df['text'].apply(charcount)
test_df['word_count'] = test_df['text'].apply(wordcount)
test_df['hashtag_count'] = test_df['text'].apply(hashtag_count)
test_df['mention_count'] = test_df['text'].apply(mentions_count)
test_df.head(2)

> Remove emails, URLs, retweet markers, special characters, accents and stopwords

In [None]:
def remove_emails(x):
    """Delete every lowercase e-mail-like substring (``local@domain.tld``) from ``x``.

    Only the address is removed; surrounding whitespace is left as is.
    """
    email_pattern = r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)'
    return re.sub(email_pattern, "", x)


def remove_urls(x):
    """Delete http/https/ftp/ssh URLs (scheme, dotted host, optional path/query) from ``x``."""
    url_pattern = r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    return re.sub(url_pattern, '', x)

def remove_rt(x):
    """Drop the standalone token ``rt`` from ``x`` and trim the ends of the result."""
    without_rt = re.sub(r'\brt\b', '', x)
    return without_rt.strip()

def remove_special_chars(x):
    """Keep only word characters and spaces in ``x``, then collapse whitespace runs.

    Anything that is neither ``\\w`` nor a literal space is deleted (not
    replaced by a space), so ``"a\\tb"`` becomes ``"ab"``.
    """
    cleaned = re.sub(r'[^\w ]+', "", x)
    return ' '.join(cleaned.split())


def remove_accented_chars(x):
    """Return ``x`` reduced to ASCII: NFKD-decompose, then drop every non-ASCII code point."""
    decomposed = unicodedata.normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def remove_stopwords(x):
    """Return ``x`` with every whitespace-separated token found in ``stopwords`` removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept = (token for token in x.split() if token not in stopwords)
    return ' '.join(kept)



In [None]:
# Cleaning pipeline for the training split. The order matters: e-mails and
# URLs are removed while their punctuation is still present, before
# special characters are stripped.
train_df['text'] = (
    train_df['text']
    .apply(remove_emails)
    .apply(remove_urls)
    .apply(remove_rt)
    .apply(remove_special_chars)
    .apply(remove_accented_chars)
    .apply(remove_stopwords)
)

In [None]:
# Same cleaning pipeline, in the same order, for the test split.
test_df['text'] = (
    test_df['text']
    .apply(remove_emails)
    .apply(remove_urls)
    .apply(remove_rt)
    .apply(remove_special_chars)
    .apply(remove_accented_chars)
    .apply(remove_stopwords)
)

> Wordcloud Visualization 

In [None]:
# Word cloud of all disaster tweets (target == 1).
# The joined string gets its own name rather than `text`: this notebook later
# binds `text` to the ktrain module (`from ktrain import text`), and re-running
# this cell afterwards would overwrite that module with a plain string.
disaster_text = ' '.join(train_df[train_df.target==1]['text'])
word_cloud = WordCloud(max_font_size=100).generate(disaster_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.title('Disaster tweets')
plt.show()

In [None]:
# Word cloud of all non-disaster tweets (target == 0).
# The joined string gets its own name rather than `text`: this notebook later
# binds `text` to the ktrain module (`from ktrain import text`), and re-running
# this cell afterwards would overwrite that module with a plain string.
not_disaster_text = ' '.join(train_df[train_df.target==0]['text'])
word_cloud = WordCloud(max_font_size=100).generate(not_disaster_text)
plt.imshow(word_cloud)
plt.title('Not Disaster tweets')
plt.axis('off')
plt.show()

> After removing frequent words

In [None]:
def get_value_counts(df, col):
    """Return token frequencies over the whole string column ``df[col]``.

    All rows are joined with spaces and split on whitespace; the result is a
    Series indexed by token, sorted by descending count.
    """
    tokens = ' '.join(df[col]).split()
    return pd.Series(tokens).value_counts()

def remove_common_words(x, freq, n=20):
    """Return ``x`` without the tokens that appear among the first ``n`` entries of ``freq``.

    ``freq`` is sliced positionally (``freq[:n]``) and membership is tested
    with ``in``; for a pandas Series that checks the index labels, i.e. the
    tokens themselves.
    """
    top_words = freq[:n]
    kept = filter(lambda token: token not in top_words, x.split())
    return ' '.join(kept)

In [None]:
# Token frequencies over the cleaned training text (most frequent first).
freq = get_value_counts(df=train_df, col='text')

In [None]:
# Drop the 100 most frequent training tokens from BOTH splits. The vocabulary
# comes from the training text only, but the test text must go through the
# same removal; otherwise the model is trained on text without these words and
# then asked to predict on text that still contains them.
train_df['text'] = train_df['text'].apply(lambda x: remove_common_words(x, freq, 100))
test_df['text'] = test_df['text'].apply(lambda x: remove_common_words(x, freq, 100))

In [None]:
# Word cloud of disaster tweets (target == 1) after frequent-word removal.
# The joined string gets its own name rather than `text`: this notebook later
# binds `text` to the ktrain module (`from ktrain import text`), and re-running
# this cell afterwards would overwrite that module with a plain string.
disaster_text = ' '.join(train_df[train_df.target==1]['text'])
word_cloud = WordCloud(max_font_size=100).generate(disaster_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.title('Disaster tweets')
plt.show()

In [None]:
# Word cloud of non-disaster tweets (target == 0) after frequent-word removal.
# The joined string gets its own name rather than `text`: the very next code
# cell binds `text` to the ktrain module (`from ktrain import text`), and
# re-running this cell afterwards would overwrite that module with a string.
not_disaster_text = ' '.join(train_df[train_df.target==0]['text'])
word_cloud = WordCloud(max_font_size=100).generate(not_disaster_text)
plt.imshow(word_cloud)
plt.axis('off')
plt.title('Not Disaster tweets')
plt.show()

# BERT

In [None]:
from ktrain import text

In [None]:
# Turn the cleaned training frame into BERT-ready arrays plus the fitted
# preprocessor. Sequences are capped at 50 tokens.
# NOTE(review): no separate validation frame is passed, so (X_test, y_test)
# presumably comes from ktrain's own split of train_df, not from test_df;
# confirm, and consider fixing the split seed for reproducibility.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=train_df,
    text_column='text',
    label_columns='target',
    maxlen=50,
    preprocess_mode='bert',
)

In [None]:
# Build the BERT text classifier for the preprocessed training data.
model = text.text_classifier(
    name='bert',
    train_data=(X_train, y_train),
    preproc=preproc,
)

In [None]:
# Wrap model and data in a ktrain learner, then train for three epochs with
# the one-cycle learning-rate policy.
# NOTE(review): 2e-4 is on the high side for fine-tuning BERT; values around
# 2e-5 are more commonly used. Confirm this rate was chosen deliberately.
learner = ktrain.get_learner(
    model=model,
    train_data=(X_train, y_train),
    val_data=(X_test, y_test),
    batch_size=32,
)
learner.fit_onecycle(lr=2e-4, epochs=3)

In [None]:
# Bundle the trained model with its preprocessor into a predictor object,
# which the submission cell below uses to score the test tweets.
predictor = ktrain.get_predictor(learner.model, preproc)

In [None]:
# Class labels known to the predictor; the position of a label in this list
# is what gets written as the numeric `target` in the submission.
classes=predictor.get_classes()

In [None]:
# Build the submission file: one row per test tweet with its id and the
# predicted class index.
#
# * All tweets are scored in a single batched `predict` call instead of one
#   call per row.
# * Labels are requested directly (no `return_proba=True`). Per the ktrain
#   documentation that flag makes `predict` return class probabilities, which
#   cannot be looked up in `classes`; the probability was unused anyway.
# * The frame is built once. Growing it with `DataFrame.append` inside a loop
#   is quadratic, and `append` no longer exists in pandas >= 2.0.
test_texts = test_df['text'].tolist()
predicted_labels = predictor.predict(test_texts)

output = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    # Position of the predicted label in `classes` is the numeric target.
    'target': [classes.index(label) for label in predicted_labels],
})
output = output.astype({'target': 'int32'})
output.to_csv('submission_bert.csv', index=False)