In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install beautifulsoup4 

In [None]:
# Basic imports
import numpy as np
import pandas as pd 
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from collections import Counter, defaultdict
import altair as alt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import f1_score
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

#### Datasets

In [None]:
# Competition files: the labelled set, the unlabelled set and the submission template.
train = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/test.csv')
sample_submission = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/sample_submission.csv')

#### Basic Cleaning pipeline

In [None]:
stop_words = set(stopwords.words('english'))

def text_cleaner(text,num):
    '''
        Normalise a raw tweet into a lowercase, letters-only string.

        Steps, in order:
        1. Lowercase the text
        2. Strip HTML markup with BeautifulSoup
        3. Drop parenthesised fragments
        4. Drop whole URLs, then any leftover literal "http"
        5. Drop double quotes and possessive 's
        6. Replace every non-letter character with a space
        7. Collapse runs of two or more "m" into "mm"
        8. Remove English stop words when num == 0
        9. Discard single-character tokens

        Parameters:
            text: raw text; a non-string value (None, NaN) is treated as empty
            num: 0 removes stop words, any other value keeps them
        Returns: String
    '''
    # Missing values reach this function as float NaN / None; .lower() would raise on them.
    if not isinstance(text, str):
        return ""
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    newString = re.sub(r'\([^)]*\)', '', newString)
    # Remove the whole link first: stripping only "http" leaves the host and
    # short-code (e.g. "co", "abcd") behind as meaningless tokens.
    newString = re.sub(r"(?:https?://|www\.)\S+", " ", newString)
    newString = re.sub(r"http", "", newString)
    newString = re.sub('"','', newString)
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = re.sub('[m]{2,}', 'mm', newString)
    tokens = newString.split()
    if num == 0:
        tokens = [w for w in tokens if w not in stop_words]
    # Single letters are residue of the punctuation/digit stripping above.
    return " ".join(w for w in tokens if len(w) > 1)

# Cleaning raw tweets (num=1 keeps stop words)
cleaned_text_train = [text_cleaner(t, 1) for t in train['text']]
cleaned_text_test = [text_cleaner(t, 1) for t in test['text']]

train['text'] = cleaned_text_train
test['text'] = cleaned_text_test


def _letters_only(value):
    '''Stringify value and replace every non-letter character with a space.'''
    return re.sub("[^a-zA-Z]", " ", str(value))


def _add_keyword_location_features(df):
    '''
        Clean `keyword` and `location` in place and build the `full_text` column.

        A missing keyword/location is stringified to the literal 'nan' and is
        left that way in the cleaned columns, because the analysis cells filter
        on that placeholder. The placeholder is dropped only when assembling
        `full_text`, by comparing whole cell values. The previous substring
        replacement of 'nan' also cut it out of real words such as "financial".

        Parameters: DataFrame with `keyword`, `location` and cleaned `text` columns
        Returns: None (df is modified in place)
    '''
    df['keyword'] = df['keyword'].apply(lambda x: lemmatizer.lemmatize(_letters_only(x).lower()))
    df['location'] = df['location'].apply(_letters_only)
    keyword = df['keyword'].where(df['keyword'] != 'nan', '')
    location = df['location'].where(df['location'] != 'nan', '')
    df['full_text'] = keyword + ' ' + location + ' ' + df['text']


# Cleaning keywords and locations
_add_keyword_location_features(train)
_add_keyword_location_features(test)

**Comparison of Disaster and Non Disaster tweet counts**

In [None]:
# Tweets per class. rename_axis('index') pins the label column to the name 'index' on every
# pandas version; pandas >= 2.0 would otherwise call it 'target' and break the encodings below.
temp0 = train.target.value_counts().rename_axis('index').reset_index(name='count')
alt.Chart(temp0, title='Non Disaster vs Disaster tweet count').mark_bar().encode(
    x='index:O',
    y='count',
    color='index:O'
).properties(width=400)

#### New Dataframe to analyze importance of keywords and locations 

In [None]:
# Rows whose keyword and location are both present; 'nan' is the stringified missing value.
no_missing = train[train['keyword'].ne('nan') & train['location'].ne('nan')]

**Top 20 keywords in the dataset**

In [None]:
# 20 most frequent keywords. rename_axis('index') pins the label column to 'index' on every
# pandas version; pandas >= 2.0 would otherwise call it 'keyword' and break alt.X('index').
temp1 = no_missing['keyword'].value_counts().head(20).rename_axis('index').reset_index(name='count')
alt.Chart(temp1, title='Top 20 keywords in tweets').mark_bar().encode(
    alt.X('index', axis=alt.Axis(labelAngle=-45)),
    y='count'
).properties(width=700).configure_axis(
    labelFontSize=15,
    titleFontSize=15
)

Fatality, weapon, siren and death are top keywords but not necessarily disaster indicators. Keywords like flood and wildfire, however, do indicate a disaster. We will compare keyword occurrence in disaster and non-disaster tweets later on.

**Top 20 locations of tweets**

In [None]:
# 20 most frequent locations. rename_axis('index') pins the label column to 'index' on every
# pandas version; pandas >= 2.0 would otherwise call it 'location' and break alt.X('index').
temp2 = no_missing['location'].value_counts().head(20).rename_axis('index').reset_index(name='count')
alt.Chart(temp2, title='Top 20 locations of tweets').mark_bar().encode(
    alt.X('index', axis=alt.Axis(labelAngle=-45)),
    y='count'
).properties(width=700).configure_axis(
    labelFontSize=15,
    titleFontSize=15
)

A lot of tweets come from unknown places. USA tops the location references in tweets. Some of the locations could be corrected: unknown locations can be classified as worldwide, and locations with different spellings or abbreviations can be grouped together.

**Most frequently occurring keyword and location combinations**

In [None]:
# The 20 largest (keyword, location) groups among rows that have both fields.
temp3 = (
    no_missing
    .groupby(['keyword', 'location'])
    .size()
    .nlargest(20)
    .reset_index(name='count')
)
(
    alt.Chart(temp3, title='Top location and keyword combinations')
    .mark_circle(size=60)
    .encode(
        alt.X('location', axis=alt.Axis(labelAngle=-45)),
        y='count',
        color='keyword',
        tooltip=['location', 'keyword', 'count'],
    )
    .properties(width=700)
    .interactive()
    .configure_axis(labelFontSize=15, titleFontSize=15)
)

A keyword and location combination is a good indicator of disaster: for example, the USA has a lot of sandstorms, India has rail disasters, and Mumbai battles floods. Surprisingly, there is even such a thing as a "pedophile hunting ground".

**How good a keyword is indicator of disaster**

In [None]:
# The 40 largest (keyword, target) groups, drawn as one small bar panel per keyword.
temp4 = (
    no_missing
    .groupby(['keyword', 'target'])
    .size()
    .nlargest(40)
    .reset_index(name='count')
)
(
    alt.Chart(temp4, title='How good a keyword is indicator of disaster?')
    .mark_bar()
    .encode(x='target:O', y='count', color='target:N', column='keyword')
    .properties(width=40)
)

Some keywords are strong indicators of disasters, such as drought, earthquake, flood, airplaneaccident, buildingsonfire, etc.

**How good a location is indicator of disaster**

In [None]:
# The 30 largest (location, target) groups, drawn as one small bar panel per location.
temp5 = (
    no_missing
    .groupby(['location', 'target'])
    .size()
    .nlargest(30)
    .reset_index(name='count')
)
(
    alt.Chart(temp5, title='How good a location is indicator of disaster?')
    .mark_bar()
    .encode(x='target:O', y='count', color='target:N', column='location')
    .properties(width=40)
)

Some locations, like Australia, India and Mumbai, have most of their tweets related to disasters.

***Top keywords related to disasters***

In [None]:
# 20 most frequent keywords among disaster tweets (target == 1). rename_axis('index') pins the
# label column to 'index' on every pandas version; pandas >= 2.0 would otherwise call it 'keyword'.
temp6 = no_missing[no_missing['target']==1]['keyword'].value_counts().head(20).rename_axis('index').reset_index(name='count')
alt.Chart(temp6, title='Top keywords present in Disaster tweets').mark_bar().encode(
    alt.X('index', axis=alt.Axis(labelAngle=-45)),
    y='count'
).properties(width=700).configure_axis(
    labelFontSize=15,
    titleFontSize=15
)

## Modeling
We will not build a complex model or use neural nets for this dataset. A simple model should be good enough for a decent F1 score. We will try to understand the predictions of the simple model with the LIME explainer.

In [None]:
# Hold out 20% of the labelled tweets for validation, with a fixed seed.
list_labels = train["target"].tolist()
list_corpus = train["full_text"].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    list_corpus,
    list_labels,
    test_size=0.2,
    random_state=40,
)

# Word-level TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training split only.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    stop_words='english',
    binary=True,
    lowercase=True,
)
train_vectors = vectorizer.fit_transform(X_train)
val_vectors = vectorizer.transform(X_val)

In [None]:
# Class-balanced logistic regression, scored on the validation split with weighted F1.
logreg = LogisticRegression(
    C=30.0,
    class_weight='balanced',
    solver='newton-cg',
    n_jobs=-1,
    random_state=40,
)
pred = logreg.fit(train_vectors, y_train).predict(val_vectors)
f1 = f1_score(y_val, pred, average='weighted')
print("f1 score = %.3f" % (f1))

**Example 1**

In [None]:
# Display the text of validation tweet 205.
X_val[205]

In [None]:
# Chain vectorizer and model so LIME can map raw text straight to class probabilities.
c = make_pipeline(vectorizer, logreg)
# LIME matches class_names to predict_proba columns by position, so take the order from the
# fitted model instead of from the order in which labels happen to appear in the CSV.
class_names = list(logreg.classes_)
explainer = LimeTextExplainer(class_names=class_names)

idx = 205
exp = explainer.explain_instance(X_val[idx], c.predict_proba, num_features=6)
print('Tweet id: %d' % idx)
# predict() on a single row returns a length-1 array; the labels (0/1) double as list positions.
print('Predicted class =', class_names[logreg.predict(val_vectors[idx])[0]])
print('True class: %s' % class_names[y_val[idx]])

In [None]:
# `text` is a boolean switch for the highlighted-text panel; the explanation already holds the tweet.
exp.show_in_notebook(text=True)

**Example 2**

In [None]:
# Display the text of validation tweet 23.
X_val[23]

In [None]:
# Explain validation tweet 23 with the existing explainer and pipeline.
idx = 23
exp = explainer.explain_instance(
    X_val[idx],
    c.predict_proba,
    num_features=6,
)
print('Tweet id: %d' % idx)
# predict() on one row yields a length-1 array, so element 0 is the predicted label.
print('Predicted class =', class_names[logreg.predict(val_vectors[idx])[0]])
print('True class: %s' % class_names[y_val[idx]])

In [None]:
# `text` is a boolean switch for the highlighted-text panel; the explanation already holds the tweet.
exp.show_in_notebook(text=True)

**Submission**

In [None]:
# Vectorize the test tweets with the already-fitted TF-IDF vocabulary and predict their labels.
test_corpus = test["full_text"].tolist()
test_vectors = vectorizer.transform(test_corpus)
pred = logreg.predict(test_vectors)

# Put the predictions into the submission template and write it to the working directory.
sample_submission["target"] = pred
sample_submission.to_csv(path_or_buf="./submission.csv", index=False)