## Importing Data and Dependencies

In [None]:
#Libraries
import numpy as np
import pandas as pd
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms

#Data
# Read the three CSV files in a fixed order: train, test, sample submission.
train, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
        '../input/nlp-getting-started/sample_submission.csv',
    )
)

## Target value :

### 1. Target Value Missing data

In [None]:
#Missing Value
# Count the null entries in the target column before reporting them.
n_missing_target = train.target.isnull().sum()
print('Number of Missing Values in Target feature: {}'.format(n_missing_target))

### 2. Target Value Distribution

In [None]:
#Distribution
canv, axs = plt.subplots(1,2,figsize=(22,8))
color = ['darkgreen','darkslategrey']

# Count the rows per target class once and reuse the result for both panels.
# groupby orders the groups by target value, so the first entry is the class
# labelled 'Not Disaster' and the second the class labelled 'Disaster'.
target_counts = train.groupby('target').count()['id']
target_share = target_counts / target_counts.sum() * 100
class_names = ['Not Disaster','Disaster']

# Derive the percentages shown in the pie labels from the data instead of
# hard-coding them, so the labels stay correct if the data changes.
pie_labels = ['{} ({:.0f}%)'.format(name,share) for name,share in zip(class_names,target_share)]

pie_ax, bar_ax = axs

# Left panel: class share as a pie chart.
pie_ax.pie(target_counts,explode=(0.1,0),startangle=120,colors=color,
    textprops={'fontsize':15},labels=pie_labels)

# Right panel: absolute class counts as bars.
bars = bar_ax.bar([0,0.5],target_counts,width=0.3,color=color)
bar_ax.set_xticks([0,0.5])
bar_ax.set_xticklabels(class_names)
bar_ax.tick_params(axis='both',labelsize=15,size=0,labelleft=False)

# Hide the frame; the counts are written inside the bars instead of on an axis.
for sp in bar_ax.spines.values():
    sp.set_visible(False)

# Write each count inside its bar, horizontally centred on the bar.
for bar,val in zip(bars,target_counts):
    bar_ax.text(bar.get_x()+bar.get_width()/2,bar.get_height()-250,val,color='w',ha='center',
                fontdict={'fontsize':18,'fontweight':'bold'})

canv.suptitle('Target Value Distribution in Training Data',fontsize=18);

## Missing Values

In [None]:
#Train Data
# Percentage of missing entries per column, keeping only columns that have
# at least one missing value, largest ratio first.
train_na = train.isnull().sum().div(len(train)).mul(100)
train_na = train_na.loc[train_na != 0].sort_values(ascending=False)

pd.DataFrame({'Train Missing Ratio': train_na}).head(3)

In [None]:
#Test Data
# Percentage of missing entries per column, keeping only columns that have
# at least one missing value, largest ratio first.
test_na = test.isnull().sum().div(len(test)).mul(100)
test_na = test_na.loc[test_na != 0].sort_values(ascending=False)

pd.DataFrame({'Test Missing Ratio': test_na}).head(3)

In [None]:
#Visualizing Missing Values
# One bar chart per data set, side by side: missing-value percentage per feature.
data = [train_na,test_na]
canv, axs = plt.subplots(1,2,figsize=(18,5))
for ax, dat, title in zip(axs,data,('Train','Test')):
    sns.barplot(x=dat.index, y=dat,dodge=False,ax=ax)
    ax.set_xlabel('Features', fontsize=15,labelpad=10)
    ax.set_ylabel('Percent of missing values', fontsize=15,labelpad=13)
    ax.set_title('Percent missing data by feature in {} Data'.format(title), fontsize=15,pad=20)
    ax.tick_params(axis='both',labelsize=12)

    # Drop the top and right frame lines for a cleaner look.
    for side in ('top','right'):
        ax.spines[side].set_visible(False)

## Handling Missing Data:
The training and test sets have nearly the same ratio of missing values in `keyword` and `location`.
* **0.8%** of `keyword` is missing in both training and test set
* **33%** of `location` is missing in both training and test set

Since the missing-value ratios of the training and test sets are so close, **they are most probably taken from the same sample**. Missing values in both features are filled with the placeholder string `'None'`.

In [None]:
#Filling Missing Data
# Assign the filled columns back to the frame. Calling fillna(inplace=True) on
# a column selection (df[col]) acts on an intermediate object; with pandas
# copy-on-write that object is a copy, so the original frame would stay
# unchanged. Explicit assignment works on every pandas version.
fill_cols = ['keyword','location']
for df in [train,test]:
    df[fill_cols] = df[fill_cols].fillna('None')

## Feature Engineering:

### Helper Function to add Features

In [None]:
#Function
def add_feature(X, feature_to_add):
    """
    Append ``feature_to_add`` to ``X`` as one extra column.

    ``feature_to_add`` is converted to a sparse matrix and transposed before
    being stacked horizontally to the right of ``X``. The stacked result is
    returned in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    # Build the new feature as a sparse matrix and transpose it so it can be
    # stacked next to the existing columns of X.
    extra_column = csr_matrix(feature_to_add).T
    return hstack([X, extra_column], format='csr')


### List of Features:
- TFIDF with 1-5 ngrams and Min DF of 5
- Word Count
- Unique Word Count
- Character Count
- Number of Hashtags
- Number of Tagged People('@')
- Number of Outlinks
- Number of Non-WordCharacters
- Number of Digits(Stats)

In [None]:
#Target Value
# The label column is the target; the raw text column is the only model input.
y = train['target']
X = train['text']

#Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### 1. TFIDF

In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in information retrieval searches, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word.

![TFIDF](https://www.researchgate.net/profile/Haider_Al-Khateeb2/publication/291950178/figure/fig1/AS:330186932408324@1455734107458/Term-Frequency-Inverse-Document-Frequency-TF-IDF.png)

- We will use a minimum document frequency of 5, which means an n-gram must appear in at least 5 documents (instances) in the data to enter the vocabulary of the vectorizer
- And we will use n-grams of 1 to 5 words (`ngram_range=(1,5)`)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the vectorizer on the training split only, then apply the same fitted
# vectorizer to both splits.
tfidf = TfidfVectorizer(ngram_range=(1,5),min_df=5)
tfidf.fit(X_train)

#train
X_train_vect = tfidf.transform(X_train)

#test
X_test_vect = tfidf.transform(X_test)

### 2. Word Count

In [None]:
def _word_count(text):
    """Number of whitespace-separated tokens in ``text``."""
    return len(str(text).split())

# NOTE: each run of this cell appends one more column to both matrices.
X_train_vect = add_feature(X_train_vect, X_train.map(_word_count))
X_test_vect = add_feature(X_test_vect, X_test.map(_word_count))

### 3. Unique Words

In [None]:
def _unique_word_count(text):
    """Number of distinct whitespace-separated tokens in ``text``."""
    tokens = str(text).split()
    return len(set(tokens))

# NOTE: each run of this cell appends one more column to both matrices.
X_train_vect = add_feature(X_train_vect, X_train.map(_unique_word_count))
X_test_vect = add_feature(X_test_vect, X_test.map(_unique_word_count))

### 4. Character Count

In [None]:
# Length of each text in characters, after converting the value to str.
train_char_count = X_train.map(str).map(len)
test_char_count = X_test.map(str).map(len)

X_train_vect = add_feature(X_train_vect, train_char_count)
X_test_vect = add_feature(X_test_vect, test_char_count)

### 5. Number of Hashtags

In [None]:
# Occurrences of '#' in each text.
train_hashtags = X_train.map(lambda tweet: tweet.count('#'))
test_hashtags = X_test.map(lambda tweet: tweet.count('#'))

X_train_vect = add_feature(X_train_vect, train_hashtags)
X_test_vect = add_feature(X_test_vect, test_hashtags)

### 6. Number of Tagged People('@')

In [None]:
# Occurrences of '@' in each text.
train_mentions = X_train.map(lambda tweet: tweet.count('@'))
test_mentions = X_test.map(lambda tweet: tweet.count('@'))

X_train_vect = add_feature(X_train_vect, train_mentions)
X_test_vect = add_feature(X_test_vect, test_mentions)

### 7. Number of Out Links

In [None]:
# Occurrences of the substring 'http' in each text, used as the link count.
train_links = X_train.map(lambda tweet: tweet.count('http'))
test_links = X_test.map(lambda tweet: tweet.count('http'))

X_train_vect = add_feature(X_train_vect, train_links)
X_test_vect = add_feature(X_test_vect, test_links)

### 8. Number of Non-WordCharacters

In [None]:
# Character class of the punctuation / symbol characters that are counted.
non_word_pattern = r'[\\/!?,\.:=<>^-]'

X_train_vect = add_feature(X_train_vect, X_train.str.count(non_word_pattern))
X_test_vect = add_feature(X_test_vect, X_test.str.count(non_word_pattern))

### 9. Number of Digits

In [None]:
# Regex matching a single digit; every match in a text counts once.
digit_pattern = r'\d'

X_train_vect = add_feature(X_train_vect, X_train.str.count(digit_pattern))
X_test_vect = add_feature(X_test_vect, X_test.str.count(digit_pattern))

## Looking at the final Data

In [None]:
# Number of Features
# The last dimension of the matrix is its column (feature) count.
n_features = X_train_vect.shape[-1]
print('The Number of Features in the Processed Data: {}'.format(n_features))

In [None]:
# Let's look at some of the vocabulary from the Tfidf
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = tfidf.get_feature_names_out()
else:
    vocab = tfidf.get_feature_names()

# Convert to plain str so the printed list looks the same whether the
# vectorizer returned a list or a NumPy array.
print([str(term) for term in vocab[350:420]])

## Model Building

1. Multilayer Perceptron Classifier

### 1. MLPClassifier

MLPClassifier stands for Multi-layer Perceptron classifier; as the name suggests, it is a neural network. Unlike other classification algorithms such as Support Vector Machines or the Naive Bayes classifier, MLPClassifier relies on an underlying neural network to perform the task of classification.

![MLP](https://s3.amazonaws.com/stackabuse/media/intro-to-neural-networks-scikit-learn-3.png)

In [None]:
#Metric and Model
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score,make_scorer,accuracy_score,roc_auc_score,log_loss

#Creating Log Loss Scorer
# The scorer previously wrapped f1_score while asking for probabilities, which
# f1_score cannot consume. log_loss is the metric that takes predicted
# probabilities; it is a loss, so lower is better (greater_is_better=False).
# NOTE(review): `needs_proba` is kept from the original call; newer
# scikit-learn releases replace it with response_method='predict_proba'.
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

#Two Classifier
# Same architecture settings, different solvers. random_state is fixed so that
# weight initialisation, and therefore the grid-search results, are repeatable.
# NOTE(review): per the scikit-learn docs `learning_rate` only affects
# solver='sgd'; it is kept on the adam model to mirror the sgd one.
mlp1 = MLPClassifier(max_iter=200,verbose=False,solver='sgd',activation='relu',learning_rate='adaptive',random_state=42)
mlp2 = MLPClassifier(max_iter=200,verbose=False,solver='adam',activation='relu',learning_rate='adaptive',random_state=42)

#Parameters for tuning
# Grid explored by GridSearchCV: hidden-layer layouts x L2 penalty strengths.
parameter_space = {
      'hidden_layer_sizes': [(50,50,50), (50,100,50),(100,100,100),(100,100),(500,500)],
      'alpha': [0.0001, 0.05],
  }

### Hyperparameter Tuning

In [None]:
#Tuning the First Classifier
# 3-fold cross-validated grid search over parameter_space, scored with F1.
clf1 = GridSearchCV(
    mlp1,
    parameter_space,
    scoring='f1',
    cv=3,
    verbose=2,
)
clf1.fit(X_train_vect,y_train)

In [None]:
#Tuning the Second Classifier
# 3-fold cross-validated grid search over parameter_space, scored with F1.
clf2 = GridSearchCV(
    mlp2,
    parameter_space,
    scoring='f1',
    cv=3,
    verbose=2,
)
clf2.fit(X_train_vect,y_train)

In [None]:
#Results from the first GridSearch
# Keep only the tuned parameters and the CV score columns, best score first.
sgd_result_cols = ['param_hidden_layer_sizes','param_alpha','mean_test_score','std_test_score']
(
    pd.DataFrame(data=clf1.cv_results_,columns=sgd_result_cols)
    .sort_values('mean_test_score',ascending=False)
    .reset_index(drop=True)
    .head(10)
)

In [None]:
#Results from the Second GridSearch
# Keep only the tuned parameters and the CV score columns, best score first.
adam_result_cols = ['param_hidden_layer_sizes','param_alpha','mean_test_score','std_test_score']
(
    pd.DataFrame(data=clf2.cv_results_,columns=adam_result_cols)
    .sort_values('mean_test_score',ascending=False)
    .reset_index(drop=True)
    .head(10)
)

In [None]:
# Report the parameters of the estimator selected by the second grid search.
# They are read from the fitted search object rather than typed in by hand, so
# the printed values always match the model that is evaluated below.
reported_params = ['alpha','hidden_layer_sizes','max_iter','solver','activation','learning_rate']
fitted_params = clf2.best_estimator_.get_params()
prm = {name: fitted_params[name] for name in reported_params}
print('The Best Parameters are:\n')
for c,(name,value) in enumerate(prm.items(),start=1):
    print('{}. {} = {}'.format(c,name,value))

#Final Accuracy and F1 Score on the held-out data
# Predict once and reuse the predictions for both metrics.
y_test_pred = clf2.predict(X_test_vect)
print('\n\nAnd the Accuracy of the Final Model on Test Data: {:.3f}'.format(accuracy_score(y_test,y_test_pred)))
print('\n\nAnd the F1 score of the Final Model on Test Data: {:.3f}'.format(f1_score(y_test,y_test_pred)))