links:

* https://www.kaggle.com/tuckerarrants/disaster-tweets-eda-glove-rnns-bert
* https://www.kaggle.com/mariapushkareva/nlp-disaster-tweets-with-glove-and-lstm
* https://keras.io/guides/functional_api/
    

In [None]:
import time
from datetime import datetime

#measure notebook running time
start_time = time.time()

%matplotlib inline

import os, warnings
warnings.filterwarnings('ignore')
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from collections import Counter
import re
import string 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import scipy.sparse

from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, BatchNormalization, RNN, SimpleRNN, LSTM, GRU,Embedding, Bidirectional, GlobalMaxPool1D, Conv1D, MaxPooling1D, SpatialDropout1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import metrics, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.initializers import Constant

# import xgboost as xgb
# import lightgbm as lgb
# from sklearn.svm import SVC
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.cluster import KMeans
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay,plot_confusion_matrix, precision_score,recall_score, f1_score, classification_report, accuracy_score

sns.set(style='white', context='notebook', palette='deep', rc={'figure.figsize':(10,8)})
pd.set_option('max_colwidth', None)
print("loaded ...")

In [None]:
# Reproducibility
RANDOM_SEED = 13
def set_seed(sd):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random` module, NumPy's global generator and TensorFlow,
    and exports two environment variables.

    Parameters
    ----------
    sd : int
        Seed value shared by all generators.
    """
    import random  # local import: only needed here

    random.seed(sd)      # previously left unseeded
    np.random.seed(sd)   # `numpy.random.seed`, imported above as `seed`, is this same function
    tf.random.set_seed(sd)
    # PYTHONHASHSEED is read at interpreter start-up, so setting it here can only
    # influence processes started afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(sd)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(RANDOM_SEED)
print("random seed set as:", RANDOM_SEED)

# Data

In [None]:
# Load both competition files and tag every row with the split it came from.
TRAIN = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
TEST = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
TRAIN['Set'] = "Train"
TEST['Set'] = "Test"
TEST['target'] = -1  # placeholder label for the unlabelled test rows
TARGET = TRAIN['target']
# One frame for train + test so all preprocessing is applied once.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` gives the same result.
DATA = pd.concat([TRAIN, TEST])
DATA['OriginalText'] = DATA.text  # untouched copy, kept for inspection after cleaning
# Fresh 0..n-1 row labels; the old per-file labels are kept in an "index" column.
DATA = DATA.reset_index()
# Boolean row masks for the two splits.
_train = DATA.Set == "Train"
_test = DATA.Set == "Test"
#DATA

In [None]:
# Display the copy of the tweet text that was stored before any cleaning.
DATA.OriginalText

---

# Preprocessing and feature engineering

## Keywords

In [None]:
# Missing keywords become "", and the URL-encoded space ("%20") becomes a real space.
DATA.keyword = DATA.keyword.fillna("").str.replace("%20"," ", regex=True)
# Five most frequent non-empty keywords.
DATA.loc[DATA.keyword != "", "keyword"].value_counts().head()

In [None]:
def add_keyword(text, key):
    """Return `text` followed by a space and `key`; an empty `key` leaves `text` unchanged."""
    return text if key == "" else text + " " + key

# Row by row, append the keyword (when non-empty) to the tweet text.
DATA.text = DATA[['text','keyword']].apply(lambda row: add_keyword(*row), axis = 1)

## Locations (not improving score)

In [None]:
DATA.location = DATA.location.fillna("")
# After fillna("") no value is NaN any more, so the previous `notna()` filter kept every row
# and the empty string dominated the counts. Filter on "" instead, as the keyword cell does.
DATA.location[DATA.location != ""].value_counts().head()

In [None]:
#DATA.text = DATA[['text','location']].apply(lambda row: add_keyword(*row), axis = 1)

---

### Feature Engineering - Numerical

In [None]:
# Boolean mask over all rows of DATA: True where target == 1.
# Test rows carry the placeholder target -1, so they are always False here.
disaster = DATA['target'] == 1

In [None]:
# WC: number of whitespace-separated words per tweet.
DATA['WC'] = DATA.text.str.split().str.len()
fig, ax = plt.subplots(figsize = (8, 5))
# One density curve per class, training rows only.
for mask, label in ((~disaster, 'Not disaster'), (disaster, 'Disaster')):
    sns.kdeplot(DATA.WC[_train & mask], shade = True, label = label, ax = ax)
ax.set_title('Distribution of Word Count')
ax.legend()
plt.show()

In [None]:
# CC: number of characters per tweet.
DATA['CC'] = DATA.text.str.len()
fig, ax = plt.subplots(figsize = (8, 5))
# One density curve per class, training rows only.
for mask, label in ((~disaster, 'Not disaster'), (disaster, 'Disaster')):
    sns.kdeplot(DATA.CC[_train & mask], shade = True, label = label, ax = ax)
ax.set_title('Distribution of Character Count')
ax.legend()
plt.show()

In [None]:
# AWL: mean length of the whitespace-separated words of each tweet.
DATA['AWL'] = DATA.text.str.split().apply(lambda words: np.mean([len(w) for w in words]))
fig, ax = plt.subplots(figsize = (8, 5))
# One density curve per class, training rows only.
for mask, label in ((~disaster, 'Not disaster'), (disaster, 'Disaster')):
    sns.kdeplot(DATA.AWL[_train & mask], shade = True, label = label, ax = ax)
ax.set_title('Average word length')
ax.legend()
plt.show()

In [None]:
# PunctCount: how many characters of the tweet are in `string.punctuation`.
DATA['PunctCount'] = DATA.text.apply(lambda row: sum(ch in string.punctuation for ch in str(row)))
fig, ax = plt.subplots(figsize = (8, 5))
# One density curve per class, training rows only.
for mask, label in ((~disaster, 'Not disaster'), (disaster, 'Disaster')):
    sns.kdeplot(DATA.PunctCount[_train & mask], shade = True, label = label, ax = ax)
ax.set_title('Punctuation count')
ax.legend()
plt.show()

In [None]:
%%time
# Build the stop-word set once. The previous version called stopwords.words('english')
# for every single word of every tweet and then scanned the returned list linearly.
english_stopword_set = set(stopwords.words('english'))
# StopwordCount: lower-cased, whitespace-separated words that are English stop words.
DATA['StopwordCount'] = DATA.text.apply(
    lambda row: sum(w in english_stopword_set for w in row.lower().split())
)
# SWR: stop-word ratio = stop words / total words (WC comes from the word-count cell).
DATA["SWR"] = DATA['StopwordCount'] / DATA.WC
fig, ax = plt.subplots(figsize = (8, 5))
sns.kdeplot(DATA.SWR[_train & ~disaster], shade = True, label = 'Not disaster')
sns.kdeplot(DATA.SWR[_train & disaster], shade = True, label = 'Disaster')
plt.title('Stopword ratio (SWR)')
plt.legend()
plt.show()

In [None]:
# Engineered numeric columns that are standardised and later fed to the neural model.
numeric = ['CC', "AWL", 'PunctCount',"SWR"]

# NOTE: the scaler is fitted on train and test rows together, and the cell overwrites
# the columns it reads, so running it twice re-scales already scaled values.
scaler = StandardScaler()
scaler.fit(DATA[numeric])
DATA[numeric] = scaler.transform(DATA[numeric])
DATA[numeric].head()

In [None]:
# Correlation of the numeric features with each other and with the target (training rows only).
fig, ax = plt.subplots(figsize=(6,6))
numeric_corr = DATA.loc[_train, [*numeric,'target']].corr()
g = sns.heatmap(numeric_corr, annot=True, fmt = ".2f", cmap = "coolwarm", ax = ax)

---

## Text preprocessing

In [None]:
def get_tags(string):
    """Return the hashtags found in `string`, without the leading '#'.

    A hashtag is '#' followed by one or more word characters.

    Parameters
    ----------
    string : str
        Text to scan.

    Returns
    -------
    list of str
        Tag names in order of appearance (duplicates kept); empty if there are none.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return [tag.strip("#") for tag in re.findall(r"#\w+", string)]

def change_tags(string):
    """Rewrite every hashtag '#word' in `string` as ' TAG word'.

    The '#' is dropped and the marker ' TAG ' (with surrounding spaces) is put
    in front of the tag text; everything else is returned unchanged.
    """
    # One pass over the text: capture the tag text and re-emit it behind the marker.
    return re.sub(r'#(\w+)', r' TAG \1', string)

def get_mentions(string):
    """Return the user names mentioned in `string`, without the leading '@'.

    A mention is '@' followed by one or more word characters, so the part after
    the '@' of an e-mail address is picked up as well.

    Parameters
    ----------
    string : str
        Text to scan.

    Returns
    -------
    list of str
        Names in order of appearance (duplicates kept); empty if there are none.
    """
    # Raw string: "\w" inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about.
    return [mention.strip("@") for mention in re.findall(r"@\w+", string)]

# Filled on first use so that defining the function does not touch the NLTK corpus.
_STOP_WORD_CACHE = {}

def remove_stop_words(array):
    """Return the items of `array` that are not English stop words.

    The NLTK stop-word list is loaded once and kept as a set. The previous
    version reloaded the list for every token and scanned it linearly.

    Parameters
    ----------
    array : iterable of str
        Tokens to filter; the comparison is case-sensitive.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    english = _STOP_WORD_CACHE['english']
    return [a for a in array if a not in english]

lem = WordNetLemmatizer()
def lemmatize(array):
    """Return a list with every token of `array` passed through the shared WordNet lemmatizer."""
    return list(map(lem.lemmatize, array))

stm = PorterStemmer()
def portStem(array):
    """Return a list with every token of `array` reduced by the shared Porter stemmer."""
    return list(map(stm.stem, array))

def do_nothing(tokens):
    """Identity function: hand `tokens` back untouched.

    Passed to the vectorizers as both tokenizer and preprocessor.
    """
    return tokens

#thanks, https://www.kaggle.com/mariapushkareva/nlp-disaster-tweets-with-glove-and-lstm
# Compiled once at definition time instead of on every call.
# The original class contained the single range U+24C2-U+1F251, which also covers
# CJK, Hangul and many other ordinary letters and therefore deleted non-emoji text.
# It is replaced by the narrower symbol blocks listed below.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2702-\u27B0"          # dingbats
    "\u24C2"                 # circled M
    "\uFE0F"                 # emoji variation selector
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        `text` without the matched characters; nothing is inserted in their
        place, so surrounding whitespace is kept as it was.
    """
    return _EMOJI_PATTERN.sub('', text)

def decontracted(phrase):
    """Expand common English contractions in `phrase` and return the result.

    The substitutions run in the order listed below; the two irregular forms
    come before the general suffix rules so those rules cannot mangle them.
    """
    #thanks, https://stackoverflow.com/a/47091490/4154250
    rewrites = (
        # an apostrophe right after whitespace may just open a quotation
        (r"\s\'", " "),
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rewrites:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
%%time
#split CamelCase worsens the score
#DATA.text = DATA.text.apply(lambda row: " ".join(re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', row)).split()))

# Cleaning pipeline for DATA.text. The order matters: later steps rely on earlier ones.
DATA.text = DATA.text.apply(remove_emoji)
DATA.text = DATA.text.apply(decontracted)
DATA.text = DATA.text.str.lower()
# Collect the mentions before the "@name" handles are masked further down.
DATA['mentions'] = DATA.text.apply(get_mentions)
# Fixed: the "www." branch used `[\s]+` (whitespace) where `[^\s]+` (non-whitespace)
# was meant, so bare "www.example.com" links were never recognised.
DATA.text = DATA.text.str.replace(r'((www\.[^\s]+)|(https?://[^\s]+))',' <URL> ',regex=True)
DATA.text = DATA.text.apply(change_tags)
DATA.text = DATA.text.str.replace(r'@\w+',' USER ',regex=True)
# These two patterns match a literal backslash followed by "n" / "t" (two characters),
# not real newline / tab characters; those are collapsed by the `\s+` rule below.
DATA.text = DATA.text.str.replace(r'\\n','',regex=True)
DATA.text = DATA.text.str.replace(r'\\t','',regex=True)
# Fixed: `rt\b` also matched the tail of ordinary words ("alert" -> "aleretweet");
# anchoring both sides restricts it to the stand-alone token "rt".
DATA.text = DATA.text.str.replace(r'\brt\b','retweet',regex=True)
DATA.text = DATA.text.str.replace(r'&amp;',' and ',regex=True)
DATA.text = DATA.text.str.replace(r'&lt','',regex=True)
DATA.text = DATA.text.str.replace(r'&gt','',regex=True)

# Every character of this string is replaced by a space. This includes "<" and ">",
# so the " <URL> " marker inserted above ends up as " URL ".
remove = "..;''()ûò,&=!?-:|[]ã¢+ª*<>%$/+*ó_#ï÷@ìñ~/åêÛå"
for char in remove:
    DATA.text = DATA.text.str.replace(char,' ', regex=False)

DATA.text = DATA.text.str.replace(r'\d+',' NUMBER ', regex=True)
# Collapse runs of whitespace and trim both ends.
DATA.text = DATA.text.str.replace(r'\s+',' ', regex=True)
DATA.text = DATA.text.str.strip()

DATA.text.head(10)

In [None]:
#_ = DATA[~disaster]['text'][0:50].apply(lambda row: print(row))
#_ = DATA[disaster]['text'][0:50].apply(lambda row: print(row))
#_ = DATA['text'][70:80].apply(lambda row: print(row))
#DATA[disaster][['text','OriginalText']][0:50]

---

### Feature Engineering - Categorical

In [None]:
# DATA['traffic'] = DATA.text.apply(lambda row: 1 if 'traffic accident' in row else 0)
# DATA['traffic'].value_counts()

In [None]:
# g = sns.catplot(x="traffic",y="target",data=DATA[_train], kind="bar", height = 6, palette = "muted")
# g = g.set_ylabels("disaster probability - traffic")

In [None]:
# One 0/1 indicator column per term: 1 when the term occurs anywhere in the cleaned
# text (plain substring match), else 0. Replaces fourteen copy-pasted statements.
flag_terms = [
    'nsfw', 'dance', 'aftershock', 'zombie', 'fan army', 'ebay', 'armageddon',
    'battle', 'game', 'police', 'girlfriend', 'earthquake', 'flood', 'wildfire',
]
for term in flag_terms:
    DATA[term] = DATA.text.str.contains(term, regex=False).astype(int)

In [None]:
#categorical = ['nsfw','dance','aftershock','zombie','fan army','ebay','armageddon','game','police','girlfriend','earthquake','flood','wildfire']
# Indicator columns that are actually used as model features (a subset of the flags created above).
categorical = ['nsfw','dance','aftershock','fan army','ebay','girlfriend',]

In [None]:
%%time
def boxplot(x,y,**kwargs):
    """Draw a seaborn bar plot of `y` against `x` and rotate the x tick labels by 90 degrees.

    Despite its name, no box plot is drawn. `**kwargs` is accepted and never
    used; presumably it absorbs extra keyword arguments supplied by
    `FacetGrid.map` -- TODO confirm.
    """
    sns.barplot(x=x,y=y)
    _=plt.xticks(rotation=90)

# Long format: one row per (training tweet, indicator column) pair, keeping the target.
f = pd.melt(DATA[_train], id_vars=['target'], value_vars=categorical)
# One facet per indicator column; `boxplot` is called with the "value" and "target" columns.
g = sns.FacetGrid(f, col="variable",  col_wrap=6, sharex=False, sharey=True, height=4)
g = g.map(boxplot, "value", "target")

In [None]:
# Correlation of the indicator features with each other and with the target (training rows only).
fig, ax = plt.subplots(figsize=(10,10))
categorical_corr = DATA.loc[_train, [*categorical,'target']].corr()
g = sns.heatmap(categorical_corr, annot=True, fmt = ".2f", cmap = "coolwarm", ax = ax)

---

## Tokenization

In [None]:
# Split the cleaned text into tokens, then drop English stop words.
raw_tokens = DATA.text.apply(word_tokenize)
DATA['tokens'] = raw_tokens.apply(remove_stop_words)
DATA['tokens'].head(10)

In [None]:
# Lemmatize first, then stem the lemmatized tokens.
DATA['tokens'] = DATA['tokens'].apply(lambda toks: portStem(lemmatize(toks)))
DATA.head(10)

In [None]:
# Length of the longest token list over all rows (train and test).
maxLenTokens = max(map(len, DATA.tokens))
DATA.tokens.head(10)

## TokenText (experiment)

In [None]:
# Processed tokens joined back into one space-separated string per tweet.
DATA['TokenText'] = DATA.tokens.apply(" ".join)
DATA['TokenText'].head()

## Preprocessing for Keras

In [None]:
%%time
# Vocabulary is learned from the cleaned text of all rows (train and test).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(DATA.text)
texts_numeric = tokenizer.texts_to_sequences(DATA.text)
# Every sequence is padded at the end up to the length of the longest one.
maxLen = max(map(len, texts_numeric))
texts_pad = pad_sequences(texts_numeric, maxlen=maxLen, padding='post')
# One more than the number of known words; this is the row count of the embedding matrix.
vocab_length = 1 + len(tokenizer.word_index)
# Store one padded id array per row so the train/test masks can select them later.
DATA['texts_pad'] = list(texts_pad)
DATA['texts_pad'].head()

In [None]:
# %%time
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(DATA.TokenText)
# texts_numeric = tokenizer.texts_to_sequences(DATA.TokenText)
# maxLen = max([len(row) for row in texts_numeric])
# texts_pad = pad_sequences(texts_numeric, maxLen, padding='post')
# vocab_length = len(tokenizer.word_index) + 1
# DATA['texts_pad'] = list(texts_pad)
# DATA['texts_pad'].head()

### Embeddings

In [None]:
# %%time
# embeddings_dictionary = dict()
# embedding_dim = 100
# glove_file = open('../input/glove6b/glove.6B.100d.txt')
# for line in glove_file:
#     records = line.split()
#     word = records[0]
#     vector_dimensions = np.asarray(records[1:], dtype='float32')
#     embeddings_dictionary[word] = vector_dimensions
# glove_file.close()

In [None]:
%%time
# word -> 200-dimensional GloVe vector (float32)
embeddings_dictionary = dict()
embedding_dim = 200
# `with` closes the file even if parsing fails; the encoding is given explicitly
# instead of relying on the platform default.
with open('../input/glove6b/glove.6B.200d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # Each line: the word followed by its vector components, whitespace-separated.
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

In [None]:
%%time
# Row i holds the GloVe vector of the word with tokenizer index i;
# words missing from GloVe keep an all-zero row.
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in tokenizer.word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]
embedding_matrix.shape

## Setup

In [None]:
# Token lists of the training rows: input for the bag-of-words vectorizers.
X = DATA[_train]["tokens"]
# Padded id sequences for the neural model, one row per tweet: shape (n_rows, maxLen).
# The arrays are kept 2-D so they match the model's `Input(shape=(maxLen,))`; the
# previous version appended a trailing axis of size 1, which that input does not declare.
X_PAD = np.stack(DATA[_train]["texts_pad"])
TEST_PAD = np.stack(DATA[_test]["texts_pad"])

In [None]:
# Engineered features (scaled numeric columns + indicator columns) for each split.
feature_columns = [*numeric, *categorical]
train_features = DATA.loc[_train, feature_columns]
test_features = DATA.loc[_test, feature_columns]

### CountVectorizer

In [None]:
# The rows are already token lists, so tokenizer and preprocessor are both the identity
# function; unigrams and bigrams are counted.
count_vectorizer = CountVectorizer(ngram_range=(1,2), preprocessor=do_nothing, tokenizer=do_nothing) #0.80324
# The vocabulary is learned from the training rows only and then applied to the test rows.
TRAIN_VECTORS = count_vectorizer.fit_transform(X)
TEST_VECTORS = count_vectorizer.transform(DATA.loc[_test, "tokens"])

## Tfidf

In [None]:
# Same identity tokenizer/preprocessor and n-gram range as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df = 1, preprocessor=do_nothing, tokenizer=do_nothing) #min_df=1 default, test!
# Fitted on the training rows only and then applied to the test rows.
TRAIN_VECTORS_TFIDF = tfidf_vectorizer.fit_transform(X)
TEST_VECTORS_TFIDF = tfidf_vectorizer.transform(DATA.loc[_test, "tokens"])

In [None]:
#TRAIN_VECTORS_TFIDF.todense().shape

## Stack

In [None]:
# Place the count features and the TF-IDF features side by side (column-wise) for each split.
# NOTE: this cell overwrites its own inputs. Re-running it without first re-running the
# CountVectorizer cell stacks the TF-IDF block a second time.
TRAIN_VECTORS = scipy.sparse.hstack([TRAIN_VECTORS, TRAIN_VECTORS_TFIDF])
TEST_VECTORS = scipy.sparse.hstack([TEST_VECTORS, TEST_VECTORS_TFIDF])

---

# MODELS

### RNN

In [None]:
def plot_loss(loss,val_loss):
    """Plot two per-epoch loss curves in a new figure and show it.

    `loss` is drawn first and labelled 'Train'; `val_loss` is drawn second and
    labelled 'Test'.
    """
    fig, ax = plt.subplots()
    for curve in (loss, val_loss):
        ax.plot(curve)
    ax.set_title('Model loss')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper right')
    plt.show()

def plot_accuracy(acc,val_acc):
    """Plot two per-epoch accuracy curves in a new figure and show it.

    `acc` is drawn first and labelled 'Train'; `val_acc` is drawn second and
    labelled 'Test'.
    """
    fig, ax = plt.subplots()
    for curve in (acc, val_acc):
        ax.plot(curve)
    ax.set_title('Model accuracy')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [None]:
def create_LSTM_model():
    """Build (without compiling) the two-input Keras functional model.

    Inputs
    ------
    nlp_input : shape (maxLen,)
        Padded token-id sequence of one tweet.
    feature_input : shape (len(numeric) + len(categorical),)
        Engineered numeric and indicator features of the same tweet.

    The text branch runs: frozen embedding -> spatial dropout -> Conv1D ->
    batch norm -> max pooling -> dropout -> Dense -> dropout -> bidirectional
    LSTM -> GRU. The GRU output is concatenated with `feature_input` and passed
    through batch norm and two Dense layers to a single sigmoid unit.

    Reads the module-level names `maxLen`, `numeric`, `categorical`,
    `embedding_matrix` and `RANDOM_SEED`.

    Returns
    -------
    keras.Model
        Model named "RNN-LSTM" with inputs [nlp_input, feature_input].
    """
    nlp_input = Input(shape = (maxLen,), name = 'nlp_input')
    feature_input = Input(shape = (len(numeric)+len(categorical),), name = "feature_input")
    
    # Embedding weights start as `embedding_matrix` and are not trained (trainable=False).
    emb = Embedding(input_dim=embedding_matrix.shape[0],
                        output_dim = embedding_matrix.shape[1], 
                        input_length = maxLen, 
                        embeddings_initializer=Constant(embedding_matrix), 
                        name= "Embedding", 
                        trainable=False)(nlp_input)
    SPD1 = SpatialDropout1D(0.2, seed = RANDOM_SEED, name = "SP_drop_1")(emb)
    Conv1 = Conv1D(32, kernel_size= 3, padding="same", name = "Conv1D_1")(SPD1)
    BN1 = BatchNormalization(name = "BN_1")(Conv1)
    MP1 = MaxPooling1D(2, name = 'MP_1')(BN1)
    drop1 = Dropout(0.25, seed = RANDOM_SEED, name = "drop1")(MP1)
    D1 = Dense(embedding_matrix.shape[1], activation='relu', name="Dense1")(drop1)
    drop2 = Dropout(0.25, seed = RANDOM_SEED, name = "drop2")(D1)
    # The LSTM returns the full sequence so the GRU can consume it; the GRU returns only its last output.
    LSTM1 = Bidirectional(LSTM(64, return_sequences = True, dropout=0.1, recurrent_dropout=0.1, kernel_initializer = 'orthogonal', name = "LSTM1"))(drop2)
    GRU1 = GRU(64, return_sequences = False, dropout=0.1, name = "GRU1")(LSTM1)
    # Merge the text representation with the engineered features.
    concatenate = Concatenate()([GRU1, feature_input])
    BN2 = BatchNormalization(name = "BN_2")(concatenate)
    D2 = Dense(64, activation='relu', kernel_initializer = 'he_normal', name="Dense2")(BN2)
    # NOTE: the variable is `drop3` but the layer itself is named "drop4".
    drop3 = Dropout(0.2, seed = RANDOM_SEED, name = "drop4")(D2)
    D3 = Dense(16, activation='relu', kernel_initializer = 'he_normal', name="Dense3")(drop3)
    out = Dense(1, activation="sigmoid", name= "output")(D3)

    model = Model(inputs=[nlp_input, feature_input], outputs = out, name = "RNN-LSTM")
    return model
    
# Build the model once and print its layer summary.
RNN_MODEL = create_LSTM_model()
RNN_MODEL.summary()

In [None]:
# Render the layer graph of the model, including tensor shapes.
tf.keras.utils.plot_model(RNN_MODEL, show_shapes=True)

In [None]:
%%time
# Binary cross-entropy with Adam; binary accuracy is the only tracked metric.
RNN_MODEL.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, epsilon=1e-03), metrics=['binary_accuracy'])
# Stop once validation accuracy has not improved for 25 epochs.
early_stopping_monitor = EarlyStopping(patience=25, monitor='val_binary_accuracy')
# Write "weights.hdf5" only when validation accuracy improves, so the file holds the best epoch.
checkpoint = ModelCheckpoint("weights.hdf5", monitor = 'val_binary_accuracy', save_best_only = True)
# 30 % of the training rows are held out for validation.
# NOTE(review): Keras appears to take `validation_split` from the END of the arrays before
# shuffling, so the hold-out is not a random sample if train.csv is ordered -- confirm.
RNN_MODEL.fit([X_PAD, train_features],TARGET, callbacks=[checkpoint, early_stopping_monitor], epochs=300, batch_size=64, verbose=0, validation_split=0.3);

In [None]:
# Restore the checkpointed weights before scoring.
RNN_MODEL.load_weights("weights.hdf5")
# Per-epoch curves recorded by the fit call.
fit_log = RNN_MODEL.history.history
plot_loss(fit_log['loss'], fit_log['val_loss'])
plot_accuracy(fit_log['binary_accuracy'], fit_log['val_binary_accuracy'])

# evaluate() returns [loss, binary_accuracy]; only the accuracy is kept.
# The score is computed on all training rows, including those held out for validation.
_, RNN_MODEL_SCORE = RNN_MODEL.evaluate([X_PAD, train_features],TARGET)
print('Train accuracy: {:.2f} %'.format(RNN_MODEL_SCORE*100))

In [None]:
class DNN_wrapper:
    """Give a Keras binary classifier the `predict` / `predict_proba` methods of a scikit-learn classifier.

    The wrapped model's own `predict` must return one probability per sample
    in an array of shape (n_samples, 1).
    """

    def __init__(self, model):
        # model: any object with a Keras-style `predict(inputs)` method.
        self.model = model

    def predict(self, df):
        """Return hard 0/1 labels as a 1-D integer array.

        Probabilities are rounded with `np.rint` (round half to even).
        """
        pred = np.rint(self.model.predict(df))[:,0]
        # `np.int` was only an alias of the builtin `int` and was removed in NumPy 1.24.
        return pred.astype(int)

    def predict_proba(self, df):
        """Return an (n_samples, 2) array with columns [P(class 0), P(class 1)]."""
        probs = np.asarray(self.model.predict(df))
        return np.concatenate((1 - probs, probs), axis=1)
    
# Wrapped network, so it can be asked for 0/1 labels like the scikit-learn models.
NN_MODEL = DNN_wrapper(RNN_MODEL)

## RidgeClassifier

In [None]:
# %%time
# clf = RidgeClassifier(max_iter=None, normalize=False, solver='auto', tol=0.001, random_state = RANDOM_SEED)
# param_grid = {'alpha': np.logspace(-4, 4, 10)}
# rc_grid = GridSearchCV(estimator = clf, param_grid=param_grid, cv=4, scoring= "f1", )
# rc_grid.fit(TRAIN_VECTORS,TARGET)
# print(rc_grid.best_params_)
# print(rc_grid.best_estimator_)
# RC_score = rc_grid.best_score_
# print(RC_score)

In [None]:
#RIDGE_MODEL = RidgeClassifier(alpha=21.54434690031882, random_state=RANDOM_SEED)
# Use the notebook-wide seed constant instead of repeating its literal value (13).
RIDGE_MODEL = RidgeClassifier(alpha=0.005994842503189409, random_state=RANDOM_SEED)

RIDGE_MODEL.fit(TRAIN_VECTORS,TARGET)
# Accuracy on the data the model was fitted on: a training score, not a validation score.
RIDGE_MODEL_SCORE = RIDGE_MODEL.score(TRAIN_VECTORS,TARGET)
RIDGE_MODEL_SCORE

## Naive Bayes

In [None]:
# %%time
# clf = MultinomialNB()
# param_grid = {'alpha': np.logspace(-4, 1, 10)}
# grid = GridSearchCV(estimator = clf, param_grid=param_grid, cv=4, scoring= "f1", )
# grid.fit(TRAIN_VECTORS,TARGET)
# print(grid.best_params_)
# print(grid.best_estimator_)
# score = grid.best_score_
# print(RC_score)

In [None]:
#NB_MODEL = MultinomialNB(alpha=0.7742636826811278)
# fit() returns the estimator itself, so construction and fitting can be chained.
NB_MODEL = MultinomialNB(alpha=2.782559402207126).fit(TRAIN_VECTORS,TARGET)
# Accuracy on the data the model was fitted on: a training score, not a validation score.
NB_MODEL_SCORE = NB_MODEL.score(TRAIN_VECTORS,TARGET)
NB_MODEL_SCORE

### Random Forest

In [None]:
# %%time
# clf = RandomForestClassifier(random_state = RANDOM_SEED, n_jobs=-1)
# #param_grid = {'n_estimators': [50,75,150,300, 500],'max_depth': [*range(3,13), None], 'max_features': [*np.arange(0.5,1.0,0.1),'auto','sqrt',"log2"],}
# param_grid = {'n_estimators': [50, 100, 150],'max_depth': [4, 10, None], 'max_features': ['auto']}
# param_dist = {'n_estimators': range(50,150),'max_depth': [*range(3,13), None], 'max_features': [*np.arange(0.5,1.0,0.1),'auto','sqrt',"log2"]}
# #grid = GridSearchCV(clf, param_grid, cv=4, scoring= "accuracy")
# #grid = GridSearchCV(clf, param_grid, cv=4, scoring= "f1")
# grid = RandomizedSearchCV(clf, param_distributions = param_dist, cv=4, scoring= "f1")
# grid.fit(TRAIN_VECTORS,TARGET)
# print(grid.best_params_)
# print(grid.best_estimator_)
# score = grid.best_score_
# print(score)

In [None]:
%%time
#RF_MODEL = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_SEED)
# Use the notebook-wide seed constant instead of repeating its literal value (13).
RF_MODEL = RandomForestClassifier(n_estimators=131, n_jobs=-1, random_state=RANDOM_SEED)
#RF_MODEL = RandomForestClassifier(max_depth=4, max_features=0.7999999999999999, n_estimators=65, n_jobs=-1, random_state=RANDOM_SEED)
#RF_MODEL = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=13)
RF_MODEL.fit(TRAIN_VECTORS,TARGET)
# Accuracy on the data the model was fitted on: a training score, not a validation score.
RF_MODEL_SCORE = RF_MODEL.score(TRAIN_VECTORS,TARGET)
RF_MODEL_SCORE

---

# Prediction review

In [None]:
# Three parallel lists: fitted scikit-learn models, their display names and their training scores.
# The names are reused as column names of the prediction tables further down.
models = [RIDGE_MODEL,NB_MODEL, RF_MODEL]
model_name = ["Ridge", "NaiveBayes", "RF"]
train_scores = [RIDGE_MODEL_SCORE, NB_MODEL_SCORE, RF_MODEL_SCORE]

### Classification report

In [None]:
# One column per model is added by `metrics`; the rows are the four scores.
SCORES = pd.DataFrame(index = ['F1','Precision','Recall','Accuracy'])

# NOTE: this definition shadows the `metrics` module imported from keras at the top of the notebook.
def metrics(pred_tag, y_test, name):
    """Print a classification summary for one model and record it in `SCORES`.

    Parameters
    ----------
    pred_tag : array-like
        Predicted labels.
    y_test : array-like
        True labels, in the same order as `pred_tag`.
    name : str
        Heading for the printed report and column name in `SCORES`.

    Side effects: prints the four scores and the classification report, and
    writes the scores into the module-level `SCORES` frame.
    """
    w = 53  # width of the separator lines
    # scikit-learn scorers expect (y_true, y_pred). The previous version passed
    # (pred_tag, y_test), which swapped precision and recall. Every score is
    # now computed once and reused for printing and for SCORES.
    scores = {
        'F1': f1_score(y_test, pred_tag),
        'Precision': precision_score(y_test, pred_tag),
        'Recall': recall_score(y_test, pred_tag),
        'Accuracy': accuracy_score(y_test, pred_tag),
    }
    print("\n")
    print("="*w)
    print(name)
    print("="*w)
    print("F1-score: ", scores['F1'])
    print("Precision: ", scores['Precision'])
    print("Recall: ", scores['Recall'])
    print("Acuracy: ", scores['Accuracy'])
    print("-"*w)

    print(classification_report(y_test, pred_tag))
    for row, value in scores.items():
        SCORES.loc[row, name] = value
    

In [None]:
%%time
# Training-set report for each scikit-learn model ...
for name, fitted in zip(model_name, models):
    metrics(fitted.predict(TRAIN_VECTORS), TARGET, name)

# ... and for the wrapped network, which takes the padded sequences plus the feature frame.
metrics(NN_MODEL.predict([X_PAD, train_features]), TARGET, "RNN")

In [None]:
SCORES

---

### Confusion matrices

In [None]:
%%time
N_cols = 5
col_width = 6
n_plots = len(models) + 1         # the scikit-learn models plus the RNN
# Ceiling division. The previous `round(x + 0.49)` only approximates a ceiling.
N_rows = -(-n_plots // N_cols)
fig, axs = plt.subplots(nrows = N_rows, ncols=N_cols, figsize=(col_width * N_cols, N_rows * col_width))
# Flat list of axes, whether the grid has one row or several.
axes = np.atleast_1d(axs).ravel()
# Training-set confusion matrices, normalised over the predicted label.
for i,(m,ax) in enumerate(zip(models, axes)):
    cm = confusion_matrix(TARGET, m.predict(TRAIN_VECTORS), normalize = 'pred', labels = m.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=m.classes_)
    disp.plot(ax=ax)
    ax.set_title(model_name[i])

# The RNN goes on the axes right after the last scikit-learn model. The previous
# `axs[-1]` left a gap in a single-row grid and is a whole row of axes once the
# grid has more than one row.
rnn_ax = axes[len(models)]
disp = ConfusionMatrixDisplay(confusion_matrix(TARGET, NN_MODEL.predict([X_PAD, train_features]), normalize = 'pred'));
disp.plot(ax=rnn_ax);
_ = rnn_ax.set_title("RNN");
# Hide the grid cells that were not used.
for spare_ax in axes[n_plots:]:
    spare_ax.set_visible(False)

## Voting classifier (VC) selection

In [None]:
#selected_models = ["Ridge", 'NaiveBayes', "RF","NN"]
# Columns of the prediction tables whose 0/1 votes are averaged by the voting classifier.
selected_models = ["Ridge", 'NaiveBayes', "NN"]
# A row is labelled 1 when its mean vote is strictly greater than this value.
TRESHOLD = 0.49

## Train predictions

In [None]:
# One row per training tweet: id, true target, every model's 0/1 prediction and the vote.
ALL_TRAIN = DATA.loc[_train, ['id', 'target']].copy()
for name, fitted in zip(model_name, models):
    ALL_TRAIN[name] = fitted.predict(TRAIN_VECTORS)

ALL_TRAIN['NN'] = NN_MODEL.predict([X_PAD, train_features])
# Mean of the selected models' votes, then a hard label via the threshold.
ALL_TRAIN['Voting'] = ALL_TRAIN[selected_models].mean(axis=1)
ALL_TRAIN['VC'] = (ALL_TRAIN['Voting'] > TRESHOLD).astype(int)
ALL_TRAIN.head(15)

In [None]:
#right = ALL_TRAIN.target == ALL_TRAIN.NN
# True where the voting classifier agrees with the true label (training rows only).
right = ALL_TRAIN.target == ALL_TRAIN.VC
# `right` covers only the training rows while DATA also holds the test rows. The wrong rows
# are therefore selected by row label instead of combining two boolean Series of different
# length, which relied on implicit index alignment.
DATA.loc[right.index[~right], ['target','text','tokens','OriginalText']].head(10)

## Confusion Matrix of voting classifier

In [None]:
# Training-set confusion matrix of the voting classifier, normalised over the predicted label.
ConfusionMatrixDisplay(confusion_matrix(TARGET, ALL_TRAIN.VC, normalize = 'pred')).plot();

## Test predictions

In [None]:
# One row per test tweet: id, every model's 0/1 prediction and the vote.
ALL_TEST = DATA.loc[_test, ['id']].copy()
for name, fitted in zip(model_name, models):
    ALL_TEST[name] = fitted.predict(TEST_VECTORS)

ALL_TEST['NN'] = NN_MODEL.predict([TEST_PAD, test_features])
# Mean of the selected models' votes, then a hard label via the threshold.
ALL_TEST['Voting'] = ALL_TEST[selected_models].mean(axis=1)
ALL_TEST['VC'] = (ALL_TEST['Voting'] > TRESHOLD).astype(int)
ALL_TEST.head(10)

In [None]:
# `_id` is a row label of DATA (looked up among the test rows), not the tweet's `id` column.
_id = 7618
test_row = DATA[_test].loc[_id]
# Cleaned text, processed tokens and untouched text of that row, in this order.
for column in ('text', 'tokens', 'OriginalText'):
    print(test_row[column])

# Submission

In [None]:
#output = pd.DataFrame({"id": ALL_TEST.id, "target": ALL_TEST.Ridge}) #0.80539
#output = pd.DataFrame({"id": ALL_TEST.id, "target": ALL_TEST.NaiveBayes}) #0.80539
#output = pd.DataFrame({"id": ALL_TEST.id, "target": ALL_TEST.RF}) #0.79190
#output = pd.DataFrame({"id": ALL_TEST.id, "target": ALL_TEST.NN}) #0.81152
# Submission frame: tweet id plus the voting classifier's label, exposed as "target".
output = ALL_TEST[['id', 'VC']].rename(columns={'VC': 'target'}) #0.81428
output.head(10)

In [None]:
# Write the submission file without the DataFrame index, leaving only the id and target columns.
output.to_csv('submission.csv', index=False)
print("Submission was successfully saved!")

In [None]:
# Wall-clock time since `start_time` was taken in the first code cell.
end_time = time.time()
elapsed_seconds = end_time - start_time
print("Notebook run time: {:.1f} seconds. Finished at {}".format(elapsed_seconds, datetime.now()) )