## Import libraries

In [None]:
!pip install tweet-preprocessor

In [None]:
import os
from google.cloud import storage, automl_v1beta1 as automl
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from sklearn.naive_bayes import GaussianNB, MultinomialNB

import seaborn as sns
import matplotlib.pyplot as plt 

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.stem import WordNetLemmatizer
import nltk
from collections import Counter
import string
import re
from nltk.corpus import wordnet as wn
from statistics import mean 
import preprocessor 

In [None]:
from sklearn import metrics

## Common libraries

This section cleans the tweet text using the tweet-preprocessor library.

In [None]:
# Tokens to drop: wordcloud's STOPWORDS plus the 'FAV' and 'RT' markers.
stop = set(STOPWORDS) | {'FAV', 'RT'}
lemma = WordNetLemmatizer()
# Restrict tweet-preprocessor to the URL, MENTION, NUMBER and RESERVED options.
preprocessor.set_options(
    preprocessor.OPT.URL,
    preprocessor.OPT.MENTION,
    preprocessor.OPT.NUMBER,
    preprocessor.OPT.RESERVED,
)

def clean(text):
    """Return a cleaned, lemmatized version of one tweet.

    Steps, in order:
      1. ``preprocessor.clean`` with the options configured at module level.
      2. Remove every character that is neither a word character nor whitespace.
      3. Split on single spaces and drop tokens found in ``stop`` (the
         comparison is case-sensitive).
      4. Re-split on any whitespace and lemmatize each token with ``lemma``.

    Returns the remaining tokens joined by single spaces.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', preprocessor.clean(text))
    kept_tokens = [token for token in without_punctuation.split(' ') if token not in stop]
    # Join and re-split so tokens that still contain newlines/tabs are broken up
    # before lemmatization, exactly as a plain whitespace split would do.
    return " ".join(lemma.lemmatize(token) for token in " ".join(kept_tokens).split())

## Load Data

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv("/kaggle/input/nlp-getting-started/{}.csv".format(split_name))
    for split_name in ("train", "test")
)

## Clean data

In [None]:
# Overwrite the raw tweet text in both frames with its cleaned form.
for frame in (train_df, test_df):
    frame["text"] = frame["text"].apply(clean)

In [None]:
# Peek at the cleaned training rows (head() defaults to the first five).
train_df.head()

In [None]:
# Peek at the cleaned test rows (head() defaults to the first five).
test_df.head()

## AutoML Model

In [None]:
# Google Cloud project id passed to the storage and AutoML clients -- set your own here.
PROJECT_ID = "automl-kaggle-263107"

In [None]:
# Replace with a new bucket name. Note: bucket names must be globally unique.
BUCKET_NAME = "automl-disaster-tweet-cleaned"
# Note: the bucket region must be us-central1.
BUCKET_REGION = "us-central1"

In [None]:
# Cloud Storage client for the project, plus the AutoML wrapper around the bucket.
storage_client = storage.Client(project=PROJECT_ID)
tables_gcs_client = automl.GcsClient(client=storage_client, bucket_name=BUCKET_NAME)
automl_client = automl.AutoMlClient()
# Note: AutoML Tables currently is only eligible for region us-central1.
prediction_client = automl.PredictionServiceClient()
# Note: the call below does not succeed unless every one of these parameters is given.
tables_client = automl.TablesClient(
    project=PROJECT_ID,
    region=BUCKET_REGION,
    client=automl_client,
    gcs_client=tables_gcs_client,
    prediction_client=prediction_client,
)

In [None]:
# Make sure the GCS bucket exists: create it in BUCKET_REGION only when it is missing.
bucket = storage.Bucket(storage_client, name=BUCKET_NAME)
bucket_already_exists = bucket.exists()
if not bucket_already_exists:
    bucket.create(location=BUCKET_REGION)

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket and print a confirmation.

    Uses the module-level ``storage_client``.
    See https://cloud.google.com/storage/docs/

    Args:
        bucket_name: name of the bucket to upload into.
        source_file_name: path of the local file to upload.
        destination_blob_name: name of the blob to write inside the bucket.
    """
    target_blob = storage_client.get_bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    message = 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)
    print(message)
    
def download_to_kaggle(bucket_name,destination_directory,file_name,prefix=None):
    """Copy blobs from a GCS bucket into a local directory of the Kaggle notebook.

    Uses the module-level ``storage_client``. The destination directory is
    created if needed.

    NOTE: every blob listed under ``prefix`` is written to the same local path
    (``destination_directory/file_name``), so when several blobs match, each
    download overwrites the previous one and only the last listed blob remains.

    Args:
        bucket_name: bucket to read from.
        destination_directory: local directory to write into.
        file_name: local file name used for the downloaded content.
        prefix: optional blob-name prefix used to filter the listing.
    """
    os.makedirs(destination_directory, exist_ok=True)
    target_path = os.path.join(destination_directory, file_name)
    for remote_blob in storage_client.list_blobs(bucket_name, prefix=prefix):
        remote_blob.download_to_filename(target_path)

In [None]:
# Show the first five test rows again before exporting them.
test_df.head()

In [None]:
# Write only the columns sent to AutoML: id and text for both splits, plus the target for training.
train_df.to_csv('/kaggle/working/train.csv', columns=['id', 'text', 'target'], index=False)
test_df.to_csv('/kaggle/working/test.csv', columns=['id', 'text'], index=False)

In [None]:
# Push both exported CSVs to the bucket under their own file names.
for csv_name in ('train.csv', 'test.csv'):
    upload_blob(BUCKET_NAME, '/kaggle/working/' + csv_name, csv_name)

In [None]:
# Reuse the AutoML dataset with this display name when it can be fetched;
# otherwise create it and remember that its data still has to be imported.
dataset_display_name = 'tweet_disaster_cleaned'
new_dataset = False
try:
    dataset = tables_client.get_dataset(dataset_display_name=dataset_display_name)
except Exception as lookup_error:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit are not mistaken for "dataset missing". Any other lookup
    # failure still falls back to creating the dataset, as before.
    print('Dataset lookup failed ({}); creating a new dataset.'.format(lookup_error))
    new_dataset = True
    dataset = tables_client.create_dataset(dataset_display_name)

In [None]:
# Import the training CSV only when the dataset was created in this run.
if new_dataset:
    # GCS URIs follow the pattern gs://BUCKET_NAME/file
    gcs_input_uris = ['gs://{}/train.csv'.format(BUCKET_NAME)]
    import_data_operation = tables_client.import_data(dataset=dataset, gcs_input_uris=gcs_input_uris)
    print('Dataset import operation: {}'.format(import_data_operation))

    # Synchronous wait: block here until the import has finished.
    import_data_operation.result()

In [None]:
# Print the dataset object obtained above (fetched or newly created).
print(dataset)

In [None]:
# Name of the row-identifier column in the uploaded CSVs.
ID_COLUMN = "id"

In [None]:
# Name of the label column; register it as the dataset's target column.
TARGET_COLUMN = "target"

tables_client.set_target_column(dataset=dataset, column_spec_display_name=TARGET_COLUMN)

In [None]:
# Make all columns nullable (except the Target and ID Column).
# Column names are compared exactly: a substring test would also skip any
# unrelated column whose name merely contains 'id' or 'target'.
protected_columns = {TARGET_COLUMN, ID_COLUMN}
for col in tables_client.list_column_specs(PROJECT_ID,BUCKET_REGION,dataset.name):
    if col.display_name in protected_columns:
        continue
    # Keep the column's existing type code; only the nullable flag is changed.
    tables_client.update_column_spec(PROJECT_ID,
                                     BUCKET_REGION,
                                     dataset.name,
                                     column_spec_display_name=col.display_name,
                                     type_code=col.data_type.type_code,
                                     nullable=True)

In [None]:
# Train the model. This will take hours (up to your budget). AutoML will early stop if it finds an optimal solution before your budget.
# On this dataset, AutoML usually stops around 2000 milli-hours (2 hours)

TRAIN_BUDGET = 1000 # (specified in milli-hours, from 1000-72000)
model = None
model_display_name = 'tweet_disaster_model_clean'
try:
    # Reuse an already trained model with this display name when it can be fetched.
    model = tables_client.get_model(model_display_name=model_display_name)
except Exception as lookup_error:
    # `except Exception` (not a bare `except`) so that interrupting the lookup
    # (KeyboardInterrupt / SystemExit) does not start a billed training run.
    print('Model lookup failed ({}); training a new model.'.format(lookup_error))
    response = tables_client.create_model(
        model_display_name,
        dataset=dataset,
        train_budget_milli_node_hours=TRAIN_BUDGET,
        # The label and the row id are not used as input features.
        exclude_column_spec_names=[TARGET_COLUMN,ID_COLUMN]
    )
    print('Create model operation: {}'.format(response.operation))
    # Wait until model training is done.
    model = response.result()
print(model)

In [None]:
# Score the uploaded test CSV and write results under the bucket's predictions/ prefix.
gcs_input_uris = 'gs://{}/test.csv'.format(BUCKET_NAME)
gcs_output_uri_prefix = 'gs://{}/predictions'.format(BUCKET_NAME)

batch_predict_response = tables_client.batch_predict(model=model, gcs_input_uris=gcs_input_uris, gcs_output_uri_prefix=gcs_output_uri_prefix)
print('Batch prediction operation: {}'.format(batch_predict_response.operation))
# Block until the batch prediction has finished, then display the operation metadata.
batch_predict_result = batch_predict_response.result()
batch_predict_response.metadata

In [None]:
# The batch_predict operation metadata holds the output location under
# metadata --> batch_predict_details --> output_info --> gcs_output_directory.
# Removing the leading 'gs://<bucket>/' leaves the folder path inside the bucket,
# which is used as the blob prefix for the download.
bucket_uri_prefix = 'gs://' + BUCKET_NAME + '/'
output_info = batch_predict_response.metadata.batch_predict_details.output_info
gcs_output_folder = output_info.gcs_output_directory.replace(bucket_uri_prefix, '')
download_to_kaggle(BUCKET_NAME, '/kaggle/working', 'submissions.csv', prefix=gcs_output_folder)

In [None]:
# Load the downloaded predictions, order them by id, and turn the class-1
# score into a 0/1 label using a 0.5 threshold.
preds_df = (
    pd.read_csv("/kaggle/working/submissions.csv")
    .sort_values(by=['id'])
    .assign(target=lambda scored: (scored['target_1_score'] >= 0.5).astype(int))
)

In [None]:
# Inspect the first 50 scored rows.
preds_df.head(n=50)

In [None]:
# Write the AutoML submission: id and predicted target only.
preds_df.to_csv("submission.csv", columns=['id', 'target'], index=False)

## Simple Classifier Models

### Building vectors

The theory behind the model we'll build in this notebook is pretty simple: the words contained in each tweet are a good indicator of whether they're about a real disaster or not (this is not entirely correct, but it's a great place to start).

We'll use scikit-learn's `TfidfVectorizer` to turn the words (and two-word phrases) in each tweet into TF-IDF weights, i.e. data our machine learning model can process.

Note: a `vector` is, in this context, a set of numbers that a machine learning model can work with. We'll look at one in just a second.

In [None]:
# TF-IDF over unigrams and bigrams, with English stop words removed and accents stripped.
tfidf_vectorizer = feature_extraction.text.TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
)

In [None]:
# Learn the vocabulary and IDF weights from the training tweets and vectorize them.
train_vectors = tfidf_vectorizer.fit_transform(train_df.text)

# Deliberately .transform() rather than .fit_transform(): the test tweets are
# mapped onto the tokens learned from the training data, so train and test
# vectors share the same set of tokens.
test_vectors = tfidf_vectorizer.transform(test_df.text)

In [None]:
# The sparse matrix exposes .shape directly; calling .todense() first would
# allocate the full dense matrix just to read its dimensions.
train_vectors.shape

### Random Forest classifier for non-text data

In [None]:
# Predictors are the two non-text columns; the label is `target`.
feature_cols = ['keyword', 'location']
X = train_df.loc[:, feature_cols]
y = train_df['target']
# One-hot encode the predictors before fitting the forest.
one_hot_encoded_training_predictors = pd.get_dummies(X)
clf = RandomForestClassifier(n_estimators=100)
# 5-fold cross-validated F1 scores.
scores = model_selection.cross_val_score(
    clf,
    one_hot_encoded_training_predictors,
    y,
    cv=5,
    scoring="f1",
)
scores

In [None]:
# Fit the forest on all one-hot encoded training rows.
clf.fit(X=one_hot_encoded_training_predictors, y=y)

### Ridge and SVM classifier for text data

As we mentioned above, we think the words contained in each tweet are a good indicator of whether they're about a real disaster or not. The presence of a particular word (or set of words) in a tweet might link directly to whether or not that tweet is about a real disaster.

What we're assuming here is a _linear_ connection. So let's build a linear model and see!

In [None]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()
# Alternatives tried by the author (left commented out), with their notes:
# clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
#     max_iter=-1, probability=False, random_state=None, shrinking=True,
#     tol=0.001, verbose=False)
#clf = linear_model.LogisticRegression() #same as ridge
#clf = DecisionTreeClassifier() #bad performance
#clf=RandomForestClassifier(n_estimators = 100) #bad performance

In [None]:
# Estimate how well the model does with cross-validation: train on a portion of
# the labelled data, validate on the rest, and repeat with different portions
# to get a good idea of how the model performs.

# The competition metric is F1, so that is what we score here (10 folds).
scores = model_selection.cross_val_score(
    clf,
    train_vectors,
    train_df.target,
    cv=10,
    scoring="f1",
)
scores

In [None]:
# Fit the classifier on every training tweet.
clf.fit(X=train_vectors, y=train_df["target"])

#### SVM

In [None]:
# Grid-search the RBF kernel's gamma with 5-fold CV scored by F1, using all cores.
# Author's note: SVM is slightly better than ridge.
parameters = {'gamma': [0.7, 1, 'auto', 'scale']}
clf = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, n_jobs=-1, scoring="f1")
clf = clf.fit(train_vectors, train_df["target"])

In [None]:
# Show the SVC configuration selected by the grid search.
clf.best_estimator_

In [None]:
# Cross-validated score (scoring="f1") of the selected configuration.
clf.best_score_

Let's do predictions on our test set and build a submission for the competition.

In [None]:
# Fill the sample submission's target column with predictions for the test vectors.
sample_submission = (
    pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
    .assign(target=clf.predict(test_vectors))
)
# Cleaned test text next to its prediction, for manual inspection.
df = pd.DataFrame({'text': test_df.text, 'prediction': sample_submission.target})

In [None]:
# Save the predictions from the cell above under their own file name.
sample_submission.to_csv("submission1.csv", index=False)

## Tensorflow model

In [None]:
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hold out 30% of the labelled tweets for validation (fixed seed for repeatability).
X, y = train_df["text"], train_df["target"]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Tokenizer, padding and embedding settings used by the Keras cells below.
vocab_size, embedding_dim = 10000, 16
max_length = 30  # based on data exploration
trunc_type = padding_type = 'post'
oov_tok = "<OOV>"

In [None]:
# Fit the tokenizer on the training split only, then encode and pad both splits
# with the same settings.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

sequences = tokenizer.texts_to_sequences(X_train)
padded = pad_sequences(sequences, **pad_options)

# Validation split, encoded with the vocabulary learned from the training split.
testing_sequences = tokenizer.texts_to_sequences(X_val)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Inverse of the tokenizer's vocabulary: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_sentence(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids missing from ``reverse_word_index`` are rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(token_id, '?') for token_id in text)

print(decode_sentence(padded[0]))

In [None]:
# First training tweet as text, for comparison with the decoded sequence.
print(X_train.iloc[0])

In [None]:
# Number of entries in the tokenizer's word index.
len(word_index)

#### Embedding model

In [None]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
vocab_size=len(word_index)
embedding_dim = 100
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

In [None]:
# Two stacked bidirectional LSTMs on top of a trainable embedding, with a
# sigmoid output for the binary label.
stacked_lstm_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(stacked_lstm_layers)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)
model.summary()

In [None]:
# Active architecture: word embedding with global average pooling.
pooled_embedding_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(pooled_embedding_layers)

# Alternative layer stacks tried by the author, kept for reference.
# To use one, put its layers in the list above.
#
# Embedding + Flatten (each input row holds max_length token ids)
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(6, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
#
# LSTM
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#     tf.keras.layers.Dense(24, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
#
# Multi Layer LSTM - Best performing
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
#
# Glove embedding, Drop out etc
#     tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
#     tf.keras.layers.Dropout(0.2),
#     tf.keras.layers.Conv1D(64, 5, activation='relu'),
#     tf.keras.layers.MaxPooling1D(pool_size=4),
#     tf.keras.layers.LSTM(64),
#     tf.keras.layers.Dense(1, activation='sigmoid')
#
# GRU
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
#     tf.keras.layers.Dense(6, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
#
# ConvD
#     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
#     tf.keras.layers.Conv1D(128, 5, activation='relu'),
#     tf.keras.layers.GlobalAveragePooling1D(),
#     tf.keras.layers.Dense(6, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')

#model.compile(optimizer='adam', loss=f1_loss, metrics=['accuracy', f1])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.AUC()],
)
model.summary()

In [None]:
# Train for a few epochs, evaluating on the held-out validation split after each one.
num_epochs = 3
history = model.fit(
    padded,
    y_train,
    epochs=num_epochs,
    validation_data=(testing_padded, y_val),
)

In [None]:
# Per-epoch training and validation metrics as a DataFrame (one row per epoch).
training_log = model.history.history
model_loss = pd.DataFrame(training_log)
model_loss.head()

In [None]:
#model_loss[['accuracy','val_accuracy']].plot(ylim=[0,1]);
# Keras numbers metric instances within a kernel session ('auc', 'auc_1', ...),
# so a hard-coded suffix such as 'auc_9' breaks on a fresh Run All. Pick the
# AUC columns (training and validation) by name instead.
auc_columns = [column for column in model_loss.columns if 'auc' in column]
model_loss[auc_columns].plot(ylim=[0,1]);

In [None]:
# Encode and pad the competition test tweets with the tokenizer fitted on the training split.
testing_sequences2 = tokenizer.texts_to_sequences(test_df["text"])
testing_padded2 = pad_sequences(
    testing_sequences2,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Model outputs for the padded test tweets (the final layer is a single sigmoid unit).
probabilities = model.predict(testing_padded2)

In [None]:
# Threshold the probabilities at 0.5 and flatten to a 1-D array of 0/1 labels.
predictions = (probabilities > 0.5).astype(int).flatten()
# Class balance of the predictions. Series.value_counts replaces the
# deprecated top-level pd.value_counts.
pd.Series(predictions).value_counts()

In [None]:
# Re-read the raw test file so the original tweet text can sit next to its
# cleaned form, the predicted label and the predicted probability.
original_test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
df = pd.DataFrame({
    'text': original_test_df['text'],
    'cleaned_text': test_df['text'],
    'prediction': predictions,
    'probabilities': probabilities.flatten(),
})
df.to_csv("test_df.csv", index=False)

In [None]:
# Inspect rows 50-99 of the prediction table as a raw array.
df.values[50:100]

In [None]:
# Fill the sample submission with the Keras predictions and write it out.
# NOTE(review): this reuses the file name "submission.csv" written by the
# AutoML section, presumably overwriting it; confirm this is intended.
sample_submission = (
    pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
    .assign(target=predictions)
)
sample_submission.to_csv("submission.csv", index=False)

## Tensorflow Hub  - Universal Sentence Encoder + LightGBM

In [None]:
import tensorflow_hub as hub
import lightgbm as lgb
from lightgbm import LGBMClassifier

In [None]:
# Quick demo of a TF Hub text-embedding layer on three sample strings.
module_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.KerasLayer(module_url)
sample_sentences = ["A long sentence.", "single-word", "http://example.com"]
embeddings = embed(sample_sentences)
print(embeddings.shape)  #(3,128)

In [None]:
# Load the Universal Sentence Encoder (v3) from TF Hub; this rebinds `embed` from the NNLM demo above.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/3")

In [None]:
# Embed the cleaned text of every training and test tweet.
X_train_embeddings = embed(train_df["text"].values)
X_test_embeddings = embed(test_df["text"].values)

In [None]:
# Keyword arguments for the LightGBM classifier built below.
params = dict(
    learning_rate=0.04,
    n_estimators=1000,
    colsample_bytree=0.4,
    metric='auc',
)

In [None]:
# LightGBM classifier configured from `params`.
text_clf = LGBMClassifier(**params)

In [None]:
# Fit on the first 5000 embedded tweets and track the metric on two eval sets:
# the same first 5000 rows, and the remaining rows (5000 onward) as a hold-out.
# NOTE(review): passing `verbose` / `early_stopping_rounds` to fit() presumably
# depends on the installed LightGBM version; confirm before upgrading.
text_clf.fit(X_train_embeddings['outputs'][:5000,:], train_df.target.values[:5000],
             eval_set=[(X_train_embeddings['outputs'][:5000,:], train_df.target.values[:5000]),
                       (X_train_embeddings['outputs'][5000:,:], train_df.target.values[5000:])],
             verbose=200, early_stopping_rounds=20,
            )


In [None]:
# Refit on the first 5000 embedded tweets, then predict the remaining ones as a hold-out.
sentence_vectors = X_train_embeddings['outputs']
text_clf.fit(sentence_vectors[:5000, :], train_df["target"].values[:5000])
Y_pred = text_clf.predict(sentence_vectors[5000:])

In [None]:
# Hold-out quality: per-class precision/recall/F1 and the confusion matrix.
holdout_targets = train_df.target[5000:]
print(metrics.classification_report(holdout_targets, Y_pred, digits=3))
print(metrics.confusion_matrix(holdout_targets, Y_pred))

In [None]:
# Refit on all labelled tweets and predict the competition test set.
text_clf.fit(X_train_embeddings['outputs'], train_df["target"].values)
pred_test = text_clf.predict(X_test_embeddings['outputs'])

In [None]:
# Cleaned test text next to its LightGBM prediction; show the first 20 rows.
df = pd.DataFrame({
    'cleaned_text': test_df['text'],
    'prediction': pred_test,
})
df.head(n=20)

In [None]:
# Fill the sample submission with the LightGBM predictions and write the final file.
sample_submission = (
    pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
    .assign(target=pred_test)
)
sample_submission.to_csv("submission.csv", index=False)