In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the training data; the relative path means train.csv must be in the working directory
dataset = pd.read_csv("train.csv")
# Preview the first rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a previously saved
# index — consider index_col=0 if it is not needed; confirm against the CSV first.
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


I will try several combinations of data preparation and models, pick the best-performing one, and use it as the final solution. The combinations are:
* Data with no changes (only tweet text used);
  * Use this data to train the most commonly used sklearn ML models and one DL model:
    * Logistic Regression;
    * K-neighbors;
    * SVC;
    * Decision Tree;
    * Ensemble;
    * Naive Bayes;
    * ANN;
* Data with cleaned-up tweet text (symbols and stop words removed, words stemmed);
  * Use this data to train the most commonly used sklearn ML models and one DL model (the same models as in the previous chapter);
* Data with cleaned-up tweet text and properly formatted "location" and "keyword" columns;
  * Use this data to train the most commonly used sklearn ML models and one DL model (the same models as in the previous chapter);

In [None]:
import re
import nltk
nltk.download("stopwords") # downloads stopwords
from nltk.corpus import stopwords # imports them afterwards
from nltk.stem.porter import PorterStemmer # applies stemming, takes only roots of the words
corpus = [] # cleaned version of every tweet, in the same order as the rows of `dataset`
ps = PorterStemmer()

# Words we want to KEEP even though NLTK lists them as stop words (negations carry meaning)
kept_words = {'not', "don", "don't"}
# NOTE(review): only the first 143 entries of NLTK's English list are used; which words
# that covers depends on the installed stopwords corpus version — confirm the cut-off.
# Filtering (instead of list.remove) does not raise if a kept word is absent from the slice.
all_stopwords = [word for word in stopwords.words('english')[:143] if word not in kept_words]
# Built once: the membership test below runs for every word of every tweet
stopword_lookup = frozenset(all_stopwords)
non_letters = re.compile('[^a-zA-Z]')

# Iterate over the values directly so the loop does not depend on the index being 0..n-1
for tweet in dataset['text']:
  # Replace everything that is not an ASCII letter with a space, lowercase, split on whitespace
  tweet = non_letters.sub(' ', tweet).lower().split()
  # Drop stop words, then reduce the remaining words to their Porter stems
  tweet = [ps.stem(word) for word in tweet if word not in stopword_lookup]
  corpus.append(' '.join(tweet))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder with scikit-learn's default settings
cv = CountVectorizer()

# Learn the vocabulary from the cleaned tweets, then expand the counts into a dense
# array (one row per tweet, one column per vocabulary term)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Labels come straight from the "target" column
y = dataset["target"].values

In [None]:
# Number of columns in the feature matrix, i.e. the vocabulary size learned by `cv`
X.shape[1]

18887

In [None]:
# Peek at the first 20 encoded tweets (the displayed rows are almost entirely zeros)
X[:20]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; the fixed seed keeps the split identical across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import RandomizedSearchCV

# lr_grid = {"C": np.logspace(-4, 4, 40),
#                 "penalty": ['l1', 'l2'],
#                 "solver": ["liblinear"]}

# lr_search = RandomizedSearchCV(estimator=LogisticRegression(),
#                           param_distributions=lr_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# lr_search.fit(X_train, y_train)
# lr_best_params = lr_search.best_params_

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression on the liblinear solver, with a raised iteration cap
lr_params = {
    "penalty": 'l2',
    "solver": 'liblinear',
    "max_iter": 1000,
}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Class predictions of the logistic regression model on the held-out test split
y_pred_lr = lr_model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the logistic regression predictions against the true test labels
f1_score_lr = f1_score(y_test, y_pred_lr)
print(f1_score_lr)

0.7360655737704918


# K-neighbors

In [None]:
# from sklearn.neighbors import KNeighborsClassifier

# knc_grid = {"leaf_size": list(range(1, 5)),
#             "n_neighbors": list(range(1, 5)),
#             "p": [2]}

# knc_search = RandomizedSearchCV(estimator=KNeighborsClassifier(),
#                           param_distributions=knc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# knc_search.fit(X_train, y_train)
# knc_best_params = knc_search.best_params_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with library defaults
# (the repr printed below shows n_neighbors=5, p=2, uniform weights)
knc_model = KNeighborsClassifier()
knc_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Class predictions of the K-neighbours model on the held-out test split
y_pred_knc = knc_model.predict(X_test)

In [None]:
# F1 score of the K-neighbours predictions against the true test labels
f1_score_knc = f1_score(y_test, y_pred_knc)
print(f1_score_knc)

0.42857142857142855


# SVC

In [None]:
# from sklearn.svm import SVC

# svc_grid = {'C': [0.1,1, 10, 100], 
#             'gamma': [1,0.1,0.01,0.001],
#             'kernel': ['rbf', 'poly', 'sigmoid']}

# svc_search = RandomizedSearchCV(estimator=SVC(),
#                           param_distributions=svc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# svc_search.fit(X_train, y_train)
# svc_best_params = svc_search.best_params_

In [None]:
from sklearn.svm import SVC

# Support vector classifier with library defaults
# (the repr printed below shows C=1.0, kernel='rbf', gamma='scale')
svc_model = SVC()
svc_model.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
# Class predictions of the SVC model on the held-out test split
y_pred_svc = svc_model.predict(X_test)

In [None]:
# F1 score of the SVC predictions against the true test labels
f1_score_svc = f1_score(y_test, y_pred_svc)
print(f1_score_svc)

0.7585616438356163


# Decision Tree

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# dtc_grid = {"max_depth": [3, None],
#               "max_features": randint(1, 9),
#               "min_samples_leaf": randint(1, 9),
#               "criterion": ["gini", "entropy"]}

# dtc_search = RandomizedSearchCV(estimator=DecisionTreeClassifier(),
#                           param_distributions=dtc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# dtc_search.fit(X_train, y_train)
# dtc_best_params = dtc_search.best_params_

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because the tree builder
# is randomised; without it the fitted tree and its F1 score change from run to run.
dtc_model = DecisionTreeClassifier(random_state=42)
dtc_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Class predictions of the decision tree on the held-out test split
y_pred_dtc = dtc_model.predict(X_test)

In [None]:
# F1 score of the decision tree predictions against the true test labels
f1_score_dtc = f1_score(y_test, y_pred_dtc)
print(f1_score_dtc)

0.6776470588235294


# Ensemble

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rfc_grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
#             "max_depth": [5, 8, 15, 25, 30],
#             "max_features": ["auto", "sqrt"],
#             "min_samples_split": [2, 5, 10, 15, 100],
#             "min_samples_leaf": [1, 2, 5, 10]}

# rfc_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
#                           param_distributions=rfc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# rfc_search.fit(X_train, y_train)
# rfc_best_params = rfc_search.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) because bootstrapping and
# feature sub-sampling are random; without it the F1 score is not reproducible.
rfc_model = RandomForestClassifier(random_state=42)
rfc_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Class predictions of the random forest on the held-out test split
y_pred_rfc = rfc_model.predict(X_test)

In [None]:
# F1 score of the random forest predictions against the true test labels
f1_score_rfc = f1_score(y_test, y_pred_rfc)
print(f1_score_rfc)

0.7180851063829787


# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library defaults, fitted on the dense count matrix
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Class predictions of the naive Bayes model on the held-out test split
y_pred_gnb = gnb_model.predict(X_test)

In [None]:
# F1 score of the naive Bayes predictions against the true test labels
f1_score_gnb = f1_score(y_test, y_pred_gnb)
print(f1_score_gnb)

0.6321493076459963


# ANN

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
  """Recall of the given tensors: true positives / actual positives.

  Both the element-wise product ``y_true * y_pred`` and ``y_true`` are clipped
  to [0, 1] and rounded before being summed, so the counts are taken over
  0/1 values.
  """
  hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
  actual_mask = K.round(K.clip(y_true, 0, 1))
  # K.epsilon() keeps the ratio finite when there are no positive labels
  return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
  """Precision of the given tensors: true positives / predicted positives.

  Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are clipped
  to [0, 1] and rounded before being summed, so the counts are taken over
  0/1 values.
  """
  hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
  predicted_mask = K.round(K.clip(y_pred, 0, 1))
  # K.epsilon() keeps the ratio finite when nothing is predicted positive
  return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
  """F1 score built from ``precision_m`` and ``recall_m`` (their harmonic mean).

  NOTE(review): when used through ``metrics=[f1_m]`` Keras presumably reports a
  mean of per-batch values, which need not equal the F1 over the whole
  dataset — confirm before comparing with sklearn's ``f1_score``.
  """
  p = precision_m(y_true, y_pred)
  r = recall_m(y_true, y_pred)
  # K.epsilon() guards the division when precision and recall are both zero
  ratio = (p * r) / (p + r + K.epsilon())
  return 2 * ratio

In [None]:
import tensorflow as tf

# Fix TensorFlow's global seed (same value as the train/test split) so that weight
# initialisation and batch shuffling should repeat from run to run; GPU kernels may
# still introduce small numerical differences.
tf.random.set_seed(42)

# Feed-forward binary classifier: two ReLU hidden layers followed by one sigmoid output unit.
# NOTE(review): 1721 hidden units is an unexplained magic number — confirm how it was chosen.
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=1721, activation="relu")) # Rectifier linear unit
ann.add(tf.keras.layers.Dense(units=1721, activation="relu"))
ann.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Binary cross-entropy loss, Adam optimiser, and the custom F1 metric reported during training
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_m])

In [None]:
# Train for 25 epochs in mini-batches of 32. No validation data is passed, so
# over-fitting is not monitored while training.
ann.fit(X_train, y_train, batch_size=32, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7ff016f97e48>

In [None]:
# Raw sigmoid outputs of the network for the held-out test split
y_pred = ann.predict(X_test)

# Turn the outputs into 0/1 labels and score them with the same sklearn f1_score used
# for every other model, so the comparison between models is like-for-like
y_pred_ann = (y_pred > 0.5).astype(int).ravel()
f1_score_ann = f1_score(y_test, y_pred_ann)
print(f1_score_ann)

# Keras' own view: [test loss, f1_m]. The second value comes from the custom Keras
# metric and may differ from the sklearn score printed above.
ann.evaluate(X_test, y_test, verbose=1)



[1.8525800704956055, 0.6956595778465271]

# The best result on this data was achieved by the SVC model, with F1 = 0.758