In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the labelled training tweets from the working directory.
dataset = pd.read_csv("train.csv")
# Bare last expression: renders the first five rows as a rich table.
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Plan

I will try several combinations of data preprocessing and models, compare them, and use the best-performing combination as the final model. The combinations are:
* Data with no changes (only tweet text used);
  * Use this data to build the most commonly used sklearn ML models and one DL model:
    * Logistic Regression;
    * K-neighbors;
    * Decision Tree;
    * Ensemble;
    * ANN;
* Data with cleaned-up tweet text (symbols and stop-words removed, words stemmed);
  * Use this data to build the most commonly used sklearn ML models and one DL model (the same models as in the previous section).


In [None]:
import re
import nltk
nltk.download("stopwords") # downloads stopwords
from nltk.corpus import stopwords # imports them afterwards
from nltk.stem.porter import PorterStemmer # applies stemming, takes only roots of the words

# Build `corpus`: one cleaned string per training tweet, in the same order as
# the rows of `dataset`. Cleaning = keep letters only, lowercase, drop
# stop-words, Porter-stem what is left, re-join with single spaces.
corpus = []
ps = PorterStemmer()

# Only the first 143 entries of NLTK's English stop-word list are used.
# NOTE(review): presumably this cuts off the trailing negated contractions
# ("ain", "aren't", ...) so they survive cleaning -- confirm against the
# installed NLTK version, since the slice depends on the list's order.
# Negation words are kept as tokens because they can flip a tweet's meaning.
# Filtering (instead of list.remove) does not raise if a word is absent from
# the sliced list.
kept_negations = {'not', "don", "don't"}
all_stopwords = [word for word in stopwords.words('english')[:143]
                 if word not in kept_negations]

# Build the lookup set once; the original rebuilt it for every single token.
stopword_set = set(all_stopwords)

# Iterate over the column values directly rather than label-indexing with
# range(len(dataset)), so the loop does not depend on a default 0..n-1 index.
for raw_text in dataset['text']:
  # str() keeps the original behaviour for non-string cells (e.g. NaN -> 'nan').
  tokens = re.sub('[^a-zA-Z]', ' ', str(raw_text)).lower().split()
  stems = [ps.stem(token) for token in tokens if token not in stopword_set]
  corpus.append(' '.join(stems))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Features: the list of cleaned tweet strings built above.
X = corpus
# Labels: the `target` column of the training frame (values shown in the
# preview are 0/1 integers).
y = dataset["target"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for evaluation. random_state=1 pins the shuffle
# so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features with the vocabulary capped at 2000 terms.
cv = CountVectorizer(max_features=2000)
# The vocabulary is learned from the training split only and then re-applied
# to the test split, so no test-set words leak into the feature space.
# NOTE: X_train / X_test are rebound from lists of strings to dense arrays
# here, so this cell is not idempotent -- re-running it without re-running the
# split cell would feed arrays, not text, into the vectorizer.
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [None]:
f1_scores = {}

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: L2-penalised logistic regression fitted with the liblinear solver
# on the bag-of-words training matrix.
lr_model = LogisticRegression(penalty = 'l2',solver = 'liblinear', max_iter=1000)
lr_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
from sklearn.metrics import f1_score

# Score the logistic-regression baseline on the held-out 20% split.
y_pred_lr = lr_model.predict(X_test)

# F1 is computed with f1_score's defaults (binary, positive label 1).
f1_score_lr = f1_score(y_test, y_pred_lr)
print(f1_score_lr)

0.742857142857143


In [None]:
f1_scores["LogisticRegression"] = f1_score_lr

# K-neighbors

In [None]:
# %time
# from sklearn.neighbors import KNeighborsClassifier

# knc_grid = {"leaf_size": list(range(1, 5)),
#             "n_neighbors": list(range(1, 5)),
#             "p": [2]}

# knc_search = RandomizedSearchCV(estimator=KNeighborsClassifier(),
#                           param_distributions=knc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# knc_search.fit(X_train, y_train)
# knc_best_params = knc_search.best_params_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours with library defaults (the recorded repr shows
# n_neighbors=5, Minkowski metric with p=2); no tuning is applied here.
knc_model = KNeighborsClassifier()
knc_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Score the KNN model on the held-out split, same metric as the other models.
y_pred_knc = knc_model.predict(X_test)

f1_score_knc = f1_score(y_test, y_pred_knc)
print(f1_score_knc)

0.5167037861915368


In [None]:
f1_scores["KNeighborsClassifier"] = f1_score_knc

# Decision Tree

In [None]:
# from sklearn.tree import DecisionTreeClassifier

# dtc_grid = {"max_depth": [3, None],
#               "max_features": randint(1, 9),
#               "min_samples_leaf": randint(1, 9),
#               "criterion": ["gini", "entropy"]}

# dtc_search = RandomizedSearchCV(estimator=DecisionTreeClassifier(),
#                           param_distributions=dtc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# dtc_search.fit(X_train, y_train)
# dtc_best_params = dtc_search.best_params_
# print(dtc_best_params)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the tree
# builder permutes features at each split; without a seed, ties between
# equally good splits are broken differently from run to run, so the fitted
# tree and its F1 score are not reproducible under Restart & Run All.
dtc_model = DecisionTreeClassifier(random_state=1)
dtc_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Score the decision tree on the held-out split.
y_pred_dtc = dtc_model.predict(X_test)

f1_score_dtc = f1_score(y_test, y_pred_dtc)
print(f1_score_dtc)

0.6877409406322282


In [None]:
f1_scores["DecisionTreeClassifier"] = f1_score_dtc

# Ensemble

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# rfc_grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
#             "max_depth": [5, 8, 15, 25, 30],
#             "max_features": ["auto", "sqrt"],
#             "min_samples_split": [2, 5, 10, 15, 100],
#             "min_samples_leaf": [1, 2, 5, 10]}

# rfc_search = RandomizedSearchCV(estimator=RandomForestClassifier(),
#                           param_distributions=rfc_grid,
#                           cv=5,
#                           scoring='f1',
#                           verbose=True)

# rfc_search.fit(X_train, y_train)
# rfc_best_params = rfc_search.best_params_
# print(rfc_best_params)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because both the
# bootstrap sampling of rows and the per-split feature sub-sampling are
# random; an unseeded forest gives a slightly different F1 on every run.
rfc_model = RandomForestClassifier(random_state=1)
rfc_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# Score the random forest on the held-out split.
y_pred_rfc = rfc_model.predict(X_test)

f1_score_rfc = f1_score(y_test, y_pred_rfc)
print(f1_score_rfc)

0.7190635451505016


In [None]:
f1_scores["RandomForestClassifier"] = f1_score_rfc

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 1000 boosting rounds at learning rate 0.05, using
# all available cores (n_jobs=-1). No early stopping or validation set is
# passed, so all 1000 rounds are always fitted.
xgb_model = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Score the XGBoost model on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)

f1_score_xgb = f1_score(y_test, y_pred_xgb)
print(f1_score_xgb)

0.7139107611548556


In [None]:
f1_scores["XGBoost"] = f1_score_xgb

In [None]:
f1_scores

{'DecisionTreeClassifier': 0.6877409406322282,
 'KNeighborsClassifier': 0.5167037861915368,
 'LogisticRegression': 0.742857142857143,
 'RandomForestClassifier': 0.7190635451505016,
 'XGBoost': 0.7139107611548556}

In [None]:
# output = pd.DataFrame({'Id': X_test.index,
#                        'SalePrice': preds_test})
# output.to_csv('submission.csv', index=False)



# Submission of model, trained on full training set

In [None]:
# Search space for the logistic-regression grid search:
# 2 penalties x 20 log-spaced C values (1e-4 .. 1e4) x 1 solver = 40 candidates.
# The solver is fixed to liblinear so that both 'l1' and 'l2' penalties are valid.
lr_grid = {'penalty' : ['l1', 'l2'],
            'C' : np.logspace(-4, 4, 20),
            'solver' : ['liblinear']}

In [None]:
from sklearn.model_selection import GridSearchCV

lr_model = LogisticRegression()
lr_model_grid = GridSearchCV(estimator=lr_model,
                             param_grid=lr_grid,
                             scoring='f1',
                             n_jobs=-1,
                             cv=5,
                             verbose=True)


lr_model_grid.fit(X_train, y_train)
%time

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.8min finished


CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


In [None]:
lr_model_grid_params = lr_model_grid.best_params_
print(lr_model_grid_params)

{'C': 0.23357214690901212, 'penalty': 'l2', 'solver': 'liblinear'}


In [None]:
# Re-vectorise the whole cleaned training corpus (no hold-out) for the final model.
# NOTE(review): the vocabulary cap here is 2500, but the grid search that
# produced `lr_model_grid_params` ran on features from a 2000-term vectorizer.
# The tuned C was therefore selected on a different feature space -- confirm
# this mismatch is intentional.
cv_final = CountVectorizer(max_features=2500)
X_train_full = cv_final.fit_transform(X).toarray()

# Refit logistic regression on all labelled data with the best grid-search parameters.
lr_model_final = LogisticRegression(**lr_model_grid_params)
lr_model_final.fit(X_train_full, y)

LogisticRegression(C=0.23357214690901212, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# CAUTION: this predicts on X_train_full, the same matrix lr_model_final was
# fitted on, so the printed value is a training-set F1. It is not comparable
# to the held-out scores stored in `f1_scores` and does not estimate
# performance on unseen tweets.
y_pred_lr_final = lr_model_final.predict(X_train_full)

f1_score_lr_final = f1_score(y, y_pred_lr_final)
print(f1_score_lr_final)

0.8306878306878308


In [None]:
X_test_full = pd.read_csv("test.csv")

In [None]:
# Transforming X_test_full to vector

In [None]:
# Build `test_corpus`: one cleaned string per test tweet, in the same order as
# the rows of `X_test_full`. The steps mirror the training-set cleaning
# (letters only, lowercase, stop-words dropped, Porter-stemmed) so the fitted
# vectorizer sees tokens in the same form it was trained on.
test_corpus = []
ps = PorterStemmer()

# Same stop-word list as for training: the first 143 NLTK English entries,
# minus negation words, which are kept as tokens. Filtering (instead of
# list.remove) does not raise if a word is absent from the sliced list.
kept_negations = {'not', "don", "don't"}
all_stopwords = [word for word in stopwords.words('english')[:143]
                 if word not in kept_negations]

# Build the lookup set once; the original rebuilt it for every single token.
stopword_set = set(all_stopwords)

# Iterate over the column values directly rather than label-indexing with
# range(len(X_test_full)), so the loop does not depend on a default index.
for raw_text in X_test_full['text']:
  # str() keeps the original behaviour for non-string cells (e.g. NaN -> 'nan').
  tokens = re.sub('[^a-zA-Z]', ' ', str(raw_text)).lower().split()
  stems = [ps.stem(token) for token in tokens if token not in stopword_set]
  test_corpus.append(' '.join(stems))

In [None]:
# Reuse the already-fitted `cv_final` (transform only, no refit) so the test
# matrix has the same columns as the matrix the final model was trained on.
# The line below is disabled on purpose: creating a fresh, unfitted vectorizer
# here would discard that fitted vocabulary.
#cv_final = CountVectorizer(max_features=2000)
X_test_vectorized = cv_final.transform(test_corpus).toarray()

In [None]:
y_pred_test = lr_model_final.predict(X_test_vectorized)

In [None]:
y_pred_test[:10]

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0])

In [None]:
# Save test predictions to file
# Two-column submission frame: the tweet ids from the test file next to the
# class predicted for each of them.
submission_columns = {
    'id': X_test_full['id'],
    'target': y_pred_test,
}
output = pd.DataFrame(submission_columns)
# index=False keeps the DataFrame's row index out of the CSV.
output.to_csv('submission.csv', index=False)