In [3]:
import pandas as pd
import numpy as np

In [4]:
dataset = pd.read_csv("train.csv")
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Plan

So I will try to compose a few combinations of data & models to find the best one and use it as a final one. Those combinations are:
* Data with no changes (only tweet text used);
  * Use this data to build mostly used sklearn ML models and one DL model:
    * Logistic Regression;
    * K-neighbors;
    * Decision Tree;
    * Ensemble;
    * ANN;
* Data with cleaned up text in tweets (removed symbols, stop-words and stemmed text);
  * Use this data to build mostly used sklearn ML models and one DL model (same models as in previous chapter);


# def concatenate

In [5]:
# Let's first create the function which will concatenate text from tweets with 
# location and keyword fields.

import spacy

def concatenate(dataset):
  corpus = []
  for i in range(0, len(dataset)):
    entry = dataset['text'][i]
    if not str(dataset['keyword'][i]) == 'nan':
      entry += str(dataset['keyword'][i])
    if not str(dataset['location'][i]) == 'nan':
      entry += str(dataset['location'][i])

    corpus.append(entry)

  return corpus

# def lemmatize

In [6]:
# Now let's create the function which will lemmatize text from tweets with

def lemmatize(texts):
  nlp = spacy.load('en')
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  corpus = []
  for text in texts:
    tokens = tokenizer(text)
    tokens = [token.lemma_.lower() for token in tokens]
    tokens = ' '.join(tokens)
    corpus.append(tokens)

  return corpus

# def remove_stopwords

In [7]:
def remove_stopwords(texts):
  nlp = spacy.load('en')
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  corpus = []
  for text in texts:
    tokens = tokenizer(text)
    tokens = [token.text for token in tokens if not token.is_stop]
    tokens = ' '.join(tokens)
    corpus.append(tokens)

  return corpus

# def remove_punctuation

In [8]:
def remove_punctuation(texts):
  nlp = spacy.load('en')
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  corpus = []
  for text in texts:
    tokens = tokenizer(text)
    tokens = [token.text for token in tokens if not token.is_punct]
    tokens = ' '.join(tokens)
    corpus.append(tokens)

  return corpus

# def remove_notalpha

In [9]:
def remove_notalpha(texts):
  nlp = spacy.load('en')
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  corpus = []
  for text in texts:
    tokens = tokenizer(text)
    tokens = [token.text for token in tokens if token.is_alpha]
    tokens = ' '.join(tokens)
    corpus.append(tokens)

  return corpus

# def remove_links

In [10]:
def remove_links(texts):
  nlp = spacy.load('en')
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  corpus = []
  for text in texts:
    tokens = tokenizer(text)
    tokens = [token.text for token in tokens if not token.like_url]
    tokens = ' '.join(tokens)
    corpus.append(tokens)

  return corpus

# "Pipelinizing" the functions created previously

In [11]:
from sklearn.preprocessing import FunctionTransformer

trans_concat = FunctionTransformer(concatenate)
trans_lemma = FunctionTransformer(lemmatize)
trans_stop = FunctionTransformer(remove_stopwords)
trans_punct = FunctionTransformer(remove_punctuation)
trans_alpha = FunctionTransformer(remove_notalpha)
trans_links = FunctionTransformer(remove_links)

In [12]:
dataset = pd.read_csv("train.csv")

In [13]:
from sklearn.pipeline import Pipeline

transformers = [('concatenation', trans_concat),
                ('lemmatization', trans_lemma),
                ('links_removal', trans_links),
                ('leave_alpha', trans_alpha),
              #('stopwords_removal', trans_stop),
              #('punctuation_removal', trans_punct)]
              ]

my_pipe = Pipeline(transformers)
X = my_pipe.transform(dataset)

In [14]:
y = dataset['target']

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Now we'll create TF-IFD vectorizer and Count vectorizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

max_features=1500
tfidf = TfidfVectorizer(max_features=max_features)

X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [17]:
f1_scores = {}

# Logistic Regression

In [18]:
# lr_grid = {'penalty' : ['l1', 'l2'],
#             'C' : np.logspace(-4, 4, 20),
#             'solver' : ['liblinear'],
#             'max_iter' : [10, 100, 500, 1000, 2000, 5000]}

In [19]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

# lr_model = LogisticRegression()
# lr_model_grid = GridSearchCV(estimator=lr_model,
#                              param_grid=lr_grid,
#                              scoring='f1',
#                              n_jobs=-1,
#                              cv=5,
#                              verbose=True)

In [20]:
# lr_model_grid.fit(X_train_tfidf, y_train)
# lr_model_grid_params_tfidf = lr_model_grid.best_params_
# print(lr_model_grid_params_tfidf)

In [21]:
# Fitting 5 folds for each of 240 candidates, totalling 1200 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.6s
# [Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   18.5s
# [Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   39.8s
# [Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  1.2min
# [Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  3.4min finished
# {'C': 1.623776739188721, 'max_iter': 10, 'penalty': 'l2', 'solver': 'liblinear'}

In [22]:
lr_model_grid_params_tfidf = {'C': 1.623776739188721,
                              'max_iter': 10,
                              'penalty': 'l2',
                              'solver': 'liblinear'}

In [23]:
from sklearn.linear_model import LogisticRegression

lr_model_tfidf = LogisticRegression(**lr_model_grid_params_tfidf)
lr_model_tfidf.fit(X_train_tfidf, y_train)

LogisticRegression(C=1.623776739188721, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
from sklearn.metrics import f1_score

y_pred_lr_tfidf = lr_model_tfidf.predict(X_test_tfidf)

f1_score_lr_tfidf = f1_score(y_test, y_pred_lr_tfidf)
print(f1_score_lr_tfidf)

0.7363100252737994


# Using CountVectorizer

In [25]:
count = CountVectorizer(max_features=max_features)

X_train_count = count.fit_transform(X_train).toarray()
X_test_count = count.transform(X_test).toarray()

In [26]:
# lr_model_grid.fit(X_train_count, y_train)
# lr_model_grid_params_count = lr_model_grid.best_params_
# print(lr_model_grid_params_count)

In [27]:
# Fitting 5 folds for each of 240 candidates, totalling 1200 fits
# [Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
# [Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.0s
# [Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   20.3s
# [Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:   46.3s
# [Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  1.5min
# [Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed:  4.8min finished
# {'C': 0.615848211066026, 'max_iter': 10, 'penalty': 'l2', 'solver': 'liblinear'}

In [28]:
lr_model_grid_params_count = {'C': 0.615848211066026,
                              'max_iter': 10,
                              'penalty': 'l2',
                              'solver': 'liblinear'}

In [29]:
lr_model_count = LogisticRegression(**lr_model_grid_params_count)
lr_model_count.fit(X_train_count, y_train)

LogisticRegression(C=0.615848211066026, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
y_pred_lr_count = lr_model_count.predict(X_test_count)

f1_score_lr_count = f1_score(y_test, y_pred_lr_count)
print(f1_score_lr_count)

0.7417998317914214


In [31]:
f1_scores["LogisticRegression(TFIDF)"] = f1_score_lr_tfidf
f1_scores["LogisticRegression(COUNT)"] = f1_score_lr_count

In [32]:
f1_scores

{'LogisticRegression(COUNT)': 0.7417998317914214,
 'LogisticRegression(TFIDF)': 0.7363100252737994}

# KNN

In [33]:
from sklearn.neighbors import KNeighborsClassifier

knc_model = KNeighborsClassifier()
knc_model.fit(X_train_tfidf, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [34]:
y_pred_knc = knc_model.predict(X_test_tfidf)

f1_score_knc = f1_score(y_test, y_pred_knc)
print(f1_score_knc)

0.3951219512195122


# Decsision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtc_grid = {"max_depth": [3, None],
              "max_features": range(1, 10),
              "min_samples_leaf": range(1, 10),
              "criterion": ["gini", "entropy"]}

dtc_search = GridSearchCV(estimator=DecisionTreeClassifier(),
                          param_grid=dtc_grid,
                          scoring='f1',
                          n_jobs=-1,
                          cv=5,
                          verbose=True)

In [36]:
dtc_search.fit(X_train_tfidf, y_train)
dtc_best_params_tfidf = dtc_search.best_params_
print(dtc_best_params_tfidf)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 716 tasks      | elapsed:   26.7s


{'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 3}


[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  1.0min finished


In [37]:
dtc_model_tfidf = DecisionTreeClassifier(**dtc_best_params_tfidf)
dtc_model_tfidf.fit(X_train_tfidf, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=9, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [38]:
y_pred_dtc_tfidf = dtc_model_tfidf.predict(X_test_tfidf)

f1_score_dtc_tfidf = f1_score(y_test, y_pred_dtc_tfidf)
print(f1_score_dtc_tfidf)

0.6526492851135408


In [39]:
f1_scores["DecisionTreeClassifier(TFIDF)"] = f1_score_dtc_tfidf

# Ensemble

In [40]:
from sklearn.ensemble import RandomForestClassifier

rfc_grid = {"n_estimators": [100, 200, 500, 1000],
            "max_depth": [8, 15, 25, 30],
            "max_features": ["auto", "sqrt"]}

rfc_search = GridSearchCV(estimator=RandomForestClassifier(),
                          param_grid=rfc_grid,
                          scoring='f1',
                          n_jobs=-1,
                          cv=5,
                          verbose=True)

In [41]:
rfc_search.fit(X_train_tfidf, y_train)
rfc_best_params_tfidf = rfc_search.best_params_
print(rfc_best_params_tfidf)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed: 16.2min finished


{'max_depth': 30, 'max_features': 'auto', 'n_estimators': 1000}


In [42]:
rfc_model_tfidf = RandomForestClassifier(**rfc_best_params_tfidf)
rfc_model_tfidf.fit(X_train_tfidf, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=30, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [43]:
y_pred_rfc_tfidf = rfc_model_tfidf.predict(X_test_tfidf)

f1_score_rfc_tfidf = f1_score(y_test, y_pred_rfc_tfidf)
print(f1_score_rfc_tfidf)

0.634765625


In [44]:
f1_scores["RandomForestClassifier(TFIDF)"] = f1_score_rfc_tfidf

In [45]:
f1_scores

{'DecisionTreeClassifier(TFIDF)': 0.6526492851135408,
 'LogisticRegression(COUNT)': 0.7417998317914214,
 'LogisticRegression(TFIDF)': 0.7363100252737994,
 'RandomForestClassifier(TFIDF)': 0.634765625}

# Submission of model, trained on full training set, using TF-IDF

In [79]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [80]:
y = train['target']
train.drop(['target'], axis=1, inplace=True)

In [81]:
X_train_full = my_pipe.transform(train)
X_test_full = my_pipe.transform(test)

In [82]:
max_features=2500
tfidf_final = TfidfVectorizer(max_features=max_features)

X_train_full = tfidf.fit_transform(X_train_full).toarray()
X_test_full = tfidf.transform(X_test_full).toarray()

In [83]:
from sklearn.linear_model import LogisticRegression

lr_model_final = LogisticRegression(**lr_model_grid_params_tfidf)
lr_model_final.fit(X_train_full, y)

LogisticRegression(C=1.623776739188721, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [84]:
y_pred = lr_model_final.predict(X_test_full)

In [85]:
y_pred[:30]

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1])

In [86]:
# Save test predictions to file
output = pd.DataFrame({'id': test.id,
                       'target': y_pred})
output.to_csv('submission.csv', index=False)

Score on kaggle - 0.79374

# Submission of model, trained on full training set, using Count

In [95]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [96]:
y = train['target']
train.drop(['target'], axis=1, inplace=True)

In [97]:
X_train_full = my_pipe.transform(train)
X_test_full = my_pipe.transform(test)

In [98]:
max_features = 2500
count_final = CountVectorizer(max_features=max_features)

X_train_full_count = count.fit_transform(X_train_full).toarray()
X_test_full_count = count.transform(X_test_full).toarray()

In [99]:
lr_model_count_final = LogisticRegression(**lr_model_grid_params_tfidf)
lr_model_count_final.fit(X_train_full_count, y)

LogisticRegression(C=1.623776739188721, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [100]:
y_pred_count = lr_model_count_final.predict(X_test_full_count)

In [101]:
# Save test predictions to file
output = pd.DataFrame({'id': test.id,
                       'target': y_pred_count})
output.to_csv('submission.csv', index=False)

Score on kaggle - 0.79374