In [1]:
import numpy as np 
import pandas as pd
pd.set_option('display.max_colwidth', 100)

import re
import string
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

## Load train and test data

In [3]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part ...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffer...,1


In [4]:
test = pd.read_csv('test.csv')
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, stay safe everyone."
2,3,,,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn
3259,10865,,,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it wa...
3260,10868,,,Green Line derailment in Chicago http://t.co/UtbXLcBIuY
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3


## Basic data exploration

In [5]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
test.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [7]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In this case we are going to use only the text data to build features quickly by applying Count Vectorization and then train a Naive Bayes machine learning model for text classification, but first we are going to do some text processing.

## Text processing

In [8]:
def process_text(text):
    # Make text lowercase
    text = text.lower()
    # Remove links
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text) 
    # Remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    # Tokenization
    tokens = re.split('\W+', text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Remove short tokens
    tokens = [word for word in tokens if len(word)>2]
    text = ' '.join(tokens)
    return text

In [9]:
# Apply the function to both test and training datasets
train['processed_text'] = train['text'].apply(lambda x: process_text(x))
test['processed_text'] = test['text'].apply(lambda x: process_text(x))

train

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near ronge sask canada
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,residents asked shelter place notified officers evacuation shelter place orders expected
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1,people receive wildfires evacuation orders california
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1,two giant cranes holding bridge collapse nearby homes
7609,10870,,,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part ...,1,ariaahrary thetawniest control wild fires california even northern part state troubling
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1,volcano hawaii
7611,10872,,,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffer...,1,police investigating ebike collided car little portugal ebike rider suffered serious nonlife thr...


## Apply Token Vectorization to processed text data

In [10]:
def count_vect(min_ngram, max_ngram, binary=True):
    #Create a CountVectorizer object
    count_vectorizer = CountVectorizer(ngram_range=(min_ngram, max_ngram), binary=binary)
    
    #Fit on train data and transform both train and test data
    train_vectors = count_vectorizer.fit_transform(train['processed_text'])
    test_vectors = count_vectorizer.transform(test['processed_text'])
    
    #Transform vectors to arrays
    train_features = train_vectors.toarray()
    test_features = test_vectors.toarray()   
    print('X_train.shape: {}'.format(train_features.shape))
    
    return train_features, test_features

In [13]:
X_train, X_test = count_vect(1, 2, binary=True)

y_train = train['target']

X_train.shape: (7613, 60371)


## Build a Machine Learning model

In [14]:
def print_best_score(hp_optimizer):
    """
    Prints the score and parameters of the best model 
    selected by the hyperparameter optimizer.
    
    Also prints the mean and standard deviation score for
    all the models evaluated by the hyperparameter optimizer
    according to the entered parameters.
    """ 
    print('BEST SCORE: {} - PARAMS: {}\n'.format(round(hp_optimizer.best_score_, 3), hp_optimizer.best_params_))
    
    means = hp_optimizer.cv_results_['mean_test_score']
    stds = hp_optimizer.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, hp_optimizer.cv_results_['params']):
        print('Score: {} (+/-{}) for Params: {}'.format(round(mean, 3), round(std * 2, 3), params))

In [15]:
model = MultinomialNB()
parameters = {
    'alpha': np.arange(1, 11, 1)
}

hp_optimizer = GridSearchCV(model, parameters, cv=4, scoring='f1')
hp_optimizer.fit(X_train, y_train.values.ravel())

print_best_score(hp_optimizer)

BEST SCORE: 0.672 - PARAMS: {'alpha': 2}

Score: 0.671 (+/-0.039) for Params: {'alpha': 1}
Score: 0.672 (+/-0.044) for Params: {'alpha': 2}
Score: 0.67 (+/-0.052) for Params: {'alpha': 3}
Score: 0.667 (+/-0.057) for Params: {'alpha': 4}
Score: 0.664 (+/-0.065) for Params: {'alpha': 5}
Score: 0.663 (+/-0.066) for Params: {'alpha': 6}
Score: 0.66 (+/-0.065) for Params: {'alpha': 7}
Score: 0.66 (+/-0.062) for Params: {'alpha': 8}
Score: 0.657 (+/-0.063) for Params: {'alpha': 9}
Score: 0.656 (+/-0.065) for Params: {'alpha': 10}


### Keep optimizing parameters...

In [16]:
model = MultinomialNB()
parameters = {
    'alpha': np.arange(0.1, 3.1, 0.1)
}

hp_optimizer = GridSearchCV(model, parameters, cv=4, scoring='f1')
hp_optimizer.fit(X_train, y_train.values.ravel())

print_best_score(hp_optimizer)

BEST SCORE: 0.673 - PARAMS: {'alpha': 1.9000000000000001}

Score: 0.657 (+/-0.029) for Params: {'alpha': 0.1}
Score: 0.662 (+/-0.027) for Params: {'alpha': 0.2}
Score: 0.667 (+/-0.031) for Params: {'alpha': 0.30000000000000004}
Score: 0.666 (+/-0.033) for Params: {'alpha': 0.4}
Score: 0.668 (+/-0.031) for Params: {'alpha': 0.5}
Score: 0.669 (+/-0.033) for Params: {'alpha': 0.6}
Score: 0.668 (+/-0.033) for Params: {'alpha': 0.7000000000000001}
Score: 0.669 (+/-0.034) for Params: {'alpha': 0.8}
Score: 0.671 (+/-0.038) for Params: {'alpha': 0.9}
Score: 0.671 (+/-0.039) for Params: {'alpha': 1.0}
Score: 0.672 (+/-0.041) for Params: {'alpha': 1.1}
Score: 0.673 (+/-0.04) for Params: {'alpha': 1.2000000000000002}
Score: 0.673 (+/-0.041) for Params: {'alpha': 1.3000000000000003}
Score: 0.672 (+/-0.042) for Params: {'alpha': 1.4000000000000001}
Score: 0.672 (+/-0.04) for Params: {'alpha': 1.5000000000000002}
Score: 0.673 (+/-0.041) for Params: {'alpha': 1.6}
Score: 0.673 (+/-0.041) for Params: 

### Fit final model

In [17]:
best_model = hp_optimizer.best_estimator_

y_pred = best_model.predict(X_train)

confusion_matrix(y_train, y_pred)

array([[4237,  105],
       [ 240, 3031]], dtype=int64)

## Making the submission

In [18]:
predictions = best_model.predict(X_test).tolist()

sample_submission = pd.read_csv('sample_submission.csv')
submission = sample_submission.copy()
submission['target'] = pd.Series(predictions).astype(int)
submission.to_csv('submission_2020-04-05-1.csv', index=False)
submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [19]:
submission['target'].value_counts()

0    2078
1    1185
Name: target, dtype: int64

# Score: 0.80470