# Disaster Tweets

In [82]:
import re
import spacy
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score

In [3]:
# Load original data
training_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
# Inspect shapes
print('Training set shape:', training_data.shape)
print('Test set shape:    ', test_data.shape)

Training set shape: (7613, 5)
Test set shape:     (3263, 4)


In [5]:
# Inspect first rows
display(training_data.head())
display(test_data.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [23]:
# Missing values in training set
training_data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [24]:
# Missing values in test set
test_data.isnull().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

## Pre-Processing

### Indicatior features

In [25]:
# Functions to create a DataFrame of indicators
def indicate_pattern(col, pattern):
    if re.search(pattern, col):
        return 1
    else:
        return 0
    
    
def make_indicator_col(df, input_col_index, pattern='', suffix='pattern'):
    df['contains_' + suffix] = df[input_col_index].apply(lambda x: indicate_pattern(x, pattern))
    
    
def make_indicators(df, column, pattern_list, suffix_list):
    # Append indicator columns to original DataFrame
    for pattern, suffix in zip(pattern_list, suffix_list):
        make_indicator_col(df, column, pattern, suffix)
    
    new_columns = ['contains_' + suffix for suffix in suffix_list]
    output = df[new_columns]
    
    # Drop new columns from original DataFrame
    df.drop(new_columns, axis=1, inplace=True)
    
    return output

In [26]:
# Create indicators by regex patterns
training_data_ind = make_indicators(training_data, 'text', 
                                    pattern_list=['@[\w]*', '#[\w]*', r'http.?://[^\s]+[\s]?'], 
                                    suffix_list=['mention', 'hashtag', 'URL'])
test_data_ind = make_indicators(training_data, 'text', 
                                pattern_list=['@[\w]*', '#[\w]*', r'http.?://[^\s]+[\s]?'], 
                                suffix_list=['mention', 'hashtag', 'URL'])

### Clean text data

In [27]:
# Handle any duplicates in only the training set
print('Before:', training_data.shape)
training_data.drop_duplicates(inplace=True)
print('After: ', training_data.shape)

Before: (7613, 5)
After:  (7613, 5)


In [28]:
# Remove menntions, special characters, and extra whitespace from tweets
def clean_tweet(tweet):
    tweet = tweet.lower()
    # Remove mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http.?://[^\s]+[\s]?', '', tweet)
    # Remove special characters except hash
    tweet = re.sub('[^a-zA-Z\s]', ' ', tweet)
    # Remove extra whitespace
    tweet = re.sub(" +", ' ', tweet)
    tweet = tweet.lstrip()
    tweet = tweet.rstrip()
    return tweet

def remove_stopwords(tweet):
    combined_stopwords = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
    tokens = [token for token in tweet.split() if token not in combined_stopwords]
    return ' '.join(tokens)

def clean_df(df):
    df['cleaned_text'] =  df['text'].apply(clean_tweet)
    df['cleaned_text'] = df['cleaned_text'].apply(remove_stopwords)
    df = df.drop(['text', 'keyword', 'location'], axis=1, inplace=True)
    return df

In [29]:
# Clean training data, and add length features to indicator DataFrame
training_data_cleaned = clean_df(training_data)
training_data_ind[['original_length', 'cleaned_length']] = training_data_cleaned[['original_length', 'cleaned_length']]
training_data_cleaned = training_data_cleaned.drop(['original_length', 'cleaned_length'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [30]:
# Clean test data, and add length features to indicator DataFrame
test_data_cleaned = clean_df(test_data)
test_data_ind[['original_length', 'cleaned_length']] = test_data_cleaned[['original_length', 'cleaned_length']]
test_data_cleaned = test_data_cleaned.drop(['original_length', 'cleaned_length'], axis=1)

### Make tokenizers

In [31]:
# Make a custom stemming tokenizer
def tokenizer_stems(document):
    stemmer = PorterStemmer()
    tokens = document.split()
    return [stemmer.stem(token) for token in tokens]

### TF-IDF transformation

In [33]:
training_data_cleaned['target_class'] = training_data_cleaned['target']
training_data_cleaned = training_data_cleaned.drop('target', axis=1)

In [36]:
vectorizer = TfidfVectorizer(min_df=3, tokenizer=tokenizer_stems)
vectorizer.fit(training_data_cleaned['cleaned_text'])
v = vectorizer.transform(training_data_cleaned['cleaned_text']).todense()

features = vectorizer.get_feature_names()
vect_df = pd.DataFrame(v, columns=features)



In [39]:
# Combine vect_df with indicators
abt = pd.concat([training_data_ind, vect_df], axis=1)
abt.head()

Unnamed: 0,contains_mention,contains_hashtag,contains_URL,original_length,cleaned_length,aa,aba,abandon,abbott,abbswinston,...,yyc,yycstorm,z,zayn,zero,zionism,zionist,zombi,zone,zouma
0,0,1,0,69,37,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,38,32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,133,88,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0,65,53,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,88,55,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
y = training_data_cleaned['target_class']

## Model Selection

In [42]:
X_train, X_test, y_train, y_test = train_test_split(abt, y, stratify=y, random_state=1)

In [99]:
classifiers = [
    ('Logistic / liblinear', LogisticRegression()),
    #('Logistic / newton-cg, lbfgs, sag', LogisticRegression()),
    #('Logistic / saga', LogisticRegression()),
    ('LinearSVC', LinearSVC()),
    ('SVC', SVC()),
    ('SGD', SGDClassifier()),
    ('BernNB', BernoulliNB()),
    ('MultNB', MultinomialNB()),
    ('RandomForests', RandomForestClassifier()),
    ('ExtraTrees', ExtraTreesClassifier()),
    ('Bagging', BaggingClassifier())
]

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
gamma = param_range + ['scale', 'auto']

param_grid = [
    {
        'clf': [LogisticRegression()],
        'clf__solver': ['liblinear'],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range,
    }, 
        #{'clf': [LogisticRegression()],
        #'clf__solver': ['lbfgs'],
        #'clf__penalty': ['l2', None],
        #'clf__C': param_range}
        #{'clf': [LogisticRegression()],
        #'clf__solver': ['saga'],
        #'clf__penalty': ['l1', 'l2', 'elasticnet', None],
        #'clf__C': param_range}
    {
        'clf': [LinearSVC()],
        'clf__C': param_range,
        'clf__penalty': ['l1', 'l2']
    }, {
        'clf': [SVC()],
        'clf__C': param_range,
        'clf__kernel': ['rbf'],
        'clf__gamma': gamma
    }, {
        'clf': [SGDClassifier()],
        'clf__loss': ['log', 'hinge'],
        'clf__penalty': ['l1', 'l2', 'elasticnet'],
        'clf__alpha': param_range
    }, {
        'clf': [BernoulliNB()],
        'clf__alpha': param_range
    }, {
        'clf': [MultinomialNB()],
        'clf__alpha': param_range
    }, {
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': [50, 100, 250]
    }, {
        'clf': [ExtraTreesClassifier()],
        'clf__n_estimators': [50, 100, 250]
    }, {
        'clf': [BaggingClassifier()],
        'clf__n_estimators': [50, 100, 250]
    }
]
        
for (name, classifier), params in zip(classifiers, param_grid):
    clf_pipe = Pipeline([
        ('clf', LinearSVC())
    ])

    random_search = RandomizedSearchCV(clf_pipe, 
                                       params, 
                                       cv=3, 
                                       n_jobs=-1, 
                                       return_train_score=True,
                                       random_state=1)
    random_search.fit(X_train, y_train)
    print(name)
    print("CV: {:.2f}".format(random_search.best_score_))
    print("Test score: {:.2f}".format(random_search.score(X_test, y_test)))
    print("Best parameters: {}\n".format(random_search.best_params_))



Logistic / liblinear
CV: 0.76
Test score: 0.76
Best parameters: {'clf__solver': 'liblinear', 'clf__penalty': 'l1', 'clf__C': 10.0, 'clf': LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)}





KeyboardInterrupt: 

## Bernoulli Naive-Bayes

In [50]:
bnb_params = {
    'alpha': np.arange(1.0, 1.25, 1.5, 1.75, 2.0)
}

bnb_grid = GridSearchCV(BernoulliNB(), bnb_params, cv=7, n_jobs=-1, return_train_score=True)
bnb_grid.fit(X_train, y_train)
print("Best cross-validation score: {}".format(bnb_grid.best_score_))
print("Best parameters: {}".format(bnb_grid.best_params_))

Best cross-validation score: 0.8013662637940094
Best parameters: {'alpha': 1.3500000000000003}


### Predictions

In [74]:
# TF-IDF transformation on test data
test_v = vectorizer.transform(test_data_cleaned['cleaned_text']).todense()

test_vect_df = pd.DataFrame(test_v, columns=features)

test_abt = pd.concat([test_data_ind, test_vect_df], axis=1)

# Fit model, make predictions
bnb = BernoulliNB(alpha=1.35)
bnb.fit(abt, y)

ids = test_data['id']
bnb_pred = pd.Series(bnb.predict(test_abt), index=ids)

# Export predictions
final = pd.DataFrame({'id':bnb_pred.index, 'target':bnb_pred.values})
final.to_csv('bnb.csv', index=False, header=True)