**Importing Required Libraries**

In [226]:
import matplotlib.pyplot as plt
%matplotlib inline

import time
import nltk
import string
import numpy as np
import pandas as pd
from xgboost import XGBClassifier

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

from keras.models import Model
from keras.optimizers import Adam, SGD
from keras.preprocessing import text, sequence
from keras.layers import Input, Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers, layers

from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.tokenize import RegexpTokenizer

from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

np.random.seed(0)

In [9]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [10]:
df['num_char'] = df['text'].apply(lambda x: len(x))
df['num_words'] = df['text'].apply(lambda x: len(x.split()))

# Filling in Keyword Nulls

In [11]:
#grab all #hashtags in a list
df.loc[df['keyword'].isna(),'hashtags'] = df['text'].apply(lambda x: [i[1:] for i in x.split() if '#' in i])

#create a set of all values in keyword column
keywords = set(df[~df.keyword.isna()].keyword.values)

In [12]:
#narrow down the list list of all hashtags in keywords
df['new_tag'] = df.hashtags.apply(lambda x: [i.lower() for i in x if i in keywords] if type(x) == list else np.nan)

#drop empty lists
df['new_tag'] = df['new_tag'].apply(lambda x: x if x != [] and x != np.nan else np.nan)

#grab the first hashtag and fill in missing values
df['new_tag'] = df['new_tag'].apply(lambda x: x[0] if type(x) == list else np.nan)
df.loc[df.keyword.isna(), 'keyword'] = df['new_tag']

In [13]:
#drop old columns
df.drop(['new_tag','hashtags'],inplace=True, axis=1)

# Removing Weird Locations

In [14]:
#get rid of weird characters and blank spaces
characters= ['?','/', '#','+']
df['location'] = df.location.apply(lambda x: x if not any([char in str(x) for char in characters]) else np.nan)
df['location'] = df.location.apply(lambda x: x if type(x) != str else (x if x.strip() != '' else np.nan))


In [15]:
#get rid of locations with numbers
df['location'] = df.location.apply(lambda x: x if not any([i.isdigit() for i in str(x)]) else np.nan)

In [16]:
#get rid of locations with 4 or more words
df['location'] = df.location.apply(lambda x: x if type(x) != str else (x if len(x.split()) < 4 else np.nan))

In [17]:
bad_abbrev_and_words = ['Earth','Worldwide','Everywhere','Reddit','World','Global','ava','EIC','HTX', 'ATX','atx','PDX','MNL','CLT', 'NBO', 'AEP','mnl', 'ayr', 'GCC', 'Htx','wny', 'VCU', 'Orm', 'DMV','Ktx',]
df['location'] = df.location.apply(lambda x: x if x not in bad_abbrev_and_words else np.nan)


In [18]:
df.loc[df.groupby('location').location.transform('count') == 1,'location'] = np.nan

# Filling in Location Nulls

In [19]:
#split into two differnt columns for upper case abbreviations and lower case for words
df['new_loc_lower'] = df.text.apply(lambda x: [i.lower().strip() for i in x.split() if len(i) > 2])
df['new_loc_upper'] = df.text.apply(lambda x: [i.upper().strip() for i in x.split() if len(i) <= 2])

In [20]:
#remove all blank strings from both
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if '' not in x else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if '' not in x else np.nan)

#remove all empty lists from both
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if x != [] else np.nan)

In [21]:
#create a unique list of locations for both upper and lower
x = list(df.location.unique())[1:]
key_down = [i.lower() for i in x if len(i) > 2]
key_up = [i.upper() for i in x if len(i) <= 2]

In [22]:
#remove all words that arnt in either key_up or key_down
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: [i.lower() for i in x if i.lower() in key_down] if type(x) == list else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: [i.upper() for i in x if i.upper() in key_up] if type(x) == list else np.nan)


In [23]:
#remove common 2 letter words from initials
bad_initials = ['IN','ON', 'OK']
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: [i for i in x if i not in bad_initials] if type(x) == list else np.nan)

#remove empty lists and grab the first element in each list
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x if x != [] else np.nan)
df['new_loc_upper'] = df.new_loc_upper.apply(lambda x: x[0] if type(x) == list else x)

In [24]:
#remove empty lists
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)

#if only element in list grab that element
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != list else (x[0] if len(x) == 1 else x))

#if element repeats in list grab that element
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != list else (x[0] if x[0] in x[1:] else x))

In [25]:
bad_places = ['west','east','north','south', 'mass', 'hell','heaven', 'global','world', 'unknown', 'earth', 'ebola', 'nowhere','studio', 'lincoln']

#remove all words in bad_places for lists
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: [i for i in x if i not in bad_places] if type(x) == list else x)

#remove all words in bad_places for strings
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if type(x) != str else(x if x not in bad_places else np.nan))

#set empty lists to NaN and grab the first element in each list
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x if x != [] else np.nan)
df['new_loc_lower'] = df.new_loc_lower.apply(lambda x: x[0] if type(x) == list else x)

In [26]:
#set 2 digit location first since more common, then words
df.loc[df['location'].isna(),'location'] = df.loc[df['location'].isna(),'new_loc_upper']
df.loc[df['location'].isna(),'location'] = df.loc[df['location'].isna(),'new_loc_lower']

In [27]:
df.drop(['new_loc_lower','new_loc_upper'],inplace=True,axis=1)

In [28]:
df.loc[df.groupby('location').location.transform('count') == 1, 'location'] = 'Missing'

In [29]:
#taking the second location in each location with two specified
df['location'] = df['location'].apply(lambda x: x if type(x) != str else x.split(', ')[1] if ', ' in x else x)

In [30]:
#removing .
df['location'] = df['location'].apply(lambda x: x if type(x) != str else  ''.join(x.split('.')) if '.' in x else x)

#capitalize 
df['location'] = df['location'].apply(lambda x: x.title() if type(x) == str else x)

In [31]:
df.loc[df.groupby('location').location.transform('count') < 11,'location'] = 'ten_or_less'

In [32]:
df.location.fillna('Missing', inplace=True)
df.keyword.fillna('Missing', inplace=True)

In [33]:
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += ["''", '""', '...', '``', 'http', 'https']

def process_tweet(tweet):
    tokenized_tweet = nltk.word_tokenize(tweet)
    clean_results = [w.lower() for w in tokenized_tweet if not w.lower() in stopwords_list and not 't.co/' in w.lower()]
    return clean_results

In [34]:
processed_data = list(map(process_tweet, df.text))
df['tokenized_text'] = processed_data

In [35]:
total_vocab = set()
for i in processed_data:
    total_vocab.update(i)
len(total_vocab)
total_wordcount = len(total_vocab)

In [36]:
tweets_concat = []
for i in processed_data:
    tweets_concat+=i

In [37]:
tweets_freqdist = FreqDist(tweets_concat)
tweets_freqdist.most_common(20)

[("'s", 791),
 ("n't", 446),
 ('like', 345),
 ('amp', 344),
 ("'m", 250),
 ('fire', 249),
 ('get', 228),
 ('new', 219),
 ('via', 218),
 ('people', 197),
 ('news', 197),
 ('one', 194),
 ('video', 165),
 ('2', 162),
 ('emergency', 155),
 ('disaster', 153),
 ('would', 141),
 ('police', 138),
 ("'re", 129),
 ('still', 128)]

In [56]:
target = df.target
features = df.drop(columns=['target','id'])

X_train,X_test, y_train,y_test = train_test_split(features,target,test_size = 0.2,random_state = 2)

# TF-IDF Vectorization

In [57]:
vectorizer = TfidfVectorizer()

In [16]:
tf_idf_data_train = vectorizer.fit_transform(X_train.text)

tf_idf_data_test = vectorizer.transform(X_test.text)

In [17]:
tf_idf_data_train.shape

(6090, 18449)

In [18]:
non_zero_cols = tf_idf_data_train.nnz / float(tf_idf_data_train.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Tweets: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_data_train.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Tweets: 14.652709359605911
Percentage of columns containing 0: 0.9992057721632822


In [19]:
df = df.drop(columns=['tokenized_text','id'])

In [21]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [22]:
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [23]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.8892 		 Testing Accuracy: 0.7912

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9969 		 Testing Accuracy: 0.7781


# Random Forest TF-IDF

In [124]:

#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['text'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['target'], test_size=0.3, random_state=1)

In [8]:
rf_clf = RandomForestClassifier().fit(X_train, y_train)
predicted= rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, predicted))



Random Forest Accuracy: 0.7810858143607706


# Multinomial Naive Bayes TF-IDF

In [187]:
tf=TfidfVectorizer()
text_tf = tf.fit_transform(df['text'])

X_train, X_test, y_train, y_test = train_test_split(
    text_tf, df['target'], test_size=0.25, random_state=1)

In [188]:
mnb_clf = MultinomialNB().fit(X_train, y_train)
predicted= mnb_clf.predict(X_test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.7962184873949579


In [218]:
mnb_param_grid = {
    'alpha': [0.0001,0.0005,0.001,0.01,0.05,0.1,0.2,0.6,1.0,1.1,1.5]
}

In [219]:
start = time.time()
mnb_grid_search = GridSearchCV(mnb_clf,mnb_param_grid,cv=50)
mnb_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(mnb_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Multinomial Naive Bayes: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(mnb_grid_search.best_params_))

Testing Accuracy: 79.93%
Total Runtime for Grid Search on Multinomial Naive Bayes: 2.615 seconds

Optimal Parameters: {'alpha': 0.6}


## RF Hyperparameter Tuning


In [260]:
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf" : [1, 3, 6, 9],
    "n_estimators" : [10, 30, 100, 150, 200]
}

In [261]:

start = time.time()
rf_grid_search = GridSearchCV(rf_clf,rf_param_grid,cv=3)
rf_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(rf_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Random Forest Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(rf_grid_search.best_params_))

Testing Accuracy: 78.06%
Total Runtime for Grid Search on Random Forest Classifier: 3.172e+03 seconds

Optimal Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 150}


In [69]:
adaboost_clf = AdaBoostClassifier()
adaboost_clf.fit(X_train, y_train)
adaboost_mean_cv_score = np.mean(cross_val_score(adaboost_clf,X_train,y_train,cv=3))

print("Mean Cross Validation Score for AdaBoost: {:.4}%".format(adaboost_mean_cv_score * 100))

Mean Cross Validation Score for AdaBoost: 73.02%


In [70]:
adaboost_param_grid = {
    "n_estimators": [50, 100, 250],
    "learning_rate": [1.0, 0.5, 0.1]
}

In [71]:
adaboost_grid_search = GridSearchCV(adaboost_clf,adaboost_param_grid,cv=3)
adaboost_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(adaboost_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on AdaBoost: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(adaboost_grid_search.best_params_))

Testing Accuracy: 75.55%
Total Runtime for Grid Search on AdaBoost: 854.7 seconds

Optimal Parameters: {'learning_rate': 0.5, 'n_estimators': 250}


In [14]:
data = df['text'].map(word_tokenize).values

In [15]:
def process_tweet(tweet):
    tokenized_tweet = nltk.word_tokenize(tweet)
    clean_results = [w.lower() for w in tokenized_tweet if not w.lower() in stopwords_list and not 't.co/' in w.lower()]
    return clean_results

In [16]:
total_vocabulary = set(word for headline in data for word in headline)
print("There are {} unique tokens in the dataset.".format(len(total_vocabulary)))

There are 27285 unique tokens in the dataset.


In [17]:
glove = {}
with open('glove.6B.50d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [18]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note from Mike: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # It can't be used in a sklearn Pipeline. 
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [19]:


rf =  Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ("Random Forest", RandomForestClassifier(n_estimators=100, verbose=True))])
svc = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
                ('Support Vector Machine', SVC())])
lr = Pipeline([("Word2Vec Vectorizer", W2vVectorizer(glove)),
              ('Logistic Regression', LogisticRegression())])

In [20]:
models = [('Random Forest', rf)]
#           ("Support Vector Machine", svc),
#           ("Logistic Regression", lr)]

In [23]:
scores = [(name, cross_val_score(model, data, target, cv=2).mean()) for name, model, in models]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [24]:
scores

[('Random Forest', 0.7324304483223025)]

# Deep Learning w/ Word Embeddings

In [390]:
tokenizer = text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(list(df.text))
list_tokenized_tweets = tokenizer.texts_to_sequences(df.text)
X_t = sequence.pad_sequences(list_tokenized_tweets, maxlen=20)

In [391]:
y = pd.get_dummies(target).values

In [392]:
embedding_size = 32
input_ = Input(shape=(20,))
x = Embedding(5000, embedding_size)(input_)
x = LSTM(100, return_sequences=True)(x)
x = Dropout(0.5)(x)
x = Dense(50, kernel_regularizer=regularizers.l2(0.5))(x)
x = Dropout(0.5)(x)
x = Dense(25)(x)
x = Dropout(0.5)(x)
x = Dense(25)(x)
x = Dropout(0.5)(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.5)(x)
x = Dense(50, kernel_regularizer=regularizers.l2(0.5))(x)
x = Dropout(0.5)(x)
# There are 2 different possible classes, so we use 2 neurons in our output layer
x = Dense(1, activation='sigmoid')(x)

model = Model(inputs=input_, outputs=x)

In [393]:
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.00005), metrics=['accuracy'])

In [394]:
model.summary()

Model: "model_45"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_45 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_45 (Embedding)     (None, 20, 32)            160000    
_________________________________________________________________
lstm_45 (LSTM)               (None, 20, 100)           53200     
_________________________________________________________________
dropout_112 (Dropout)        (None, 20, 100)           0         
_________________________________________________________________
dense_148 (Dense)            (None, 20, 50)            5050      
_________________________________________________________________
dropout_113 (Dropout)        (None, 20, 50)            0         
_________________________________________________________________
dense_149 (Dense)            (None, 20, 25)            127

In [7]:
history = model.fit(X_t, target, epochs=400, batch_size=100, validation_split=0.2)

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

NameError: name 'model' is not defined

# XGBoost

## w/ only tf-idf

In [228]:
target = df.target
tf=TfidfVectorizer()
text_tf = tf.fit_transform(df.text)

X_train, X_test, y_train, y_test = train_test_split(
    text_tf, target, test_size=0.3, random_state=1)

In [79]:
xgb_clf = XGBClassifier()

xgb_clf.fit(X_train, y_train)
xgb_clf.predict(X_test)

In [89]:
xgb_param_grid = {
    "max_depth": [1, 3, 6, 9],
    "learning_rate": [0.001,0.01, 0.05, 0.1],
    "n_estimators" : [10, 30, 100, 150, 200]
}

In [90]:
start = time.time()
xgb_grid_search = GridSearchCV(xgb_clf,xgb_param_grid,cv=3)
xgb_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(xgb_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on XGBoost Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(xgb_grid_search.best_params_))

Testing Accuracy: 76.02%
Total Runtime for Grid Search on XGBoost Classifier: 2.569e+03 seconds



NameError: name 'zgb_grid_search' is not defined

In [91]:
print("Testing Accuracy: {:.4}%".format(xgb_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on XGBoost Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(xgb_grid_search.best_params_))

Testing Accuracy: 76.02%
Total Runtime for Grid Search on XGBoost Classifier: 2.608e+03 seconds

Optimal Parameters: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200}


# Support Vector Machine TF-IDF

In [236]:
SVM_clf = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM_clf.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [239]:
SVM_preds = SVM_clf.predict(X_test)
print("SVM Accuracy Score -> ",accuracy_score(SVM_preds, y_test)*100)

SVM Accuracy Score ->  80.69176882661996


In [243]:
SVM_param_grid = {
    "C": [1.0, 0.95, 0.9,],
    "kernel": ['rbf', 'poly', 'sigmoid', 'linear'],
    "degree" : [3, 4, 5]
}

In [246]:
start = time.time()
SVM_grid_search = GridSearchCV(SVM_clf,SVM_param_grid,cv=3)
SVM_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(SVM_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Support Vector Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(SVM_grid_search.best_params_))

Testing Accuracy: 79.92%
Total Runtime for Grid Search on Support Vector Classifier: 14.66 seconds

Optimal Parameters: {'C': 0.9, 'degree': 3, 'kernel': 'linear'}


In [242]:
print("Testing Accuracy: {:.4}%".format(SVM_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Support Vector Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(SVM_grid_search.best_params_))

Testing Accuracy: 79.92%
Total Runtime for Grid Search on Support Vector Classifier: 259.2 seconds

Optimal Parameters: {'C': 0.9, 'degree': 3, 'kernel': 'linear'}


 # Submission Stuff

## tf-idf Multinomial NB and RF submission

In [211]:
sample_submission = pd.read_csv('sample_submission.csv')

In [212]:
vectorizer2 = TfidfVectorizer()

In [213]:
df.head()

Unnamed: 0,id,keyword,location,text,target,num_char,num_words,tokenized_text
0,1,earthquake,Us,Our Deeds are the Reason of this #earthquake M...,1,69,13,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,Missing,Canada,Forest fire near La Ronge Sask. Canada,1,38,7,"[forest, fire, near, la, ronge, sask, canada]"
2,5,Missing,Missing,All residents asked to 'shelter in place' are ...,1,133,22,"[residents, asked, 'shelter, place, notified, ..."
3,6,Missing,California,"13,000 people receive #wildfires evacuation or...",1,65,8,"[13,000, people, receive, wildfires, evacuatio..."
4,7,Missing,Missing,Just got sent this photo from Ruby #Alaska as ...,1,88,16,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [214]:
tf_idf_data_train2 = vectorizer2.fit_transform(df.text)

tf_idf_data_test2 = vectorizer2.transform(test_df.text)

In [220]:
nb_classifier2 = MultinomialNB(alpha=0.6)

nb_classifier2.fit(tf_idf_data_train2, df.target)
nb_train_preds2 = nb_classifier2.predict(tf_idf_data_train2)
nb_test_preds2 = nb_classifier2.predict(tf_idf_data_test2)

In [121]:
rf_classifier2 = RandomForestClassifier(criterion='entropy', max_depth=None, min_samples_leaf=3, min_samples_split=10, n_estimators=150)

rf_classifier2.fit(tf_idf_data_train2, df.target)
rf_train_preds2 = rf_classifier2.predict(tf_idf_data_train2)
rf_test_preds2 = rf_classifier2.predict(tf_idf_data_test2)

In [222]:
sample_submission['target'] = nb_test_preds2

In [223]:
sample_submission.set_index('id', inplace=True)

In [224]:
sample_submission.to_csv('submission4')

## NN w/ word embeddings submission

In [None]:
tokenizer_test = text.Tokenizer(num_words=500)
tokenizer_test.fit_on_texts(list(test_df.text))
list_tokenized_tweets_test = tokenizer_test.texts_to_sequences(test_df.text)
testX_t = sequence.pad_sequences(list_tokenized_tweets_test, maxlen=100)

In [None]:
nn_pred = model.predict(testX_t)

y_classes = nn_pred.argmax(axis=-1)

In [225]:
sample_submission1.target = y_classes
sample_submission1.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,0
9,0
11,0


In [226]:
sample_submission1.to_csv('submission1')

## XGBoost TF-IDF Submission

In [94]:
vectorizer3 = TfidfVectorizer()
tf_idf_data_train3 = vectorizer3.fit_transform(df.text)
tf_idf_data_test3 = vectorizer3.transform(test_df.text)

In [95]:
xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=9, n_estimators=200)

xgb_clf.fit(tf_idf_data_train3, df.target)
xbg_preds = xgb_clf.predict(tf_idf_data_test3)

In [112]:
sample_submission3 = sample_submission
sample_submission3.target = xbg_preds
sample_submission3.set_index('id', inplace=True)
sample_submission3.to_csv('submission3')

KeyError: 'id'

## SVC w/ TF-IDF

In [247]:
vectorizer4 = TfidfVectorizer()
tf_idf_data_train4 = vectorizer3.fit_transform(df.text)
tf_idf_data_test4 = vectorizer3.transform(test_df.text)

In [248]:
SVM_clf = svm.SVC(C=0.9, kernel='linear', degree=3, gamma='auto')
SVM_clf.fit(tf_idf_data_train4, df.target)
SVM_preds = SVM_clf.predict(tf_idf_data_test4)

In [252]:
sample_submission4 = sample_submission
sample_submission4.target = SVM_preds
sample_submission4.to_csv('submission5')