In [104]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator once, up front.
# NOTE(review): the RandomForestClassifier instances created later pass no
# random_state, so they presumably draw from this global state — confirm, and
# re-run the notebook top to bottom if stable numbers are needed.
np.random.seed(0)

In [67]:
# Read the training and test tweets from CSV files in the working directory
# (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [103]:
# How many training rows have no value in the `location` column?
train_df['location'].isna().sum()

2533

In [68]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [69]:
# Tokens to discard during cleaning, assembled in one expression:
#   1. NLTK's English stopword list,
#   2. every individual character of string.punctuation,
#   3. a handful of quote / ellipsis / URL-scheme tokens.
stopwords_list = (
    stopwords.words('english')
    + list(string.punctuation)
    + ["''", '""', '...', '``', 'http', 'https']
)

In [70]:
def process_tweet(tweet):
    """Tokenize a tweet and return its lowercased, filtered tokens.

    The text is split with ``nltk.word_tokenize``. A token is discarded when
    its lowercase form is in the module-level ``stopwords_list`` or contains
    the substring ``'t.co/'``.

    Returns:
        list: the surviving tokens, lowercased, in their original order.
    """
    kept_tokens = []
    for raw_token in nltk.word_tokenize(tweet):
        token = raw_token.lower()
        if token in stopwords_list:
            continue
        if 't.co/' in token:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [71]:
# Clean every training tweet, then store the resulting token lists next to
# the raw text as a new `tokenized_text` column.
processed_data = [process_tweet(tweet_text) for tweet_text in train_df.text]
train_df['tokenized_text'] = processed_data

In [72]:
# Vocabulary: the set of distinct tokens across all processed tweets.
total_vocab = {token for tweet_tokens in processed_data for token in tweet_tokens}
len(total_vocab)

18443

In [73]:
# Flatten the per-tweet token lists into a single list, preserving order.
tweets_concat = [token for tweet_tokens in processed_data for token in tweet_tokens]

In [74]:
# Build a frequency distribution over every token in tweets_concat and
# display the 20 most common tokens with their counts.
tweets_freqdist = FreqDist(tweets_concat)
tweets_freqdist.most_common(20)

[("'s", 791),
 ("n't", 446),
 ('like', 345),
 ('amp', 344),
 ("'m", 250),
 ('fire', 249),
 ('get', 228),
 ('new', 219),
 ('via', 218),
 ('people', 197),
 ('news', 197),
 ('one', 194),
 ('video', 165),
 ('2', 162),
 ('emergency', 155),
 ('disaster', 153),
 ('would', 141),
 ('police', 138),
 ("'re", 129),
 ('still', 128)]

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [76]:
# Create the TF-IDF vectorizer with its constructor defaults (no arguments passed).
vectorizer = TfidfVectorizer()

In [77]:
# Learn the vocabulary and IDF weights from the raw training tweets and
# vectorize them in one step.
tf_idf_data_train = vectorizer.fit_transform(train_df.text)

# Project the separately loaded test tweets into the same feature space.
# This used to transform train_df.text again, which made the "test" matrix an
# exact copy of the training matrix and left test_df unused.
# NOTE(review): assumes test.csv has a `text` column like train.csv — confirm.
tf_idf_data_test = vectorizer.transform(test_df.text)

In [83]:
# Dimensions of the training TF-IDF matrix as a (rows, columns) tuple.
tf_idf_data_train.shape

(7613, 21637)

In [85]:
# Sparsity summary of the training matrix. Both figures are computed first
# and then reported, in the same order as before.
#   non_zero_cols  : stored (non-zero) entries divided by the number of rows
#   percent_sparse : 1 minus that average divided by the number of columns
non_zero_cols = tf_idf_data_train.nnz / float(tf_idf_data_train.shape[0])
percent_sparse = 1.0 - non_zero_cols / float(tf_idf_data_train.shape[1])

print("Average Number of Non-Zero Elements in Vectorized Tweets: {}".format(non_zero_cols))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Tweets: 14.645606199921188
Percentage of columns containing 0: 0.9993231221426297


In [88]:
# Labels to predict: the `target` column of the training frame.
target = train_df['target']

In [90]:
# The forest is only constructed here; it is fitted in the following cell.
rf_classifier = RandomForestClassifier(n_estimators=100)

# Fit Naive Bayes on the full training matrix (fit() hands back the fitted
# estimator), then predict on both TF-IDF matrices.
nb_classifier = MultinomialNB().fit(tf_idf_data_train, target)
nb_train_preds, nb_test_preds = (
    nb_classifier.predict(matrix)
    for matrix in (tf_idf_data_train, tf_idf_data_test)
)

In [92]:
# Fit the random forest on the full training matrix, then predict on both
# TF-IDF matrices (training matrix first).
rf_classifier.fit(tf_idf_data_train, target)
rf_train_preds, rf_test_preds = (
    rf_classifier.predict(matrix)
    for matrix in (tf_idf_data_train, tf_idf_data_test)
)

In [97]:
# Accuracy of each model on the same rows it was trained on.
nb_train_score, rf_train_score = (
    accuracy_score(target, nb_train_preds),
    accuracy_score(target, rf_train_preds),
)


In [99]:
# Emit the training-accuracy report as one newline-joined print; the text
# written to stdout is the same as printing each line separately.
print("\n".join([
    "Multinomial Naive Bayes",
    "Training Accuracy: {:.4}".format(nb_train_score),
    "",
    '-'*70,
    "",
    'Random Forest',
    "Training Accuracy: {:.4}".format(rf_train_score),
]))

Multinomial Naive Bayes
Training Accuracy: 0.8853

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9965


In [108]:
# Preview the frame again; it now carries the `tokenized_text` column that was
# added after the first preview.
train_df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[residents, asked, 'shelter, place, notified, ..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [114]:
# Feature frame: everything except the label and the derived token lists.
features = train_df.drop(['target','tokenized_text'], axis=1)

In [116]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)

In [126]:
# Refit the vectorizer on the training split's text; the held-out split is
# only transformed with that fitted vocabulary.
# Note: this rebinds the tf_idf_data_train / tf_idf_data_test names used earlier.
tf_idf_data_train = vectorizer.fit_transform(X_train['text'])
tf_idf_data_test = vectorizer.transform(X_test['text'])

In [129]:
# Fresh estimators for the train/hold-out experiment. The forest is only
# constructed here; it is fitted in the following cell.
rf_classifier = RandomForestClassifier(n_estimators=100)

# Fit Naive Bayes on the training split (fit() hands back the fitted
# estimator), then predict on the training and held-out matrices.
nb_classifier = MultinomialNB().fit(tf_idf_data_train, y_train)
nb_train_preds, nb_test_preds = (
    nb_classifier.predict(matrix)
    for matrix in (tf_idf_data_train, tf_idf_data_test)
)

In [130]:
# Fit the random forest on the training split, then predict on the training
# and held-out matrices (training matrix first).
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds, rf_test_preds = (
    rf_classifier.predict(matrix)
    for matrix in (tf_idf_data_train, tf_idf_data_test)
)

In [131]:
# Accuracy per model on the training split and on the held-out split.
nb_train_score, nb_test_score = (
    accuracy_score(y_train, nb_train_preds),
    accuracy_score(y_test, nb_test_preds),
)
rf_train_score, rf_test_score = (
    accuracy_score(y_train, rf_train_preds),
    accuracy_score(y_test, rf_test_preds),
)

# Emit the comparison report as one newline-joined print; the text written to
# stdout is the same as printing each line separately.
print("\n".join([
    "Multinomial Naive Bayes",
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score),
    "",
    '-'*70,
    "",
    'Random Forest',
    "Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score),
]))

Multinomial Naive Bayes
Training Accuracy: 0.8892 		 Testing Accuracy: 0.7912

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9969 		 Testing Accuracy: 0.7807
