In [68]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(0)

In [2]:
df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
df['length_of_tweet'] = df['text'].apply(lambda x: len(x))

In [4]:
df['length_of_tweet'] = df['text'].apply(lambda x: len(x))
df.loc[df['keyword'].isna(),'hashtags'] = df['text'].apply(lambda x: [i[1:] for i in x.split() if '#' in i])

keywords = set(df[~df.keyword.isna()].keyword.values)

df.loc[:, 'new_tag'] = df.hashtags.apply(lambda x: [i.lower() for i in x if i in keywords] if type(x) == list else np.nan)
df['new_tag'] = df['new_tag'].apply(lambda x: x if x != [] and x != np.nan else np.nan)
df['new_tag'] = df['new_tag'].apply(lambda x: x[0] if type(x) == list else np.nan)
df.loc[df.keyword.isna(), 'keyword'] = df['new_tag']
df.drop(columns=['new_tag','hashtags'], inplace=True)

In [5]:
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
stopwords_list += ["''", '""', '...', '``', 'http', 'https']

def process_tweet(tweet):
    tokenized_tweet = nltk.word_tokenize(tweet)
    clean_results = [w.lower() for w in tokenized_tweet if not w.lower() in stopwords_list and not 't.co/' in w.lower()]
    return clean_results

In [6]:
processed_data = list(map(process_tweet, df.text))
df['tokenized_text'] = processed_data

In [7]:
total_vocab = set()
for i in processed_data:
    total_vocab.update(i)
len(total_vocab)

18443

In [8]:
tweets_concat = []
for i in processed_data:
    tweets_concat+=i

In [9]:
tweets_freqdist = FreqDist(tweets_concat)
tweets_freqdist.most_common(20)

[("'s", 791),
 ("n't", 446),
 ('like', 345),
 ('amp', 344),
 ("'m", 250),
 ('fire', 249),
 ('get', 228),
 ('new', 219),
 ('via', 218),
 ('people', 197),
 ('news', 197),
 ('one', 194),
 ('video', 165),
 ('2', 162),
 ('emergency', 155),
 ('disaster', 153),
 ('would', 141),
 ('police', 138),
 ("'re", 129),
 ('still', 128)]

In [10]:
df.head()

Unnamed: 0,id,keyword,location,text,target,length_of_tweet,tokenized_text
0,1,earthquake,,Our Deeds are the Reason of this #earthquake M...,1,69,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,38,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,133,"[residents, asked, 'shelter, place, notified, ..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,"[13,000, people, receive, wildfires, evacuatio..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,88,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [11]:
target = df.target
features = df.drop(columns=['target','id'])

In [12]:
X_train,X_test, y_train,y_test = train_test_split(features,target,test_size = 0.2,random_state = 2)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer()

In [15]:
X_train.head()

Unnamed: 0,keyword,location,text,length_of_tweet,tokenized_text
4549,injured,USA,Offers : http://t.co/Gl3C1vc88P #8392 Deluxe T...,139,"[offers, 8392, deluxe, toilet, safety, support..."
4512,hurricane,,The hurricane mixxtail kinda tastes like the w...,87,"[hurricane, mixxtail, kinda, tastes, like, wat..."
4368,hijacker,,Complete Solution to Get Rid of http://t.co/9C...,117,"[complete, solution, get, rid, ûò, browser, h..."
4297,hellfire,,@HellFire_eV @JackPERU1 then I do this to one ...,58,"[hellfire_ev, jackperu1, one]"
13,,,#Flood in Bago Myanmar #We arrived Bago,39,"[flood, bago, myanmar, arrived, bago]"


In [16]:
tf_idf_data_train = vectorizer.fit_transform(X_train.text)

tf_idf_data_test = vectorizer.transform(X_test.text)

In [17]:
tf_idf_data_train.shape

(6090, 18449)

In [18]:
non_zero_cols = tf_idf_data_train.nnz / float(tf_idf_data_train.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Tweets: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_data_train.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Tweets: 14.652709359605911
Percentage of columns containing 0: 0.9992057721632822


In [19]:
df = df.drop(columns=['tokenized_text','id'])

In [20]:
df.head()

Unnamed: 0,keyword,location,text,target,length_of_tweet
0,earthquake,,Our Deeds are the Reason of this #earthquake M...,1,69
1,,,Forest fire near La Ronge Sask. Canada,1,38
2,,,All residents asked to 'shelter in place' are ...,1,133
3,,,"13,000 people receive #wildfires evacuation or...",1,65
4,,,Just got sent this photo from Ruby #Alaska as ...,1,88


In [21]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=100)

nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [22]:
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [23]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Multinomial Naive Bayes
Training Accuracy: 0.8892 		 Testing Accuracy: 0.7912

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9969 		 Testing Accuracy: 0.7781


# Bag of Words

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(df['text'])

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, df['target'], test_size=0.3, random_state=1)

In [64]:
rf_clf = RandomForestClassifier().fit(X_train, y_train)
predicted= rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, predicted))



Random Forest Accuracy: 0.76138353765324


In [54]:
tf=TfidfVectorizer()
text_tf = tf.fit_transform(df['text'])

In [55]:
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, df['target'], test_size=0.3, random_state=1)

In [56]:
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:", accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.7964098073555166


In [65]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score


In [66]:
rf_param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 6, 10],
    "min_samples_split": [5, 10],
    "min_samples_leaf" : [3, 6],
    "n_estimators" : [10, 30, 100]
}

In [67]:
import time
start = time.time()
rf_grid_search = GridSearchCV(rf_clf,rf_param_grid,cv=3)
rf_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(rf_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on Random Forest Classifier: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(rf_grid_search.best_params_))

Testing Accuracy: 77.5%
Total Runtime for Grid Search on Random Forest Classifier: 136.3 seconds

Optimal Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 100}


In [69]:
adaboost_clf = AdaBoostClassifier()
adaboost_clf.fit(X_train, y_train)
adaboost_mean_cv_score = np.mean(cross_val_score(adaboost_clf,X_train,y_train,cv=3))

print("Mean Cross Validation Score for AdaBoost: {:.4}%".format(adaboost_mean_cv_score * 100))

Mean Cross Validation Score for AdaBoost: 73.02%


In [70]:
adaboost_param_grid = {
    "n_estimators": [50, 100, 250],
    "learning_rate": [1.0, 0.5, 0.1]
}

In [71]:
adaboost_grid_search = GridSearchCV(adaboost_clf,adaboost_param_grid,cv=3)
adaboost_grid_search.fit(X_train, y_train)

print("Testing Accuracy: {:.4}%".format(adaboost_grid_search.best_score_ * 100))
print("Total Runtime for Grid Search on AdaBoost: {:.4} seconds".format(time.time() - start))
print("")
print("Optimal Parameters: {}".format(adaboost_grid_search.best_params_))

Testing Accuracy: 75.55%
Total Runtime for Grid Search on AdaBoost: 854.7 seconds

Optimal Parameters: {'learning_rate': 0.5, 'n_estimators': 250}


 Submission Stuff

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')

In [25]:
vectorizer2 = TfidfVectorizer()

In [26]:
df.head()

Unnamed: 0,keyword,location,text,target,length_of_tweet
0,earthquake,,Our Deeds are the Reason of this #earthquake M...,1,69
1,,,Forest fire near La Ronge Sask. Canada,1,38
2,,,All residents asked to 'shelter in place' are ...,1,133
3,,,"13,000 people receive #wildfires evacuation or...",1,65
4,,,Just got sent this photo from Ruby #Alaska as ...,1,88


In [27]:
tf_idf_data_train2 = vectorizer2.fit_transform(df.text)

tf_idf_data_test2 = vectorizer2.transform(test_df.text)

In [28]:
nb_classifier2 = MultinomialNB()

nb_classifier2.fit(tf_idf_data_train2, df.target)
nb_train_preds2 = nb_classifier2.predict(tf_idf_data_train2)
nb_test_preds2 = nb_classifier2.predict(tf_idf_data_test2)

In [29]:
rf_classifier2 = RandomForestClassifier(n_estimators=100)

rf_classifier2.fit(tf_idf_data_train2, df.target)
rf_train_preds2 = rf_classifier2.predict(tf_idf_data_train2)
rf_test_preds2 = rf_classifier2.predict(tf_idf_data_test2)

In [30]:
rf_test_preds2

array([1, 1, 0, ..., 1, 1, 0])

In [31]:
sample_submission['target'] = rf_test_preds2

In [35]:
sample_submission.set_index('id', inplace=True)

In [37]:
sample_submission.to_csv('submission0')