# Bayes

In [20]:
import re
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from statistics import mean

In [2]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Trying things raw - no preprocessing

In [5]:
tfidf_vectorizer = TfidfVectorizer()

In [6]:
train_vectors = tfidf_vectorizer.fit_transform(train_df["text"])
test_vectors = tfidf_vectorizer.transform(test_df["text"])

In [7]:
nb = MultinomialNB()

In [8]:
scores = cross_val_score(nb, train_vectors, train_df["target"], cv=10, scoring="f1")

In [9]:
np.mean(scores)

0.6289311798416697

## Using some tokenization

In [10]:
train_texts = train_df["text"].tolist(); train_labs = train_df["target"].tolist()
tok_texts = []; tok_test_texts = []

In [11]:
number_re = re.compile(r"(?:\d+[,\.\d]*)?\d")
punct_re = re.compile(r"[!@#\$%\^&\*\(\)\-_\+=\{\}\[\]:;\"',<\.>\\/\?]")
multi_space_re = re.compile(r"\s\s+")
hashtag_re = re.compile(r"#[^\s]+")

In [12]:
for text in train_texts:
    text = multi_space_re.sub(" ", number_re.sub("", punct_re.sub("", hashtag_re.sub("", text.lower())))).strip()
    tok_texts.append(text)

In [13]:
tok_texts[:5]

['our deeds are the reason of this may allah forgive us all',
 'forest fire near la ronge sask canada',
 'all residents asked to shelter in place are being notified by officers no other evacuation or shelter in place orders are expected',
 'people receive evacuation orders in california',
 'just got sent this photo from ruby as smoke from pours into a school']

In [14]:
for text in test_df["text"].tolist():
    text = multi_space_re.sub(" ", number_re.sub("", punct_re.sub("", hashtag_re.sub("", text.lower())))).strip()
    tok_test_texts.append(text)

In [15]:
train_df["tokenized"] = tok_texts
test_df["tokenized"] = tok_test_texts

In [16]:
tfidf_vectorizer = TfidfVectorizer()
train_vectors = tfidf_vectorizer.fit_transform(train_df["tokenized"])
test_vectors = tfidf_vectorizer.transform(test_df["tokenized"])

In [17]:
nb = MultinomialNB()
scores = cross_val_score(nb, train_vectors, train_df["target"], cv=10, scoring="f1")

In [18]:
scores

array([0.62752294, 0.52929293, 0.6056338 , 0.50929368, 0.63194444,
       0.56171735, 0.61482821, 0.56214149, 0.72425249, 0.74304419])

In [21]:
mean(scores)

0.6109671527754281