In [136]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
plt.style.use('ggplot')
% matplotlib inline

In [7]:
# Re-applies the 'ggplot' style; the import cell above already makes this same call.
plt.style.use('ggplot')

In [10]:
# Read the labeled-tweets CSV into a DataFrame and preview the first rows.
# Change the file location if needed.
df = pd.read_csv(
    '2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv'
)
df.head()

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,Not labeled,Not labeled,Not related
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,Not labeled,Not labeled,Not related
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,Not labeled,Not labeled,Not related
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,Not labeled,Not labeled,Not related
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,Not labeled,Not labeled,Not related


In [11]:
# Strip spaces from every column name (e.g. 'Tweet ID' -> 'TweetID') so the
# columns can be reached with attribute access such as df.TweetText.
col_dict = {name: name.replace(' ', '') for name in df.columns.values}
df.rename(columns=col_dict, inplace=True)

In [12]:
# Show the column names to confirm the spaces were removed by the rename.
df.columns.values

array(['TweetID', 'TweetText', 'InformationSource', 'InformationType',
       'Informativeness'], dtype=object)

In [13]:
# Class balance of the target label.
df['Informativeness'].value_counts()

Related and informative          685
Related - but not informative    268
Not related                      238
Not applicable                     9
Name: Informativeness, dtype: int64

In [14]:
# Drop the few rows labeled 'Not applicable', leaving three target classes.
df = df.loc[df['Informativeness'] != 'Not applicable']

In [15]:
# Hold out 30% of the tweets for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.TweetText,
    df.Informativeness,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Bag-of-words features: the vocabulary is learned from the training tweets
# only and then reused unchanged to encode the test tweets.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    count_features = count_vectorizer.get_feature_names_out()
else:
    count_features = count_vectorizer.get_feature_names()

# Dense, column-labeled copies of the sparse matrices. toarray() is the
# explicit equivalent of the `.A` shorthand.
count_train_df = pd.DataFrame(count_train.toarray(), columns=count_features)
count_test_df = pd.DataFrame(count_test.toarray(), columns=count_features)

In [69]:
# Multinomial naive Bayes on the raw term counts.
# The hyper-parameters are passed by keyword: scikit-learn 1.0+ makes the
# constructor arguments keyword-only, so MultinomialNB(0.1, False) raises
# TypeError there. The keyword form works on old and new releases alike.
nb = MultinomialNB(alpha=0.1, fit_prior=False)
nb.fit(count_train_df, y_train)
y_pred = nb.predict(count_test_df)

# Confusion matrix with rows = true label, columns = predicted label, both in
# the order given by `labels`.
cm = metrics.confusion_matrix(
    y_test,
    y_pred,
    labels=['Related and informative', 'Related - but not informative', 'Not related'],
)
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

0.7374301675977654

In [30]:
# Display the current value of `accuracy`.
# NOTE(review): the saved output of this cell differs from the value printed
# by the fitting cell above, so it appears to come from an earlier run with
# different settings -- re-run the notebook top to bottom to refresh it.
accuracy

0.7178770949720671

In [12]:
# Display the confusion matrix (rows = true label, columns = predicted label).
# NOTE(review): the saved matrix below gives 242 correct out of 358, which
# matches neither accuracy printed above -- it looks stale; re-run to refresh.
cm

array([[176,   6,   1],
       [ 45,  46,   0],
       [ 58,   6,  20]])

In [137]:
# TF-IDF features: vocabulary and IDF weights are learned from the training
# tweets only and then reused unchanged to encode the test tweets.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_features = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_features = tfidf_vectorizer.get_feature_names()

# Dense, column-labeled copies of the sparse matrices. toarray() is the
# explicit equivalent of the `.A` shorthand.
tfidf_train_df = pd.DataFrame(tfidf_train.toarray(), columns=tfidf_features)
tfidf_test_df = pd.DataFrame(tfidf_test.toarray(), columns=tfidf_features)

In [131]:
# Multinomial naive Bayes on the TF-IDF features.
# The hyper-parameters are passed by keyword: scikit-learn 1.0+ makes the
# constructor arguments keyword-only, so MultinomialNB(0.9, False) raises
# TypeError there. The keyword form works on old and new releases alike.
nb_tfidf = MultinomialNB(alpha=0.9, fit_prior=False)
nb_tfidf.fit(tfidf_train_df, y_train)
y_pred = nb_tfidf.predict(tfidf_test_df)

# Confusion matrix with rows = true label, columns = predicted label, both in
# the order given by `labels`.
cm = metrics.confusion_matrix(
    y_test,
    y_pred,
    labels=['Related and informative', 'Related - but not informative', 'Not related'],
)
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

0.7318435754189944

In [19]:
# Display the current value of `accuracy`.
# NOTE(review): the saved output of this cell differs from the value printed
# by the TF-IDF fitting cell above, so it appears to come from an earlier run
# with different settings -- re-run the notebook top to bottom to refresh it.
accuracy

0.55027932960893855

In [132]:
# Display the confusion matrix for the TF-IDF model
# (rows = true label, columns = predicted label).
cm

array([[171,  11,   1],
       [ 28,  62,   1],
       [ 42,  13,  29]], dtype=int64)