In [2]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
plt.style.use('ggplot')
% matplotlib inline

In [3]:
# Select the 'ggplot' stylesheet for subsequent figures.
# The import cell already makes this same call, so this cell only repeats it.
plt.style.use(style='ggplot')

In [4]:
# Read the labelled 2012 Colorado wildfire tweets into a DataFrame.
df = pd.read_csv(
    '2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv'  # change the file location if needed
)
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,Not labeled,Not labeled,Not related
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,Not labeled,Not labeled,Not related
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,Not labeled,Not labeled,Not related
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,Not labeled,Not labeled,Not related
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,Not labeled,Not labeled,Not related


In [5]:
# Map every column label to the same label with its spaces removed, so that
# columns can be reached by attribute access (e.g. df.TweetText).
col_dict = {old_feature: old_feature.replace(' ', '') for old_feature in df.columns.values}
# Apply the mapping to df itself rather than creating a renamed copy.
df.rename(columns=col_dict, inplace=True)

In [6]:
# Display the column labels to confirm the spaces were stripped by the rename.
df.columns.values

array(['TweetID', 'TweetText', 'InformationSource', 'InformationType',
       'Informativeness'], dtype=object)

In [7]:
# Count how many tweets carry each Informativeness label (the classification target).
df['Informativeness'].value_counts()

Related and informative          685
Related - but not informative    268
Not related                      238
Not applicable                     9
Name: Informativeness, dtype: int64

In [8]:
# Keep only the tweets whose label is something other than 'Not applicable'.
df = df.loc[df['Informativeness'] != 'Not applicable']

In [9]:
# Hold out 30% of the tweets for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['TweetText'],
    df['Informativeness'],
    test_size=0.3,
    random_state=42,
)

In [10]:
# Bag-of-words features with English stop words removed. The vocabulary is
# learned from the training tweets only and then reused for the test tweets,
# so both matrices share the same columns.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate get_feature_names_out().
try:
    feature_names = count_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = count_vectorizer.get_feature_names()

# Densify with .toarray() rather than the `.A` shorthand, which newer SciPy
# sparse matrices no longer provide.
count_train_df = pd.DataFrame(count_train.toarray(), columns=feature_names)
count_test_df = pd.DataFrame(count_test.toarray(), columns=feature_names)

In [12]:
# Baseline: logistic regression with default settings, trained on the word counts.
lr = LogisticRegression().fit(count_train_df, y_train)
y_pred = lr.predict(count_test_df)

# Score the predictions on the held-out tweets.
accuracy = metrics.accuracy_score(y_test, y_pred)
# Rows are true labels and columns are predicted labels, both in the order of `labels`.
cm = metrics.confusion_matrix(
    y_test,
    y_pred,
    labels=['Related and informative', 'Related - but not informative', 'Not related'],
)

In [13]:
# Accuracy of the baseline logistic regression on the held-out test tweets.
accuracy

0.73743016759776536

In [14]:
# Confusion matrix of the baseline model: rows are true labels, columns are predicted labels,
# ordered 'Related and informative', 'Related - but not informative', 'Not related'.
cm

array([[168,  11,   4],
       [ 31,  53,   7],
       [ 34,   7,  43]])

In [16]:
from sklearn.model_selection import GridSearchCV

# Tune the solver and the class weighting with a cross-validated grid search
# on the training set.
# random_state is pinned because the 'liblinear', 'sag' and 'saga' solvers
# shuffle the data; without a seed the selected model and the scores below can
# change from run to run. 42 matches the seed used for the train/test split.
model = LogisticRegression(random_state=42)
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'class_weight': (None, 'balanced'),
}
# NOTE(review): 'sag'/'saga' may not converge within the default iteration
# budget on unscaled counts — check for ConvergenceWarning and raise max_iter if so.
classifier = GridSearchCV(model, parameters)
classifier.fit(count_train_df, y_train)

# With GridSearchCV's default refit=True, predict() uses the best parameter
# combination refit on the whole training set.
y_pred_gs = classifier.predict(count_test_df)
# Rows are true labels and columns are predicted labels, both in the order of `labels`.
cm_gs = metrics.confusion_matrix(
    y_test,
    y_pred_gs,
    labels=['Related and informative', 'Related - but not informative', 'Not related'],
)
accuracy_gs = metrics.accuracy_score(y_test, y_pred_gs)

In [17]:
# Accuracy of the grid-searched logistic regression on the held-out test tweets.
accuracy_gs

0.75977653631284914

In [18]:
# Confusion matrix of the grid-searched model: rows are true labels, columns are predicted labels,
# ordered 'Related and informative', 'Related - but not informative', 'Not related'.
cm_gs

array([[160,  13,  10],
       [ 25,  56,  10],
       [ 20,   8,  56]])