In [1]:
import pandas as pd
import nltk
import numpy as np
import regex as re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import RegexpTokenizer
# from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

  import pandas.util.testing as tm


In [2]:
# Load the prepared modeling dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name has no '.csv' extension — confirm it matches the file on disk.
df_text_query = pd.read_csv('./Datasets/modeling_csv')

### Setting up data for modeling

In [3]:
X = df_text_query['text']
y = df_text_query['is_road_closed']

In [4]:
X.dtypes

dtype('O')

In [5]:
y.dtypes

dtype('int64')

In [6]:
## Split the data into training and test sets.
# 33% of the rows are held out for testing. stratify=y keeps the class
# proportions of the target the same in both splits, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [7]:
## Baseline: class counts in the test set.
# Always predicting the majority class is correct for that class's share of
# the rows, so every model's accuracy below should be judged against that share.
y_test.value_counts()

0    2892
1     103
Name: is_road_closed, dtype: int64

In [8]:
y_test.value_counts(normalize=True)

0    0.965609
1    0.034391
Name: is_road_closed, dtype: float64

## CountVectorization/Logistic Regression Model

In [9]:
# Pipeline: bag-of-words counts feeding a logistic regression.
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('logreg', LogisticRegression(solver = 'lbfgs'))
])

# Grid searched over the vectorizer settings and the regularization penalty
# (2 * 2 * 4 * 2 * 2 * 2 = 128 combinations, each fitted once per CV fold).
# NOTE(review): the string 'none' is only accepted by older scikit-learn
# releases; newer ones expect penalty=None — confirm against the installed version.
params = {
    'cvec__stop_words' : [None, 'english'],
    'logreg__penalty' : ['none','l2'],
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}

In [10]:
# Exhaustive grid search over `params` for the CountVectorizer + LogisticRegression pipeline.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score (accuracy for classifiers). The target is heavily imbalanced
# (about 3.4% positives in the test split), so accuracy can favour models that
# miss closed roads — consider scoring='recall' or scoring='f1'.
gs = GridSearchCV(pipe, # what object are we optimizing?
                  params, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [11]:
# Fit GridSearch to training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [12]:
# Save best model as gs_model.
gs_model = gs.best_estimator_

In [13]:
# Score model on training set.
gs_model.score(X_train, y_train)

0.9993421052631579

In [14]:
# Score model on testing set.
gs_model.score(X_test, y_test)

0.9863105175292154

In [15]:
preds = gs.best_estimator_.predict(X_test)

#### Look at the coefficients of CountVectorization/Logistic Regression Model

In [16]:
# Pull the fitted logistic-regression weights and the vectorizer's vocabulary
# terms out of the best pipeline found by the grid search.
# coef_[0] takes the first (here: only) row of weights for the binary target.
coefs = gs.best_estimator_.named_steps['logreg'].coef_[0]
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
features = gs.best_estimator_.named_steps['cvec'].get_feature_names()

In [17]:
coef_df = pd.DataFrame({'features' : features,
             'coefficients': coefs})

In [18]:
coef_df.sort_values('coefficients', ascending = False)

Unnamed: 0,features,coefficients
3567,road,2.615889
3569,road closures,1.678941
1107,closures,1.657915
2147,hwy,1.527241
661,avenue,1.439073
...,...,...
2464,la,-0.688243
3887,south near,-0.722229
1538,escondido,-0.738734
3105,open,-0.782942


#### Evaluation and Confusion Matrix of CountVectorization/Logistic Regression Model

In [19]:
def nice_conmat(y_test, preds, classes):
    """Print the accuracy score and return a labelled confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels, forwarded to ``confusion_matrix`` and ``accuracy_score``.
    preds : array-like
        Predicted labels for the same rows as ``y_test``.
    classes : sequence of str
        Display names for the classes; there must be one entry per row/column
        of the confusion matrix, in the same order.

    Returns
    -------
    pandas.DataFrame
        The confusion matrix with rows labelled ``'Actual <class>'`` and
        columns labelled ``'Predicted <class>'``.
    """
    matrix = confusion_matrix(y_test, preds)

    accuracy = accuracy_score(y_test, preds)
    print(f'Accuracy Score: {accuracy}')

    # Column labels are built before row labels, matching the original order.
    column_labels = ['Predicted ' + name for name in classes]
    row_labels = ['Actual ' + name for name in classes]
    return pd.DataFrame(matrix, columns=column_labels, index=row_labels)

nice_conmat(y_test, preds, ['open', 'is_road_closed'])

Accuracy Score: 0.9863105175292154


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2886,6
Actual is_road_closed,35,68


In [20]:
# Derive the four counts from the predictions instead of hand-copying them
# from the printed confusion matrix, so the metrics below stay correct when
# the data, split or model changes.
# For a binary target, ravel() flattens the 2x2 matrix row by row, i.e. in the
# order TN, FP, FN, TP. tolist() converts the counts to plain Python ints.
TN, FP, FN, TP = confusion_matrix(y_test, preds).ravel().tolist()

In [21]:
# Recall / Sensitivity for CountVect/LogReg
TP / (TP + FN)

0.6601941747572816

In [22]:
# Specificity for CountVect/LogReg
TN / (FP + TN)

0.9979253112033195

## TFIDF/Logistic Regression Model

In [23]:
# Pipeline: TF-IDF weighted term features feeding a logistic regression.
pipe_v2 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver = 'lbfgs'))
])

# Only the penalty is searched here; the vectorizer keeps its default settings.
# NOTE(review): the string 'none' is only accepted by older scikit-learn
# releases; newer ones expect penalty=None — confirm against the installed version.
params_2 = {
    'logreg__penalty' : ['none','l2'],
}

In [24]:
gs_2 = GridSearchCV(pipe_v2, # what object are we optimizing?
                  params_2, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [25]:
# Fit GridSearch to training data.
gs_2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [26]:
# Save the best TF-IDF / logistic-regression pipeline as gs_2_model.
gs_2_model = gs_2.best_estimator_

In [27]:
# Score model on training set.
gs_2_model.score(X_train, y_train)

1.0

In [28]:
# Score model on testing set.
gs_2_model.score(X_test, y_test)

0.9853088480801335

In [29]:
preds_tfidf = gs_2.best_estimator_.predict(X_test)

In [30]:
def nice_conmat(y_test, preds_tfidf, classes):
    """Print the accuracy score and return the confusion matrix as a DataFrame.

    NOTE(review): this redefines the `nice_conmat` helper declared earlier in
    the notebook with the same behavior; only the parameter name differs.

    Rows are labelled 'Actual <class>' and columns 'Predicted <class>', using
    the names in `classes` in the order given.
    """
    counts = confusion_matrix(y_test, preds_tfidf)

    print(f'Accuracy Score: {accuracy_score(y_test, preds_tfidf)}')

    predicted_headers = ['Predicted ' + label for label in classes]
    actual_headers = ['Actual ' + label for label in classes]
    return pd.DataFrame(counts, columns=predicted_headers, index=actual_headers)

nice_conmat(y_test, preds_tfidf, ['open', 'is_road_closed'])

Accuracy Score: 0.9853088480801335


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2874,18
Actual is_road_closed,26,77


In [31]:
# Derive the four counts from the TF-IDF model's predictions instead of
# hand-copying them from the printed confusion matrix, so the recall,
# specificity and precision cells below cannot drift out of sync with the model.
# For a binary target, ravel() flattens the 2x2 matrix row by row, i.e. in the
# order TN, FP, FN, TP. tolist() converts the counts to plain Python ints.
TN_tfidf, FP_tfidf, FN_tfidf, TP_tfidf = (
    confusion_matrix(y_test, preds_tfidf).ravel().tolist()
)

In [32]:
# Recall / Sensitivity for TFIDF/LogReg: share of actually-closed roads the model caught.
TP_tfidf / (TP_tfidf + FN_tfidf)

0.7475728155339806

In [33]:
# Specificity for TFIDF/LogReg
TN_tfidf / (FP_tfidf + TN_tfidf)

0.9937759336099585

In [34]:
#Precision = TruePositives / (TruePositives + FalsePositives)
TP_tfidf / (TP_tfidf + FP_tfidf)

0.8105263157894737

## Filtering True Positives

In [35]:
df_text_query

Unnamed: 0.1,Unnamed: 0,username,text,hashtags,query,is_road_closed
0,0,EPCF #GreenNewDeal #SunriseMovement,our hearts go out to californians affected by ...,"['wildfires', 'ClimateChange', 'ClimateCrisis'...",saddleridgefire,0
1,1,Jason Singson,"from the #saddleridgefire to the #kincadefire,...","['SaddleridgeFire', 'KincadeFire']",saddleridgefire,0
2,2,Brandy Grote,our #saddleridgefire leaped a 12 lane major fr...,['SaddleridgeFire'],saddleridgefire,0
3,3,Mark In SoCal,"good morning, sam!\nall is well here. however,...",['SaddleridgeFire'],saddleridgefire,0
4,4,LVTIA,was your property or home damaged by the #sadd...,['SaddleRidgeFire'],saddleridgefire,0
...,...,...,...,...,...,...
9070,9175,Claudia Peschiutta,former state lawmaker says hes got a plan to c...,[],KNX1070,0
9071,9176,Eden,totally makes sense to do this midday,[],KNX1070,0
9072,9177,KNX 1070 NEWSRADIO,"a sanford, maine mom says her 9-year-old birac...",[],KNX1070,0
9073,9178,Andy,"i noticed it in anaheim hills, it looks crazy!",[],KNX1070,0


In [36]:
len(preds_tfidf)

2995

In [37]:
x_test_indices = list(X_test.index)

In [38]:
x_test_df = df_text_query.loc[x_test_indices, :]
x_test_df.head()

Unnamed: 0.1,Unnamed: 0,username,text,hashtags,query,is_road_closed
8334,8433,KNX 1070 NEWSRADIO,watch video of alanis morissette sharing her n...,[],KNX1070,0
6077,6162,LACoFD Incident Alerts,they’re engines equipped with cafs (compressed...,[],LACoFD911,0
7186,7271,Brian Douglas,brush fire burning in the ventura riverbed has...,['101fwy'],KNX1070,0
1724,1724,AJ Tenney,lost out on two days of work due to the #tickf...,['TickFire'],tickfire,0
394,394,AI6YR,#tickfire some confusion over the 23121 coltra...,"['TickFire', 'SaddleRidgeFire']",saddleridgefire,0


In [39]:
# Attach the TF-IDF model's predictions to the test rows. preds_tfidf was
# predicted from X_test and x_test_df was selected with X_test's index, so the
# predictions line up with the rows in the same order.
x_test_df['y_preds'] = preds_tfidf

In [40]:
# Boolean masks over the test rows: actually closed, and predicted closed.
road_closed_true = (x_test_df['is_road_closed'] == 1)
road_closed_pred = (x_test_df['y_preds'] == 1)

# True positives: closed roads that the model also flagged as closed.
X_test_true_pos = x_test_df[road_closed_true & road_closed_pred]

In [41]:
X_test_true_pos.iloc[40:60]

Unnamed: 0.1,Unnamed: 0,username,text,hashtags,query,is_road_closed,y_preds
6572,6657,Go511,"sigalert in fountain valley \ni-405 north, jus...","['1', '2']",Go511,1,1
4405,4462,Caltrans District 7,wb 210 from grand ave to citrus ave - the # 4 ...,['Glendorapic'],CaltransDist7,1,1
4388,4445,Caltrans District 7,all caltrans planned lane closures for constru...,[],CaltransDist7,1,1
9020,9125,scott burt,#118fwy eb yosemite ave off-ramp remains close...,"['118Fwy', 'SimiValley', 'LATraffic', 'KNXTraf...",KNX1070,1,1
6378,6463,Go511,sigalert in bellflower\ni-605 south / sr-91 ea...,['2'],Go511,1,1
4036,4092,Caltrans District 7,on the eb 118 freeway in simi valley - the eb ...,[],CaltransDist7,1,1
6356,6441,Go511,sigalert in norwalk \ni-605 north / rosecrans ...,[],Go511,1,1
4412,4469,The Local Malibu,brush fire | fs125 #calabasas | sb 101 freeway...,"['Calabasas', 'OakFire', 'LACoFD']",CaltransDist7,1,1
6456,6541,Go511,sigalert in los angeles \ni-10 east / south v...,[],Go511,1,1
6579,6664,Go511,traffic alert in manhattan beach\nall lanes ar...,[],Go511,1,1


In [43]:
X_test_true_pos['text']

7060    #tickfire update\n\nsr-14 south between agua d...
7616    #sigalert!!! sb #605fwy just past peck rd cras...
6929    tick fire\nall lanes closed in both directions...
3440    nb i-5 all lanes scheduled to be closed norwal...
3589    nb & sb i-5 scheduled to be closed from artesi...
                              ...                        
8157    #sigalert continues for the eb #60fwy past hac...
8509    fire in #elsereno continues to burn and cause ...
6641    closure alert \nall lanes are closed in both d...
6836    60 swarm \nrepaving project lane and ramp clos...
6487    traffic alert in pomona\nsr-60 west at hwy 71/...
Name: text, Length: 77, dtype: object

In [44]:
# False negatives: roads that are actually closed but that the model predicted as open.
# The prediction mask selects y_preds == 0, so it is named for what it holds
# (predicted open) instead of rebinding `road_closed_pred` to the opposite meaning.
road_closed_true = (x_test_df['is_road_closed'] == 1)
road_open_pred = (x_test_df['y_preds'] == 0)

X_test_false_neg = x_test_df[road_closed_true & road_open_pred]

In [45]:
X_test_false_neg.head(20)

Unnamed: 0.1,Unnamed: 0,username,text,hashtags,query,is_road_closed,y_preds
8414,8514,Brian Douglas,the #fullertonfire is now burning at the brea ...,['FullertonFire'],KNX1070,1,0
6625,6710,Go511,traffic alert in topanga\npacific coast hwy so...,[],Go511,1,0
8265,8364,scott burt,@lafd pounced on a #brushfire in the #brentwoo...,"['brushfire', 'Brentwood', 'LATraffic', 'KNXTr...",KNX1070,1,0
6418,6503,Go511,sigalert in moreno valley \nsr-60 w/b & e/b at...,[],Go511,1,0
8283,8382,scott burt,@lafd pounced on a #brushfire in the #brentwoo...,"['brushfire', 'Brentwood', 'LATraffic', 'KNXTr...",KNX1070,1,0
6846,6931,Go511,sigalert in hacienda heights\nsr-60 west / hac...,[],Go511,1,0
6979,7064,Go511,"sigalert\nla county, castaic \ni-5 north near ...",[],Go511,1,0
8672,8774,Brian Douglas,#sigalert!!! #santaana sb #5fwy at 17th st you...,"['SIGALERT', 'SantaAna', '5fwy', 'knxtraffic']",KNX1070,1,0
6899,6984,Go511,traffic alert #easyfire\nsr-118 between madera...,['EasyFire'],Go511,1,0
4056,4113,Stephanie Ferguson,when will sepulveda be opened?? the fire is no...,[],CaltransDist7,1,0


In [47]:
X_test_false_neg['text']

8414    the #fullertonfire is now burning at the brea ...
6625    traffic alert in topanga\npacific coast hwy so...
8265    @lafd pounced on a #brushfire in the #brentwoo...
6418    sigalert in moreno valley \nsr-60 w/b & e/b at...
8283    @lafd pounced on a #brushfire in the #brentwoo...
6846    sigalert in hacienda heights\nsr-60 west / hac...
6979    sigalert\nla county, castaic \ni-5 north near ...
8672    #sigalert!!! #santaana sb #5fwy at 17th st you...
6899    traffic alert #easyfire\nsr-118 between madera...
4056    when will sepulveda be opened?? the fire is no...
7058    traffic alert in pasadena\ni-210 west at sierr...
7034    traffic alert in norwalk \nsr-91 west at stude...
4260    santa clarita road closures can be found at ht...
8434    #sigalert!!! continues for the nb #57fwy where...
4283    7:45 am update #tickfire closures: nb sr-14 is...
9015    the #14fwy remains closed in both directions b...
6446    traffic alert in los angeles\ni-5 north at mai...
3485    remind

#### A Type I error (false positive) is a result indicating that a given condition is present when it actually is not. In our model, a Type I error means predicting that a road is closed when it is actually open. In practice this would steer cars and people away from a safe escape route. A low Type I error rate means fewer routes are incorrectly flagged as closed, which keeps the maximum number of escape routes available.

#### A Type II error (false negative) is a result indicating that a given condition is not present when it actually is. In our model, a Type II error means predicting that a road is open when it is actually closed. We want to minimize Type II errors with this model: sending an evacuee into a dangerous situation could be catastrophic, and it would also increase traffic. Unfortunately, our data and modeling indicate a relatively high level of false negatives.

## CountVectorization/Decision Tree Model

In [48]:
## Pipeline and parameters for model
# Bag-of-words counts feeding a single decision tree (seeded for reproducibility).
pipe_dt = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=42)),
])

# Most vectorizer settings are pinned to a single value; the search varies the
# vocabulary size, the minimum document frequency and the tree's size limits.
params_dt = {
    'cvec__stop_words': ['english'],
    'cvec__max_features': [2000, 4000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [0.95],
    'cvec__ngram_range': [(1, 2)],
    'dt__max_depth': [5, 10],
    'dt__min_samples_split': [5, 10],
    'dt__min_samples_leaf': [2, 3, 5],
}


In [49]:
## Model gridsearch
gs_dt = GridSearchCV(pipe_dt, # what object are we optimizing?
                  params_dt, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [50]:
# Fit GridSearch to training data.
gs_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [51]:
gs_dt.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.95,
                                 max_features=2000, min_df=3,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('dt',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=10, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                            

In [52]:
# What was the cross-validated score of the above decision tree?
gs_dt.best_score_

0.9773026315789474

In [53]:
# Score model on training set.
gs_dt.score(X_train, y_train)

0.9901315789473685

In [54]:
# Score model on testing set.
gs_dt.score(X_test, y_test)

0.9742904841402337

In [55]:
preds_dt = gs_dt.best_estimator_.predict(X_test)

In [56]:
## Model confusion matrix
def nice_conmat(y_test, preds_dt, classes):
    """Print the accuracy score and return a labelled confusion-matrix DataFrame.

    NOTE(review): this redefines the `nice_conmat` helper declared earlier in
    the notebook with the same behavior; only the parameter name differs.
    """
    cm = confusion_matrix(y_test, preds_dt)

    score = accuracy_score(y_test, preds_dt)
    print(f'Accuracy Score: {score}')

    frame_labels = {
        'columns': ['Predicted ' + cls for cls in classes],
        'index': ['Actual ' + cls for cls in classes],
    }
    return pd.DataFrame(cm, **frame_labels)

nice_conmat(y_test, preds_dt, ['open', 'is_road_closed'])

Accuracy Score: 0.9742904841402337


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2872,20
Actual is_road_closed,57,46


## TFIDF/Decision Trees Model

In [57]:
## Pipeline and parameters for model
# TF-IDF features feeding a single decision tree (seeded for reproducibility).
pipe_tfidf_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state = 42))])

# Tree hyper-parameters to search; the TF-IDF step keeps its defaults.
# This has to be a plain dict: a trailing comma after the closing brace turns
# it into a one-element tuple, which GridSearchCV's parameter validation in
# newer scikit-learn releases does not accept as a param_grid.
params_tfidf_dt = {
    'dt__max_depth': [5, 10],
    'dt__min_samples_split': [5, 10],
    'dt__min_samples_leaf': [2, 3, 5]
}

In [58]:
## Model gridsearch
gs_tfidf_dt = GridSearchCV(pipe_tfidf_dt, # what object are we optimizing?
                  params_tfidf_dt, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [59]:
# Fit GridSearch to training data.
gs_tfidf_dt.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [60]:
# What was the cross-validated score of the above decision tree?
gs_tfidf_dt.best_score_

0.9728618421052632

In [61]:
# Score model on training set.
gs_tfidf_dt.score(X_train, y_train)

0.9896381578947369

In [62]:
# Score model on testing set.
gs_tfidf_dt.score(X_test, y_test)

0.9736227045075125

In [63]:
preds_tfidf_dt = gs_tfidf_dt.best_estimator_.predict(X_test)

In [64]:
## Model confusion matrix
def nice_conmat(y_test, preds_tfidf_dt, classes):
    """Print the accuracy score and return a labelled confusion-matrix DataFrame.

    NOTE(review): this redefines the `nice_conmat` helper declared earlier in
    the notebook with the same behavior; only the parameter name differs.
    """
    table = confusion_matrix(y_test, preds_tfidf_dt)

    print(f'Accuracy Score: {accuracy_score(y_test, preds_tfidf_dt)}')

    col_names = ['Predicted ' + entry for entry in classes]
    row_names = ['Actual ' + entry for entry in classes]
    result = pd.DataFrame(table, columns=col_names, index=row_names)
    return result

nice_conmat(y_test, preds_tfidf_dt, ['open', 'is_road_closed'])

Accuracy Score: 0.9736227045075125


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2873,19
Actual is_road_closed,60,43


## CountVectorization/Random Forest Model

In [65]:
## Pipeline and parameters for model
# Bag-of-words counts feeding a random forest (seeded for reproducibility).
pipe_rf = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))])

# 'sqrt' is what 'auto' meant for classifiers (sqrt(n_features) per split).
# 'auto' was deprecated and later removed from scikit-learn, so the explicit
# name keeps the same search working on both old and new releases.
rf_params = {
    'cvec__stop_words' : ['english'],
    'cvec__max_features': [2000, 4000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.95],
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators' : [100, 125],
    'rf__max_depth' : [None, 4],
    'rf__max_features' : [None,    ## all features at every split (bagged trees)
                          'sqrt']  ## random feature subset per split (random forest)
}
gs_rf = GridSearchCV(pipe_rf, rf_params, cv=5)
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                            

In [66]:
# What was the best cross-validated score of the CountVectorizer / random forest grid search?
gs_rf.best_score_

0.9819078947368421

In [67]:
# Score model on training set.
gs_rf.score(X_train, y_train)

1.0

In [68]:
# Score model on testing set.
gs_rf.score(X_test, y_test)

0.9846410684474124

In [69]:
preds_rf = gs_rf.best_estimator_.predict(X_test)

In [70]:
## Model confusion matrix
def nice_conmat(y_test, preds_rf, classes):
    """Print the accuracy score and return a labelled confusion-matrix DataFrame.

    NOTE(review): this redefines the `nice_conmat` helper declared earlier in
    the notebook with the same behavior; only the parameter name differs.
    """
    confusion = confusion_matrix(y_test, preds_rf)

    acc = accuracy_score(y_test, preds_rf)
    print(f'Accuracy Score: {acc}')

    return pd.DataFrame(
        confusion,
        columns=['Predicted ' + class_name for class_name in classes],
        index=['Actual ' + class_name for class_name in classes],
    )

nice_conmat(y_test, preds_rf, ['open', 'is_road_closed'])

Accuracy Score: 0.9846410684474124


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2885,7
Actual is_road_closed,39,64


## TFIDF/Random Forest Model

In [71]:
## Model pipeline
# TF-IDF features feeding a random forest (seeded for reproducibility).
pipe_tfidf_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))])

# Forest hyper-parameters to search; the TF-IDF step keeps its defaults.
# - This has to be a plain dict: a trailing comma after the closing brace turns
#   it into a one-element tuple, which GridSearchCV's parameter validation in
#   newer scikit-learn releases does not accept as a param_grid.
# - 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated and
#   later removed from scikit-learn.
params_tfidf_rf = {
    'rf__n_estimators' : [100, 125],
    'rf__max_depth' : [None, 4],
    'rf__max_features' : [None,    ## all features at every split (bagged trees)
                          'sqrt']  ## random feature subset per split (random forest)
}

In [72]:
## Model gridsearch
gs_tfidf_rf = GridSearchCV(pipe_tfidf_rf, # what object are we optimizing?
                  params_tfidf_rf, # what parameters values are we searching?
                  cv=5) # 5-fold cross-validation.

In [73]:
gs_tfidf_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                         

In [74]:
# What was the best cross-validated score of the TF-IDF / random forest grid search?
gs_tfidf_rf.best_score_

0.9827302631578947

In [75]:
# Score model on training set.
gs_tfidf_rf.score(X_train, y_train)

1.0

In [76]:
# Score model on testing set.
gs_tfidf_rf.score(X_test, y_test)

0.9819699499165275

In [77]:
preds_tfidf_rf = gs_tfidf_rf.best_estimator_.predict(X_test)

In [78]:
## Model confusion matrix
def nice_conmat(y_test, preds_tfidf_rf, classes):
    """Print the accuracy score and return a labelled confusion-matrix DataFrame.

    NOTE(review): this redefines the `nice_conmat` helper declared earlier in
    the notebook with the same behavior; only the parameter name differs.
    """
    cell_counts = confusion_matrix(y_test, preds_tfidf_rf)

    print(f'Accuracy Score: {accuracy_score(y_test, preds_tfidf_rf)}')

    predicted_axis = ['Predicted ' + item for item in classes]
    actual_axis = ['Actual ' + item for item in classes]
    return pd.DataFrame(cell_counts, columns=predicted_axis, index=actual_axis)

nice_conmat(y_test, preds_tfidf_rf, ['open', 'is_road_closed'])

Accuracy Score: 0.9819699499165275


Unnamed: 0,Predicted open,Predicted is_road_closed
Actual open,2884,8
Actual is_road_closed,46,57
