### Importing Libraries

In [136]:
#data stuff
import pandas as pd

#Modelling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text
from sklearn.ensemble import RandomForestClassifier

#NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import gensim

#other
from tqdm import tqdm_notebook
from datetime import datetime, timedelta

### Cleaning Data for Modeling
- Specifying date range and road for model
    - Code below refers only to `day_read` and `road_dir`. 
    - `start_day`, `end_day` and `road_dir` should be set here. 
- Formatting X to Input in Vectorizer
    - model is built on one day of data 
    - data is broken down by hour
    - all tweets from that hour are concatenated together
    
#### Data Dictionary
|Feature|Type|Dataset|Description|
|---|---|---|---|
|time|Object (datetime)|data2|Time of tweet (EDT), formatted yyyy-mm-dd hh:00:00+00:00; covers time span of hh:00 to hh:59|
|tweets|Object (string)|data2|Concatenated tweets pulled from traffic sites during that hour|  

- Formatting Y to Train X data
    - Y is built off of manually input values from reliable traffic twitter sites
    - Data is entered by hour 
    - 6 Major roadways represented in both directions

#### Data Dictionary
|Feature|Type|Dataset|Description|
|---|---|---|---|
|date|Object (datetime)|closed|Time of tweet (EDT), formatted yyyy-mm-dd hh:00:00+00:00; covers time span of hh:00 to hh:59|
|1-95 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|1-95 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|95 Express North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|95 Express South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|I-195 East|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|I-195 West|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|SR 826 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|SR 826 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|US-1 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|US-1 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 

- Merging data2 and closed2 such that only hours with data remain

In [118]:
start_day = 23 #  Day refers to a day in July 2019
end_day = 30 # Range is inclusive (end day is included in data)
road_dir = 'I-95 North'

In [119]:
def clean_X(day_read):
    data = pd.read_csv('Datasets/timeloop_'+day_read+'.csv')
    data.drop(columns=['User','User_ID','Geo'], inplace = True)
    data['Date'] = pd.to_datetime(data['Date'])
    data['Date'] = data.assign(Date=data['Date'].dt.round('H'))['Date']
    times = []
    times.append(data['Date'][0])

    for i in tqdm_notebook(data.index):
        time = data['Date'][i]
        if time != times[len(times)-1]:
            times.append(time)
        
    dic = {'time': [], 'tweets': []}

    for hour in tqdm_notebook(times):
        total = ''
        tweets = list(data[data['Date'] == hour]['Tweet'])
        for twit in tweets:
            total += str(twit)
        dic['time'].append(hour)
        dic['tweets'].append(total)
    data2 = pd.DataFrame.from_dict(dic)

    data2.set_index('time', inplace=True)
    data2.sort_index(inplace=True)
    return data2

In [120]:
data_X = pd.DataFrame()

for day in range(start_day, end_day+1):
    cleaned_X = clean_X('2019-07-' + str(day))
    data_X = pd.concat([cleaned_X, data_X])

HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




In [121]:
data_X.to_csv('./Datasets/data_X_all_tweets')

In [139]:
closed = pd.read_csv('Datasets/manual_y - PDT.csv')
closed.rename({'Unnamed: 0': 'Date'}, axis=1, inplace=True)
closed['Date'] = pd.to_datetime(closed['Date'], utc=True) + timedelta(hours=7)
# added UTC=True to make formatting match X table
closed.set_index('Date', inplace=True)
closed.sort_index(inplace=True)

In [140]:
model_data = pd.concat([data_X, closed], axis=1, join='outer').dropna()

### Creating Model
- Train/Test Split (skipped for small initial dataset)
- Create Pipeline
- Gridsearch Parameters

In [142]:
model_data['I-95 North'].value_counts()

0    81
1    36
Name: I-95 North, dtype: int64

In [143]:
X = model_data['tweets']
y = model_data[road_dir]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

#### Multinomial Naive Bayes

Best Model Performance: 
- Train score = 0.87
- Test score = 0.83

In [145]:
pipe = Pipeline([
    ('vec', TfidfVectorizer(token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*')),
    ('nb', MultinomialNB())
])
pipe_params = {
    'vec__stop_words': ['english'],
    'vec__max_features': [2000],
    'vec__min_df': [1],
    'vec__max_df': [.6],
    'vec__ngram_range': [(2,5)],
    'nb__alpha': [.6]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.7471264367816092
train score: 0.8735632183908046
test score: 0.8333333333333334


{'nb__alpha': 0.6,
 'vec__max_df': 0.6,
 'vec__max_features': 2000,
 'vec__min_df': 1,
 'vec__ngram_range': (2, 5),
 'vec__stop_words': 'english'}

#### Random Forest

Best Model Performance: 
- Train score = 
- Test score = 

In [None]:
pipe = Pipeline([
    ('vec', TfidfVectorizer(token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*')),
    ('rf', RandomForestClassifier())
])

pipe_params = {
    'vec__stop_words': ['english'],
    'vec__max_features': [1000, 2000, 3000],
    'vec__min_df': [1, 2],
    'vec__max_df': [.5, .6],
    'vec__ngram_range': [(2,5), (3,5)],
    'rf__n_estimators': [3],
    'rf__max_depth' : [6],
    'rf__min_samples_split' : [.18],
    'rf__criterion' : ['gini'],
    'rf__min_samples_leaf' : [1],
    'rf__max_features' : [.88]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

#### Breaking out TFIDF 
- Vectorizing tweets is time consuming
- Tuning models to high performing TFIDF parameters first
- Run pipe param with vectorizer after other model parameters are tuned

In [None]:
tfidf = TfidfVectorizer(
    token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*',
    stop_words='english',
    max_features=2000,
    min_df=1,
    max_df=.6,
    ngram_range=(2,5)
)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [None]:
pipe = Pipeline(('rf', RandomForestClassifier()))

grid_params = {
    'rf__n_estimators': [3],
    'rf__max_depth' : [6],
    'rf__min_samples_split' : [.18],
    'rf__criterion' : ['gini'],
    'rf__min_samples_leaf' : [1],
    'rf__max_features' : [.88]
}

gs = GridSearchCV(pipe, param_grid=grid_params, cv=3)
gs.fit(X_train_vec, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train_vec, y_train))
print('test score:', gs.score(X_test_vec, y_test))
gs.best_params_