### Importing Libraries

In [169]:
#data stuff
import pandas as pd

#Modelling
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction import text
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

#NLP
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import re
import gensim

#other
from tqdm import tqdm_notebook
from datetime import datetime, timedelta

### Cleaning Data for Modeling
- Specifying date range and road for model 
    - `start_day` and `end_day` and should be set here. 
- Formatting X to Input in Vectorizer
    - model is built on one day of data 
    - data is broken down by hour
    - all tweets from that hour are concatenated together
    
#### Data Dictionary
|Feature|Type|Dataset|Description|
|---|---|---|---|
|time|Object (datetime)|data2|Time of tweet (EDT), formatted yyyy-mm-dd hh:00:00+00:00; covers time span of hh:00 to hh:59|
|tweets|Object (string)|data2|Concatenated tweets pulled from traffic sites during that hour|  

- Formatting Y to Train X data
    - Y is built off of manually input values from reliable traffic twitter sites
    - Data is entered by hour 
    - 6 Major roadways represented in both directions

#### Data Dictionary
|Feature|Type|Dataset|Description|
|---|---|---|---|
|date|Object (datetime)|closed|Time of tweet (EDT), formatted yyyy-mm-dd hh:00:00+00:00; covers time span of hh:00 to hh:59|
|1-95 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|1-95 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|95 Express North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|95 Express South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|I-195 East|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|I-195 West|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|SR 826 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|SR 826 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 
|US-1 North|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)|  
|US-1 South|int|closed|Staus of major roadway in direction indicated (0 = no lane closures or incidents for duration of hour, 1 = any type of lane closure or incident during hour duration)| 

- Merging data2 and closed2 such that only hours with data remain

In [118]:
start_day = 23 #  Day refers to a day in July 2019
end_day = 30 # Range is inclusive (end day is included in data)

In [119]:
def clean_X(day_read):
    data = pd.read_csv('Datasets/timeloop_'+day_read+'.csv')
    data.drop(columns=['User','User_ID','Geo'], inplace = True)
    data['Date'] = pd.to_datetime(data['Date'])
    data['Date'] = data.assign(Date=data['Date'].dt.round('H'))['Date']
    times = []
    times.append(data['Date'][0])

    for i in tqdm_notebook(data.index):
        time = data['Date'][i]
        if time != times[len(times)-1]:
            times.append(time)
        
    dic = {'time': [], 'tweets': []}

    for hour in tqdm_notebook(times):
        total = ''
        tweets = list(data[data['Date'] == hour]['Tweet'])
        for twit in tweets:
            total += str(twit)
        dic['time'].append(hour)
        dic['tweets'].append(total)
    data2 = pd.DataFrame.from_dict(dic)

    data2.set_index('time', inplace=True)
    data2.sort_index(inplace=True)
    return data2

In [120]:
data_X = pd.DataFrame()

for day in range(start_day, end_day+1):
    cleaned_X = clean_X('2019-07-' + str(day))
    data_X = pd.concat([cleaned_X, data_X])

HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=13), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




In [121]:
data_X.to_csv('./Datasets/data_X_all_tweets')

In [139]:
closed = pd.read_csv('Datasets/manual_y - PDT.csv')
closed.rename({'Unnamed: 0': 'Date'}, axis=1, inplace=True)
closed['Date'] = pd.to_datetime(closed['Date'], utc=True) + timedelta(hours=7)
# added UTC=True to make formatting match X table
closed.set_index('Date', inplace=True)
closed.sort_index(inplace=True)

In [140]:
model_data = pd.concat([data_X, closed], axis=1, join='outer').dropna()

### Creating Model
- Train/Test Split (skipped for small initial dataset)
- Create Pipeline
- Gridsearch Parameters

In [186]:
model_data['SR 836 East'].value_counts()

0    69
1    48
Name: SR 836 East, dtype: int64

In [187]:
X = model_data['tweets']
y = model_data['SR 836 East']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [181]:
model_data.columns

Index(['tweets', 'I-95 North', 'I-95 South', '95 Express North',
       '95 Express South', 'I-195 East', 'I-195 West', 'SR 826 North',
       'SR 826 South', 'SR 836 East', 'SR 836 West', 'US-1 North',
       'US-1 South'],
      dtype='object')

#### Multinomial Naive Bayes

Best Model Performance: 
- Train score = 0.87
- Test score = 0.83

In [145]:
pipe = Pipeline([
    ('vec', TfidfVectorizer(token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*')),
    ('nb', MultinomialNB())
])
pipe_params = {
    'vec__stop_words': ['english'],
    'vec__max_features': [2000],
    'vec__min_df': [1],
    'vec__max_df': [.6],
    'vec__ngram_range': [(2,5)],
    'nb__alpha': [.6]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.7471264367816092
train score: 0.8735632183908046
test score: 0.8333333333333334


{'nb__alpha': 0.6,
 'vec__max_df': 0.6,
 'vec__max_features': 2000,
 'vec__min_df': 1,
 'vec__ngram_range': (2, 5),
 'vec__stop_words': 'english'}

#### Random Forest

Best Model Performance: 
- Train score = 0.93
- Test score = 0.8

In [165]:
pipe = Pipeline([
    ('vec', TfidfVectorizer(token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*')),
    ('rf', RandomForestClassifier())
])

pipe_params = {
    'vec__stop_words': ['english'],
    'vec__max_features': [1000],
    'vec__min_df': [1],
    'vec__max_df': [.5],
    'vec__ngram_range': [(2,5)],
    'rf__n_estimators': [3],
    'rf__max_depth' : [6],
    'rf__min_samples_split' : [.18],
    'rf__criterion' : ['gini'],
    'rf__min_samples_leaf' : [1],
    'rf__max_features' : [.88]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, n_jobs=2, verbose=10)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  5.8min
[Parallel(n_jobs=2)]: Done   7 out of   9 | elapsed: 11.8min remaining:  3.4min
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed: 13.8min remaining:    0.0s
[Parallel(n_jobs=2)]: Done   9 out of   9 | elapsed: 13.8min finished


cvs: 0.7816091954022989
train score: 0.9655172413793104
test score: 0.7666666666666667


{'rf__criterion': 'gini',
 'rf__max_depth': 6,
 'rf__max_features': 0.88,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 0.18,
 'rf__n_estimators': 3,
 'vec__max_df': 0.5,
 'vec__max_features': 800,
 'vec__min_df': 1,
 'vec__ngram_range': (2, 5),
 'vec__stop_words': 'english'}

#### Breaking out TFIDF 
- Vectorizing tweets is time consuming
- Tuning models to high performing TFIDF parameters first
- Run pipe param with vectorizer after other model parameters are tuned

In [189]:
tfidf = TfidfVectorizer(
    token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*',
    stop_words='english',
    max_features=2000,
    min_df=1,
    max_df=.6,
    ngram_range=(2,5)
)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

#### SVC

Best Model Performance: 
- Train score = 1.0, .99
- Test score = 0.9, .87

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, 
                                                        stratify=y, 
                                                        random_state=42)
grid_params = {
    'kernel' : ['poly'],
    'gamma' : [450, 500, 550],
    'C' : [0.002, 0.003, 0.004],
    'degree' : [1]
}

gs = GridSearchCV(SVC(), param_grid=grid_params, cv=3)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

cvs: 0.8505747126436781
train score: 1.0
test score: 0.9


{'C': 0.004, 'degree': 1, 'gamma': 550, 'kernel': 'poly'}

In [177]:
pipe = Pipeline([
    ('vec', TfidfVectorizer(token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*')),
    ('svc', SVC())
])

pipe_params = {
    'vec__stop_words': ['english'],
    'vec__max_features': [1000, 2000],
    'vec__min_df': [1, 2],
    'vec__max_df': [.5, .6],
    'vec__ngram_range': [(2,5), (3,5)],
    'svc__kernel' : ['poly'],
    'svc__gamma' : [800],
    'svc__C' : [0.009],
    'svc__degree' : [1]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, n_jobs=2, verbose=10)
gs.fit(X_train, y_train)
print('cvs:', gs.best_score_)
print('train score:', gs.score(X_train, y_train))
print('test score:', gs.score(X_test, y_test))
gs.best_params_

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed: 10.0min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 14.0min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed: 22.2min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 28.9min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 38.6min
[Parallel(n_jobs=2)]: Done  48 out of  48 | elapsed: 48.4min finished


cvs: 0.8275862068965517
train score: 1.0
test score: 0.8666666666666667


{'svc__C': 0.009,
 'svc__degree': 1,
 'svc__gamma': 800,
 'svc__kernel': 'poly',
 'vec__max_df': 0.5,
 'vec__max_features': 1000,
 'vec__min_df': 2,
 'vec__ngram_range': (2, 5),
 'vec__stop_words': 'english'}

### Running SVC on all major roads, both directions
- Using parameters that performed best for I-95 North and performed well on several other roads
- `predictions_df` is a matrix which outputs the status of each road for hours that data exists
- `performances_df` records the baseline accuracy and the train accuracy and the test accuracy

In [192]:
roads = ['I-95 North', 'I-95 South', '95 Express North',
       '95 Express South', 'I-195 East', 'I-195 West', 'SR 826 North',
       'SR 826 South', 'SR 836 East', 'SR 836 West', 'US-1 North',
       'US-1 South']

In [229]:
X = model_data['tweets']

# predictions_df = pd.DataFrame()
# performances_df = pd.DataFrame(
#     index=['baseline acc', 'train_acc', 'test_acc'])

roads = ['SR 826 North',
       'SR 826 South', 'SR 836 East', 'SR 836 West', 'US-1 North',
       'US-1 South']

for road in roads: 
    print(road)
    performance = []
    
    # adding baseline to performance
    values = model_data[road].value_counts()
    baseline = values[0]/values.sum()
    if baseline<0.5:
        baseline = values[1]/values.sum()
    performance.append(baseline)
    print(baseline)
    
    # vectorizing 
    tfidf = TfidfVectorizer(
        token_pattern='[a-zA-z]+ | [A-Za-z]+\-*\d+\W(?:[sS]outh|[Nn]orth|East|West|[NSEW]{1,2}|[nswe]{1,2})*',
        stop_words='english',
        max_features=2000,
        min_df=1,
        max_df=.6,
        ngram_range=(2,5)
    )
    X_vec = tfidf.fit_transform(X)
    
    # train, test, split
    y = model_data[road]
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, 
                                                        stratify=y, 
                                                        random_state=42)
    
    # modeling
    svc = SVC(kernel='poly', gamma=550, C=0.004, degree=1)
    svc.fit(X_train, y_train)
    
    train = svc.score(X_train, y_train)
    performance.append(train)
    test = svc.score(X_test, y_test)
    performance.append(test)
    print('train score:', train)
    print('test score:', test)
    
    performances_df[road] = performance
    
    preds = svc.predict(X_vec)
    preds_df = pd.DataFrame(preds, index=X.index)
    predictions_df = pd.concat([predictions_df,preds_df], axis=1)

SR 826 North
0.7350427350427351
train score: 0.9425287356321839
test score: 0.7333333333333333
SR 826 South
0.6581196581196581
train score: 1.0
test score: 0.7333333333333333
SR 836 East
0.5897435897435898
train score: 0.9655172413793104
test score: 0.7333333333333333
SR 836 West
0.7264957264957265
train score: 0.9540229885057471
test score: 0.7333333333333333
US-1 North
0.8632478632478633
train score: 0.9540229885057471
test score: 0.8666666666666667
US-1 South
0.7350427350427351
train score: 0.9310344827586207
test score: 0.7


In [231]:
predictions_df.columns = ['I-95 North', 'I-95 South', '95 Express North',
       '95 Express South', 'I-195 East', 'SR 826 North',
       'SR 826 South', 'SR 836 East', 'SR 836 West', 'US-1 North',
       'US-1 South']

In [232]:
predictions_df

Unnamed: 0,I-95 North,I-95 South,95 Express North,95 Express South,I-195 East,SR 826 North,SR 826 South,SR 836 East,SR 836 West,US-1 North,US-1 South
2019-07-22 11:00:00+00:00,1,0,0,0,0,0,0,0,0,0,0
2019-07-22 12:00:00+00:00,1,0,0,0,0,0,0,1,0,0,0
2019-07-22 13:00:00+00:00,1,0,0,0,0,0,0,0,0,0,0
2019-07-22 14:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 15:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 16:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 17:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 18:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 19:00:00+00:00,0,0,0,0,0,0,0,0,0,0,0
2019-07-22 20:00:00+00:00,0,0,0,1,0,0,0,0,0,0,0


In [233]:
performances_df

Unnamed: 0,I-95 North,I-95 South,95 Express North,95 Express South,I-195 East,SR 826 North,SR 826 South,SR 836 East,SR 836 West,US-1 North,US-1 South
baseline acc,0.692308,0.692308,0.623932,0.863248,0.82906,0.735043,0.65812,0.589744,0.726496,0.863248,0.735043
train_acc,1.0,0.988506,0.91954,0.942529,0.988506,0.942529,1.0,0.965517,0.954023,0.954023,0.931034
test_acc,0.9,0.866667,0.766667,0.866667,0.866667,0.733333,0.733333,0.733333,0.733333,0.866667,0.7


In [None]:
performances_df.to_csv()

### Importing Tweets from traffic sources only

In [None]:
data = pd.read_csv('Datasets/timeloop_'+day_read+'.csv')
data.drop(columns=['User','User_ID','Geo'], inplace = True)
data['Date'] = pd.to_datetime(data['Date'])
data['Date'] = data.assign(Date=data['Date'].dt.round('H'))['Date']
times = []
times.append(data['Date'][0])

for i in tqdm_notebook(data.index):
    time = data['Date'][i]
    if time != times[len(times)-1]:
        times.append(time)
        
dic = {'time': [], 'tweets': []}

for hour in tqdm_notebook(times):
    total = ''
    tweets = list(data[data['Date'] == hour]['Tweet'])
    for twit in tweets:
        total += str(twit)
    dic['time'].append(hour)
    dic['tweets'].append(total)
data2 = pd.DataFrame.from_dict(dic)

data2.set_index('time', inplace=True)
data2.sort_index(inplace=True)