In [None]:
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.api as sm
from tqdm import tqdm
import time

from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Seed NumPy's global random state.
np.random.seed(314)

In [None]:
# Columns present in train.csv that are not among the six targets studied in
# this notebook; they are dropped as soon as the training file is read.
other_columns = ['TimeFromFirstStop_p20', 'TimeFromFirstStop_p40', 'TimeFromFirstStop_p50', 'TimeFromFirstStop_p60', 'TimeFromFirstStop_p80',
                 'TotalTimeStopped_p40', 'TotalTimeStopped_p60', 'DistanceToFirstStop_p40', 'DistanceToFirstStop_p60']
train = pd.read_csv('../input/bigquery-geotab-intersection-congestion/train.csv').set_index('RowId').drop(columns=other_columns)
test = pd.read_csv('../input/bigquery-geotab-intersection-congestion/test.csv').set_index('RowId')
train_idxs = train.index
test_idxs = test.index

# The combined frame is split back into train/test with data.loc[train_idxs] and
# data.loc[test_idxs]. Label-based selection only recovers the original rows when
# every RowId is unique and no RowId occurs in both files, so fail loudly otherwise.
assert train_idxs.is_unique and test_idxs.is_unique, 'RowId must be unique within each file'
assert train_idxs.intersection(test_idxs).empty, 'train and test share RowId values'

# Stack train on top of test so that feature engineering is applied to both at once.
data = pd.concat([train, test], axis=0, join='outer')

In [None]:
target_vars = ['TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80']
cities = train.City.unique()
bins = list(range(0, 200, 10))

# One row of panels per target variable, one column per city. The grid is sized
# from the data instead of being hard-coded.
fig, ax = plt.subplots(nrows=len(target_vars), ncols=len(cities), figsize=(20,30), squeeze=False)
for j, city in enumerate(cities):
    # Filter the training rows once per city rather than once per panel.
    city_rows = train[train.City == city]
    for i, var in enumerate(target_vars):
        panel = ax[i, j]
        # Plain count histogram (no KDE); values outside `bins` are not drawn.
        # Axes.hist is used in place of the deprecated seaborn.distplot.
        panel.hist(city_rows[var].dropna(), bins=bins)
        panel.set_title(city)
        panel.set_xlabel(var)

Note that we have significant right skew. Also note that, except for TotalTimeStopped_p80, the majority of entries have values in the minimum bin, near zero. We won't commit to using a log transform on these variables yet, but throughout the EDA we may use the logarithm of TotalTimeStopped_p80 as a proxy for overall congestion. This will inform our feature engineering and selection decisions. For example, we can decide whether to merge two categories by comparing their relationships with the logarithm of TotalTimeStopped_p80.

# Feature Engineering
### EntryStreetName and ExitStreetName

In [None]:
# Percentage of missing entries per column, sorted in ascending order.
missing_counts = data.isna().sum().sort_values()
missing_counts * 100 / len(data)

We could impute the missing values of EntryStreetName and ExitStreetName with the mode of each respective variable, conditioned on its ExitHeading, EntryHeading, IntersectionId, and City. Both EntryStreetName and ExitStreetName are redundant when IntersectionId, EntryHeading, and ExitHeading are known. However, the type of roadway (street, road, parkway, etc.) may be predictive of traffic density, so we'll extract EntryRoadway and ExitRoadway from these variables. After doing so we will impute missing values. We could use the method described above, but it is quite time-consuming for such a large dataset. Instead, we'll simply impute missing values with the 'Other' category. Given that less than 1% of each variable is missing, this is not a significant loss of precision.
  
1. Compile a list of the common roadway types, and a map from shorthand names to full names
2. Group less common roadway types into a single 'Other' roadway type
3. Convert streetnames to roadway type
4. Impute missing values

In [None]:
# Candidate roadway types (long and short forms) searched for inside street names.
roadways = ['Street', 'Road', 'Boulevard', 'Avenue', 'Lane', 'Drive', 'Parkway', 'Place', 'Way', 
            'Circle', 'Highway', 'Pkwy', 'St', 'Connector', 'Broadway', 'Overpass', 'Ave', 'Square', 
            'Tunnel', 'Rd', 'Bld', 'Bridge', 'Expressway', 'Pike']
# Shorthand -> full name, so that both spellings end up in the same category.
to_longform = {'Rd': 'Road', 'Bld': 'Boulevard', 'Ave': 'Avenue', 'St': 'Street', 'Pkwy': 'Parkway'}

# Entry and exit street names pooled into one Series, missing values removed.
street_names = pd.concat([data['EntryStreetName'], data['ExitStreetName']], ignore_index=True).dropna()

# Print every distinct street name that contains none of the candidate roadway
# types (case-sensitive substring test). unique() keeps first-appearance order,
# so each name is printed once, in the order it is first encountered.
for street in street_names.unique():
    if not any(roadway in street for roadway in roadways):
        print(street)

1. After a quick inspection of the undetected values, we can conclude that there is not a significant roadway that we have not added to our list of roadway types. We can now move on to step 2.

**Implementation Note**: for loops are generally frowned upon when working with Pandas objects. However, the alternative in this case would be to use the *.apply()* method, which often has worse performance than a for loop. 

In [None]:
def to_roadway(StreetName):
    """Return the roadway type contained in a street name.

    The entries of the global `roadways` list are tried in order using a
    case-sensitive substring test; the first hit wins and is expanded to its
    full name when it is a key of the global `to_longform` mapping.

    Returns 'Other' for missing names and for names without any known type.
    """
    if pd.isnull(StreetName):
        return 'Other'
    matched = next((kind for kind in roadways if kind in StreetName), None)
    if matched is None:
        return 'Other'
    return to_longform.get(matched, matched)

# Roadway type of every non-missing entry/exit street name.
both_roadway = street_names.apply(to_roadway)

In [None]:
# Frequency of each roadway type, most common first.
plt.figure(figsize=(20,10))
# Pass the Series as `x=` explicitly: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x=both_roadway, order=both_roadway.value_counts().index);

In [None]:
# Roadway types kept as their own category; everything else becomes 'Other'.
roadways = ['Street', 'Avenue', 'Road', 'Boulevard', 'Drive', 'Parkway']
# Shorthand spellings of the kept roadway types.
to_longform = {'St': 'Street', 'Ave': 'Avenue', 'Rd': 'Road', 'Bld': 'Boulevard', 'Pkwy': 'Parkway'}
def to_roadway(StreetName):
    """Map a street name to one of the kept roadway types, or to 'Other'.

    Parameters
    ----------
    StreetName : str or missing value
        Raw entry/exit street name.

    Returns
    -------
    str
        An element of the global `roadways` list, or 'Other' when the name is
        missing or contains none of the kept roadway types.

    Matching is done in two passes:
    1. The full names in `roadways` are tried in order with a case-sensitive
       substring test; the first hit is returned.
    2. Otherwise the words of the name are compared with the shorthand keys of
       `to_longform`, so that e.g. 'Broad St' is grouped with 'Street' instead
       of falling into 'Other'.
    """
    if pd.isnull(StreetName):
        return 'Other'
    for roadway in roadways:
        if roadway in StreetName:
            return roadway
    # `roadways` holds full names only, so abbreviations have to be looked up
    # separately. Whole words are compared (not substrings) so that 'St' does not
    # match 'Stanley', and the leading word is skipped because a roadway type
    # follows the name proper (a leading 'St' is taken to mean 'Saint').
    words = StreetName.replace('.', ' ').replace(',', ' ').split()
    for word in words[1:]:
        longform = to_longform.get(word)
        if longform in roadways:
            return longform
    return 'Other'

2. The new *to_roadway* function groups the uncommon categories into a single 'Other' category. The countplot shows two sharp discontinuities: between Avenue and Road, and between Parkway and Highway. We choose to cut off at the latter since we'd like to maintain a distinction between a parkway and a drive, for example, since the difference in congestion between the two may be sharp. 

In [None]:
# Replace the raw street names by the roadway type of the entry and exit streets.
for side in ('Entry', 'Exit'):
    data[side + 'Roadway'] = data[side + 'StreetName'].apply(to_roadway)
data = data.drop(columns=['EntryStreetName', 'ExitStreetName'])

3. Converted entry and exit streetnames to their respective roadway type.

In [None]:
# Frequency of each roadway type for entry (left) and exit (right) streets.
fig, ax = plt.subplots(ncols=2, figsize=(20, 6))
# The column is named through `x=`/`data=`: recent seaborn releases interpret a
# positional first argument as `data`, not as the variable to count.
sns.countplot(x='EntryRoadway', data=data, order=data.EntryRoadway.value_counts().index, ax=ax[0]);
sns.countplot(x='ExitRoadway', data=data, order=data.ExitRoadway.value_counts().index, ax=ax[1]);

### EntryHeading and ExitHeading

In [None]:
# Compass headings in 45-degree steps, starting at East (0) and going through North (90).
headings = 'E NE N NW W SW S SE'.split()
radians_map = {heading: np.pi*i/4 for i, heading in enumerate(headings)}
degrees_map = {heading: i*45 for i, heading in enumerate(headings)}

# Entry and exit headings expressed in degrees.
entry_heading_degrees = data['EntryHeading'].map(degrees_map)
exit_heading_degrees = data['ExitHeading'].map(degrees_map)

In [None]:
# Signed change of heading in degrees, wrapped into the interval [-180, 180).
data['DeltaHeading'] = (exit_heading_degrees - entry_heading_degrees + 180) % 360 - 180
# log(1 + x) of the 80th-percentile stopped time, used as the congestion proxy in the EDA.
data['TotalTimeStopped_p80_log'] = np.log1p(data['TotalTimeStopped_p80'])

# Left: how often each turn angle occurs. Right: congestion proxy per turn angle.
fig, ax = plt.subplots(ncols=2, figsize=(22,6))
count_ax, box_ax = ax
turn_angles = sorted(data.DeltaHeading.unique())
sns.countplot(x='DeltaHeading', data=data, order=turn_angles, ax=count_ax);
sns.boxplot(x='DeltaHeading', y='TotalTimeStopped_p80_log', data=data, ax=box_ax);

Note that 90 corresponds to a left turn, and -90 corresponds to a right turn. We see that a sharp left turn (135) and a sharp right turn (-135) do not occur often and that they seem to have roughly the same distribution of TotalTimeStopped_p80_log, so we'll map sharp turns to normal turns. Soft turns (-45 and 45) occur frequently and their distribution over TotalTimeStopped_p80_log is distinct, so we'll leave them as is. "U-Turns" (-180) do not occur often and have a similar distribution over TotalTimeStopped_p80_log as left turns. This makes sense since in countries that drive on the right side of the road, a "U-turn" is equivalent to a very sharp left turn. 

If EntryHeading and DeltaHeading are known, then ExitHeading is redundant. This statement isn't true in the event of the two sharp turns and "U-Turns" that have been pooled, but in the vast majority of cases it holds. For the sake of memory savings, and since many machine learning models suffer from the presence of strongly correlated predictors, we drop ExitHeading from the data.

In [None]:
# Turn angles grouped by the label they are pooled into: sharp turns join the
# ordinary turn of the same side and the -180 degree turn is counted as a left turn.
turn_groups = {
    'L': (-180, 90, 135),
    'R': (-135, -90),
    'R_soft': (-45,),
    'S': (0,),
    'L_soft': (45,),
}
pooled_heading_map = {angle: label for label, angles in turn_groups.items() for angle in angles}

# Replace the numeric angle by its pooled label, then drop the exit heading.
data['DeltaHeading'] = data['DeltaHeading'].map(pooled_heading_map)
data = data.drop(columns=['ExitHeading'])

### Month

In [None]:
# Left: number of rows per month. Right: congestion proxy distribution per month.
fig, ax = plt.subplots(ncols=2, figsize=(22,6))
count_ax, violin_ax = ax
sns.countplot(x='Month', data=data, ax=count_ax);
sns.violinplot(x='Month', y='TotalTimeStopped_p80_log', data=data, ax=violin_ax);

It's difficult to see a pattern between months and TotalTimeStopped_p80_log, so we'll use them as a categorical, rather than ordinal, variable. 

### Hour

In [None]:
# Left: number of rows per hour of day. Right: congestion proxy distribution per hour.
fig, ax = plt.subplots(ncols=2, figsize=(22,6))
count_ax, box_ax = ax
sns.countplot(x='Hour', data=data, ax=count_ax);
sns.boxplot(x='Hour', y='TotalTimeStopped_p80_log', data=data, ax=box_ax);

In [None]:
# Absolute difference, in hours, between the hour of day and each assumed peak hour.
# NOTE(review): the difference is linear, not circular (hour 23 is 15 away from 8, not 9).
for column, peak_hour in (('dist_to_5pm', 17), ('dist_to_8am', 8)):
    data[column] = (data['Hour'] - peak_hour).abs()

We expect to see spikes in congestion around 8am and 5pm, so we create a variable representing distance to each of these times. Hour will be correlated with these two, but that correlation is complicated by the nature of the absolute value function. We'll leave hour as a categorical value for now. 

### Latitude and Longitude

In [None]:
# One row of panels per city: stopped time against longitude (left) and latitude (right).
fig, ax = plt.subplots(ncols=2, nrows=4, figsize=(20, 20))
# Only rows with a strictly positive log stopped time are plotted.
has_stop_time = data.TotalTimeStopped_p80_log > 0
for row, city in enumerate(data.City.unique()):
    city_data = data[(data.City == city) & has_stop_time]
    for col, coordinate in enumerate(('Longitude', 'Latitude')):
        sns.scatterplot(x=coordinate, y='TotalTimeStopped_p80', data=city_data, ax=ax[row, col]).set_title(city);

In [None]:
# Reference point (latitude, longitude) used as the centre of each city.
centers_data = [['Atlanta', 33.7490, -84.3880], ['Boston', 42.3601, -71.0589], ['Chicago', 41.8781, -87.6298], ['Philadelphia', 39.9509, -75.1575]]
centers = pd.DataFrame(centers_data, columns=['City', 'Latitude', 'Longitude']).set_index('City')

# Every city in the data needs a centre; without this check an unknown city would
# silently produce NaN distances in the vectorised computation below.
assert data['City'].isin(centers.index).all(), 'City without a centre in `centers`'

# Absolute offset of each row from its city centre, per coordinate. Looking the
# centre up with Series.map keeps the computation vectorised instead of calling a
# Python lambda once per row.
data['latitude_dist'] = (data['Latitude'] - data['City'].map(centers['Latitude'])).abs()
data['longitude_dist'] = (data['Longitude'] - data['City'].map(centers['Longitude'])).abs()

In [None]:
# The raw coordinates are no longer needed once the distance features exist.
data = data.drop(columns=['Latitude', 'Longitude'])

### Path
Path is similar to street, and we don't expect it to be predictive of congestion given the Roadway variables we created. 

In [None]:
# Remove the Path column from the feature set.
data = data.drop(columns='Path')

### City and IntersectionId
Both can be expected to be strongly predictive of congestion. Different cities share intersection ids, so we'll combine these two variables to keep each intersection unique across cities.

In [None]:
# Build a key that is unique across cities by appending the intersection id to the city name.
intersection_label = data['IntersectionId'].astype(str)
data['city_intersection'] = data['City'] + intersection_label
data = data.drop(columns='IntersectionId')

## Save

In [None]:
# The log-transformed target was only an EDA helper; remove it before saving.
data = data.drop(columns='TotalTimeStopped_p80_log')

In [None]:
# Predictors grouped by type.
num_vars = ['dist_to_5pm', 'dist_to_8am', 'latitude_dist', 'longitude_dist']
cat_vars = ['EntryRoadway', 'ExitRoadway', 'DeltaHeading', 'EntryHeading', 'Hour', 'Month', 'city_intersection']
bool_vars = ['Weekend']
vars_type_map = dict(num=num_vars, cat=cat_vars, bool=bool_vars)

# All predictors as one flat list: numeric first, then categorical, then boolean.
predictor_vars = []
for group in vars_type_map.values():
    predictor_vars.extend(group)

# The six target columns.
target_vars = [
    'TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80',
    'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80',
]

In [None]:
# Cast the categorical predictors to the pandas 'category' dtype.
cat_columns = vars_type_map['cat']
data[cat_columns] = data[cat_columns].astype('category')

# Split the combined frame back into its train and test rows and write each to CSV.
for row_ids, filename in ((train_idxs, 'train_processed.csv'), (test_idxs, 'test_processed.csv')):
    data.loc[row_ids].to_csv(filename)