# A Broken Neural Network 

I could have tried to solve this challenge with a Random Forest / XGBoost approach, but I decided to use a neural network instead because I'm a complete deep learning newbie. Since I had never used that technique for regression problems, I thought it could be fun to give it a try, but after a few hours of hyperparameter tweaking and loss-function testing, my network still doesn't work (at all). Is the choice of loss function wrong? Is it the shape of the graph? Is it a standardization problem? Is it just the wrong approach for this kind of problem? I have no idea.

Do you folks know what went wrong? Any feedback is much appreciated!

## Loading & Cleaning

In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from geopy.distance import vincenty

In [None]:
# Read train.csv from the input directory into a DataFrame.
train = pd.read_csv('../input/train.csv')

In [None]:
# Read test.csv from the same input directory into a DataFrame.
test = pd.read_csv('../input/test.csv')

### Outliers

In [None]:
# Bounding box (longitude / latitude bounds) of the New York City area.
# Every bound must be a plain float: a trailing comma after a value would
# turn it into a one-element tuple and break range checks that use it.
min_lon = -74.844
min_lat = 40.026
max_lon = -72.221
max_lat = 41.372

In [None]:
initial_len = train.shape[0]

In [None]:
# ruling points outside area as outliers
train = train[train['pickup_longitude'].between(min_lon, max_lon)]
train = train[train['pickup_latitude'].between(min_lat, max_lat)]
train = train[train['dropoff_longitude'].between(min_lon, max_lon)]
train = train[train['dropoff_latitude'].between(min_lat, max_lat)]

In [None]:
cleaned_len = train.shape[0]

In [None]:
# outliers removed
initial_len - cleaned_len

### Compute distance

In [None]:
def get_distance(row):
    """Return the distance in meters between a row's pickup and dropoff points.

    Args:
        row: Mapping-like row providing 'pickup_latitude', 'pickup_longitude',
            'dropoff_latitude' and 'dropoff_longitude'.

    Returns:
        The `.meters` value of geopy's `vincenty` distance between the two
        (latitude, longitude) pairs.
    """
    pickup = row['pickup_latitude'], row['pickup_longitude']
    dropoff = row['dropoff_latitude'], row['dropoff_longitude']
    trip = vincenty(pickup, dropoff)
    return trip.meters

In [None]:
# Add a 'distance' column by calling get_distance on every row (axis=1 passes rows).
train['distance'] = train.apply(get_distance, axis=1)

In [None]:
# Pickup coordinates of the first row still in the frame.
# Positional access (`iloc`) is used on purpose: boolean filtering keeps the
# original index labels, so a label lookup for 0 only works if the very first
# row of the raw data survived the outlier filter.
lon = train['pickup_longitude'].iloc[0]
lat = train['pickup_latitude'].iloc[0]

### NaN Correction

In [None]:
# Count the missing values in each column.
train.isnull().sum()

### Datetime

In [None]:
# Parse both datetime columns with pandas so they hold datetime values.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

In [None]:
# Derive hour-of-day and day-of-week features from the pickup time.
# The vectorized `.dt` accessors yield the same values as calling `.hour` /
# `.weekday()` on every element, without one Python call per row.
train['pickup_hour'] = train['pickup_datetime'].dt.hour
train['pickup_day'] = train['pickup_datetime'].dt.weekday  # Monday=0 ... Sunday=6

In [None]:
# Replace each datetime with the float returned by its `timestamp()` method,
# so both columns become plain numeric features.
train['pickup_datetime'] = train['pickup_datetime'].apply(lambda x: x.timestamp())
train['dropoff_datetime'] = train['dropoff_datetime'].apply(lambda x: x.timestamp())

In [None]:
# Encode the categorical flag as integer category codes.
# `pd.get_dummies` returns one indicator column per category, so its result
# cannot be stored in a single column; category codes keep it to one column
# (categories are numbered in sorted order, missing values become -1).
train['store_and_fwd_flag'] = train['store_and_fwd_flag'].astype('category').cat.codes

### Irrelevant features

In [None]:
# Drop the 'id' column (axis=1 selects columns); it is not used as a feature.
train = train.drop('id', axis=1)

## Exploration

### Distance / Trip Duration

In [None]:
from scipy.stats import pearsonr

In [None]:
# Pearson correlation between trip distance and trip duration;
# scipy returns the coefficient together with its p-value.
pearsonr(train['distance'], train['trip_duration'])

## Neural Network (MLP)

In [None]:
import keras

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, LSTM, Merge
from keras.optimizers import SGD
from keras.wrappers.scikit_learn import KerasRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

In [None]:
X = train.drop('trip_duration', axis=1)
y = train['trip_duration']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Standardize the target to zero mean and unit variance.
# sklearn's `normalize` rescales each *row* to unit norm by default; applied to
# the single row holding every target, it divided all of them by the norm of
# the whole target vector and squashed them to almost zero. `.values` is used
# because current pandas Series have no `reshape` method. The (1, n_samples)
# shape is kept so the later `y_norm.T` still yields one row per sample.
y_norm = ((y - y.mean()) / y.std()).values.reshape(1, -1)

In [None]:
# Shape of the training part of the feature split.
X_train.shape

In [None]:
# Shape of the transposed target array (this is the form passed to cross-validation).
y_norm.T.shape

In [None]:
def baseline(input_dim=12):
    """Build and compile the fully connected regression network.

    Architecture: Dense(48, relu) -> Dense(12, relu) -> Dense(1, linear),
    compiled with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_dim: Number of input features. Defaults to 12, the value that
            was previously hard-coded, so existing zero-argument callers
            behave as before.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Dense(48, input_dim=input_dim, activation='relu', kernel_initializer='normal'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='linear', kernel_initializer='normal'))
    # Accuracy is a classification metric and says nothing about a continuous
    # target, so report mean absolute error alongside the MSE loss instead.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

In [None]:
# Wrap the Keras model builder as a scikit-learn regressor and chain it after
# feature standardization in a single pipeline.
regressor = KerasRegressor(build_fn=baseline, epochs=5, batch_size=50, verbose=1)
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', regressor),
]
pipeline = Pipeline(estimators)

In [None]:
kfold = KFold(n_splits=5)
results = cross_val_score(pipeline, X, y_norm.T, cv=kfold)

In [None]:
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))