# What's New?
## In this version we add more feature engineering to the model. In particular, we include information extracted from the `pickup_datetime` column, which had been ignored so far.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import shutil
import os
# Show the contents of the parent and input directories, and the TensorFlow version in use.
print('../', os.listdir('../'))
print('../input', os.listdir("../input"))
print('tf version: ', tf.__version__)

# Any results you write to the current directory are saved as output.

In [None]:
# Read the first 200,000 training rows and the whole test file;
# `pickup_datetime` is parsed into datetimes while loading.
df = pd.read_csv(
    '../input/train.csv',
    nrows=200000,
    parse_dates=['pickup_datetime'],
)
test = pd.read_csv(
    '../input/test.csv',
    parse_dates=['pickup_datetime'],
)

In [None]:
# Preview the weekday names of the pickups.
# `dt.weekday_name` is deprecated (removed in pandas 1.0); `dt.day_name()` is its replacement.
df.pickup_datetime.dt.day_name().head()

In [None]:
#calculating the distance between the two points (pickup and dropoff) in km
from math import cos, asin, sqrt
def distance(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance in kilometres between two points.

    Works element-wise: every argument may be a scalar or an array-like of
    matching shape (NumPy arrays, pandas Series), so whole columns can be
    processed in a single call.

    Args:
        lat1, lon1: latitude / longitude of the first point, in degrees.
        lat2, lon2: latitude / longitude of the second point, in degrees.

    Returns:
        The distance in km (a NumPy float for scalar input, an array-like
        for array input). NaN inputs propagate to NaN outputs.
    """
    p = 0.017453292519943295     # pi / 180: degrees -> radians
    a = (0.5 - np.cos((lat2 - lat1) * p) / 2
         + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2)
    # Mathematically `a` lies in [0, 1]; clamp it so floating-point round-off
    # can never push sqrt/arcsin outside their domain.
    a = np.clip(a, 0.0, 1.0)
    return 12742 * np.arcsin(np.sqrt(a))  # 2 * R * asin(sqrt(a)), with R = 6371 km

In [None]:
# to include the daytime features
def add_feats(df):
    """Add the engineered features to ``df`` in place and return it.

    New columns:
        distance: result of ``distance(...)`` on the pickup / dropoff coordinates.
        hour:     hour of day taken from ``pickup_datetime``.
        weekday:  day of week taken from ``pickup_datetime`` (Monday = 0).

    Args:
        df: DataFrame with the four pickup/dropoff coordinate columns and a
            datetime ``pickup_datetime`` column. It is modified in place.

    Returns:
        The same DataFrame object, with the extra columns.
    """
    coord_cols = ['pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude']
    # Walk the rows positionally in a single pass. This avoids building one
    # DataFrame per row and does not depend on the index labels being 0..n-1.
    df['distance'] = [
        distance(plat, plon, dlat, dlon)
        for plat, plon, dlat, dlon in zip(*(df[col].values for col in coord_cols))
    ]
    df['hour'] = df.pickup_datetime.dt.hour
    df['weekday'] = df.pickup_datetime.dt.weekday
    return df

In [None]:
# Adds the distance / hour / weekday columns to `df` in place; the returned frame is displayed.
add_feats(df)

In [None]:
# Inspect the column dtypes, including the newly added feature columns.
df.dtypes

In [None]:
# Number of rows whose pickup_datetime is missing.
df['pickup_datetime'].isna().sum()

# Perform the cut
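## Keep only rides inside the coordinate bounding box with plausible fare, passenger count and distance (these thresholds can probably be tuned further)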

In [None]:
# Keep rows whose pickup and dropoff both fall inside the box
# longitude [-75, -72] x latitude [38, 42] (bounds inclusive), and whose
# fare, passenger count and distance pass the thresholds below.
dfc = df[
    df['pickup_longitude'].between(-75.0, -72)
    & df['pickup_latitude'].between(38, 42)
    & df['dropoff_longitude'].between(-75.0, -72)
    & df['dropoff_latitude'].between(38, 42)
    & df['fare_amount'].gt(2.5)
    & df['passenger_count'].gt(0)
    & df['passenger_count'].lt(7)
    & df['distance'].gt(0.2)
]

# Split the data into traindf and evaldf 


In [None]:
# Random ~80/20 split of the filtered data into training and validation sets.
# The fixed seed makes the split reproducible; `key` and `pickup_datetime`
# are dropped from both parts.
np.random.seed(1)
msk = np.random.rand(len(dfc)) < 0.8
traindf = dfc.loc[msk].drop(columns=['key', 'pickup_datetime'])
evaldf = dfc.loc[~msk].drop(columns=['key', 'pickup_datetime'])

In [None]:
# add_feats mutates `test` in place and returns it, so `testdf` and `test` are the same object here.
testdf = add_feats(test)

In [None]:
# Model inputs for prediction: the test frame without `key` and `pickup_datetime`
# (`test` itself keeps them; `key` is needed again for the submission).
testdf = test.drop(columns=['key', 'pickup_datetime'])

In [None]:
# Quick look at the engineered weekday feature in the training set.
traindf.weekday.head()

In [None]:
def build_model_columns(nbuckets = 10):
    """Create the feature columns for the wide (linear) and deep (DNN) parts.

    Args:
        nbuckets: number of boundaries generated per coordinate axis with
            ``np.linspace``; it also sizes the hash buckets of the location
            crosses (``nbuckets**2`` per location, ``nbuckets**4`` for the
            pickup/dropoff pair).

    Returns:
        A ``(wide_columns, deep_columns)`` tuple of feature-column lists.
    """
    fc = tf.feature_column

    # Plain numeric inputs.
    plon = fc.numeric_column('pickup_longitude')
    plat = fc.numeric_column('pickup_latitude')
    dlon = fc.numeric_column('dropoff_longitude')
    dlat = fc.numeric_column('dropoff_latitude')
    pcount = fc.numeric_column('passenger_count')
    dist = fc.numeric_column('distance')
    wday = fc.numeric_column('weekday')
    hour = fc.numeric_column('hour')

    # Identity-categorical views of the time features; these (not the numeric
    # versions above) are what the day x hour cross is built from.
    wday_b = fc.categorical_column_with_identity('weekday', num_buckets=7)
    hour_b = fc.categorical_column_with_identity('hour', num_buckets=24)

    # Bucketize the coordinates over latitude 38..42 and longitude -75..-72.
    lat_edges = np.linspace(38.0, 42.0, nbuckets).tolist()
    lon_edges = np.linspace(-75.0, -72.0, nbuckets).tolist()
    b_plat, b_dlat = (fc.bucketized_column(col, lat_edges) for col in (plat, dlat))
    b_plon, b_dlon = (fc.bucketized_column(col, lon_edges) for col in (plon, dlon))

    # Feature crosses: pickup cell, dropoff cell, pickup-dropoff pair, and day x hour.
    grid_size = nbuckets * nbuckets
    ploc = fc.crossed_column([b_plat, b_plon], grid_size)
    dloc = fc.crossed_column([b_dlat, b_dlon], grid_size)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets ** 4)
    day_hr = fc.crossed_column([hour_b, wday_b], 24 * 7)

    # Wide part: the crosses, the numeric weekday/hour, and the passenger count.
    wide_columns = [dloc, ploc, pd_pair, day_hr, wday, hour, pcount]

    # Deep part: 10-dimensional embeddings of the two big crosses plus the
    # raw coordinates and the distance.
    deep_columns = [
        fc.embedding_column(pd_pair, 10),
        fc.embedding_column(day_hr, 10),
        plat, plon, dlat, dlon, dist,
    ]
    return wide_columns, deep_columns

In [None]:
def build_estimator(model_dir, nbuckets = 10):
    """Build the wide-and-deep regressor.

    Args:
        model_dir: directory where the estimator stores checkpoints and summaries.
        nbuckets: coordinate bucket count forwarded to ``build_model_columns``
            (previously this argument was accepted but never passed on).

    Returns:
        A ``tf.estimator.DNNLinearCombinedRegressor`` using the wide columns
        for the linear part and the deep columns for the DNN part.
    """
    wide_columns, deep_columns = build_model_columns(nbuckets)
    hidden_units = [512, 256, 128, 64, 32, 4]

    # device_count={'GPU': 0} keeps the session on CPU; per the original
    # author's note, this model trains faster on CPU than on GPU.
    run_config = tf.estimator.RunConfig().replace(
        session_config=tf.ConfigProto(device_count={'GPU': 0}))
    return tf.estimator.DNNLinearCombinedRegressor(
        model_dir=model_dir,
        dnn_activation_fn=tf.nn.relu,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units,
        config=run_config)

In [None]:
# Directory used as the estimator's model_dir (wiped before each training run).
OUTDIR = './taxi_trained'

In [None]:
BATCH_SIZE = 512

# Training: every column except the label, shuffled, 100 passes over the data.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=traindf.drop(columns=['fare_amount']),
    y=traindf['fare_amount'],
    batch_size=BATCH_SIZE,
    num_epochs=100,
    shuffle=True,
)

# Evaluation: the same feature columns, one ordered pass in a single batch.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=evaldf.drop(columns=['fare_amount']),
    y=evaldf['fare_amount'],
    batch_size=len(evaldf),
    num_epochs=1,
    shuffle=False,
)

# Prediction: all test features, no labels, one ordered pass in a single batch.
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=testdf.copy(),
    y=None,
    batch_size=len(testdf),
    num_epochs=1,
    shuffle=False,
)

In [None]:
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

# TrainSpec.max_steps is documented as an int, so round the step count up to a
# whole number instead of passing the float produced by true division.
# 100 matches the num_epochs given to train_input_fn.
num_train_steps = int(np.ceil(100 * len(traindf) / BATCH_SIZE))

estimator = build_estimator(OUTDIR)


def rmse(labels, predictions):
    """Extra evaluation metric: root mean squared error of the predicted fares.

    The predictions are cast to float64 before being compared with the labels
    (presumably because the pandas labels are float64 - confirm).
    """
    pred_values = tf.cast(predictions['predictions'], tf.float64)
    return {'rmse': tf.metrics.root_mean_squared_error(labels, pred_values)}


estimator = tf.contrib.estimator.add_metrics(estimator, rmse)
train_spec = tf.estimator.TrainSpec(
    input_fn = train_input_fn,
    max_steps = num_train_steps)
eval_spec = tf.estimator.EvalSpec(
    input_fn = eval_input_fn,
    steps = None,
    start_delay_secs = 1, # start evaluating after N seconds
    throttle_secs = 10,  # evaluate every N seconds
)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


In [None]:
# Predict on the test set; the result is materialised with list() in the next cell.
predictions = estimator.predict(input_fn= predict_input_fn)

In [None]:
# Materialise all predictions into a list (one entry per test row).
predlist = list(predictions)

In [None]:
# Inspect the 'predictions' entry of the first result.
predlist[0].get('predictions')

In [None]:
# Pull the 'predictions' entry out of every result dict.
predval = [pred.get('predictions') for pred in predlist]

In [None]:
# Join the per-row prediction arrays into one flat array.
pconc = np.concatenate(predval)

In [None]:
# Preview the predictions reshaped into a single column.
pconc.reshape(-1,1) #made it!

In [None]:
# Preview the test keys reshaped into a single column.
test['key'].values.reshape(-1,1)

In [None]:
# Put keys (column 0) and predictions (column 1) side by side in one two-column array.
output = np.hstack((test['key'].values.reshape(-1,1),pconc.reshape(-1,1)))

In [None]:
# Display the combined key / prediction array.
output

In [None]:
# Build the submission frame: column 0 of `output` becomes `key`, column 1 becomes `fare_amount`.
dataset_output = pd.DataFrame({'key':output[:,0],'fare_amount':output[:,1]})

In [None]:
# Write the submission file to the current directory, without the index column.
dataset_output.to_csv('submission_file.csv', index = False)