In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler

In [None]:
df = pd.read_csv('./boston/train.csv')
df.info()

In [None]:
X = df[['nox', 'rm', 'chas', 'dis', 'ptratio', 'lstat', 'rad']].values
y = df['medv'].values

In [None]:
print(X.shape)

In [None]:
print(X)

In [None]:
scaler = MinMaxScaler()
scaled_X = scaler.fit_transform(X)

In [None]:
print(scaled_X)

In [None]:
poly = PolynomialFeatures(2)

In [None]:
new_X = poly.fit_transform(scaled_X)

In [None]:
print(new_X.shape)

In [None]:
print(new_X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.1, random_state=40)

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Score: {}'.format(lr.score(X_test, y_test)))

# New York Taxi Cab

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./nyc/train.csv', nrows=100000)

In [None]:
df.info()

In [None]:
df.head()

In [3]:
X = df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']].values
y = df[['fare_amount']].values

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
linear_model = LinearRegression()

In [8]:
linear_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
y_pred = linear_model.predict(X_test)

In [10]:
from sklearn.metrics import mean_squared_error

In [11]:
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))

MSE: 105.96878801867155


In [None]:
print('Score: {}'.format(linear_model.score(X_test, y_test)))

In [12]:
_c = pd.DataFrame({'actuals': y_test.reshape(-1), 'predicted': y_pred.reshape(-1)})

In [13]:
_c.tail()

Unnamed: 0,actuals,predicted
9995,4.9,11.233749
9996,7.0,11.87342
9997,9.7,11.234805
9998,5.7,11.234372
9999,7.7,11.362982


## train a better model
### Engineer new features

In [14]:
df[['fare_amount']].describe()

Unnamed: 0,fare_amount
count,100000.0
mean,11.354652
std,9.716777
min,-44.9
25%,6.0
50%,8.5
75%,12.5
max,200.0


## limit the fare_amount to between 1 and 39 USD

In [15]:
new_df = df[df.fare_amount >= 1]

In [16]:
new_df[['fare_amount']].describe()

Unnamed: 0,fare_amount
count,99986.0
mean,11.356954
std,9.714608
min,2.5
25%,6.0
50%,8.5
75%,12.5
max,200.0


In [17]:
new_df = new_df[df.fare_amount < 39]

  """Entry point for launching an IPython kernel.


In [18]:
new_df[['fare_amount']].describe()

Unnamed: 0,fare_amount
count,97059.0
mean,10.121116
std,6.30629
min,2.5
25%,6.0
50%,8.1
75%,12.1
max,38.9


In [19]:
X = new_df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']].values
y = new_df[['fare_amount']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [20]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
print('Score: {}'.format(linear_model.score(X_test, y_test)))

MSE: 39.25882356889852
Score: 0.0006770765674232182


In [21]:
_c = pd.DataFrame({'actuals': y_test.reshape(-1), 'predicted': y_pred.reshape(-1)})

In [22]:
_c.head()

Unnamed: 0,actuals,predicted
0,6.1,10.062809
1,12.0,10.061406
2,12.5,10.062802
3,7.3,10.147191
4,5.7,10.063485


In [23]:
new_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,97059.0,97059.0,97059.0,97059.0,97059.0,97059.0
mean,10.121116,-72.505701,39.920732,-72.501626,39.925708,1.671962
std,6.30629,10.671218,6.208676,10.438647,6.19464,1.299321
min,2.5,-736.55,-74.00767,-84.654241,-74.00114,0.0
25%,6.0,-73.992165,40.735877,-73.991252,40.734989,1.0
50%,8.1,-73.981987,40.753121,-73.980171,40.753565,1.0
75%,12.1,-73.967915,40.767382,-73.964247,40.768304,2.0
max,38.9,40.787575,401.083332,40.851027,404.616667,6.0


In [24]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 8 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
dtypes: float64(5), int64(1), object(2)
memory usage: 6.7+ MB


## take date and time into consideration

In [25]:
new_df['_pickup_datetime'] = pd.to_datetime(new_df['pickup_datetime'])

In [26]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 9 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
_pickup_datetime     97059 non-null datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(2)
memory usage: 7.4+ MB


## extract interesting information from datetime

In [27]:
new_df['_month'] = new_df._pickup_datetime.dt.month

In [28]:
new_df['_day'] = new_df._pickup_datetime.dt.day

In [29]:
new_df['_dayofweek'] = new_df._pickup_datetime.dt.dayofweek

In [30]:
new_df['_hour'] = new_df._pickup_datetime.dt.hour

## extract direction of travel

In [31]:
new_df['_lon_diff'] = new_df['pickup_longitude'] - new_df['dropoff_longitude']
new_df['_lat_diff'] = new_df['pickup_latitude'] - new_df['dropoff_latitude']

In [32]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 15 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
_pickup_datetime     97059 non-null datetime64[ns, UTC]
_month               97059 non-null int64
_day                 97059 non-null int64
_dayofweek           97059 non-null int64
_hour                97059 non-null int64
_lon_diff            97059 non-null float64
_lat_diff            97059 non-null float64
dtypes: datetime64[ns, UTC](1), float64(7), int64(5), object(2)
memory usage: 11.8+ MB


# train our model again

In [33]:
X = new_df[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', '_month', '_day', '_dayofweek', '_hour', '_lon_diff', '_lat_diff']].values
y = new_df[['fare_amount']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [34]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
print('Score: {}'.format(linear_model.score(X_test, y_test)))

MSE: 39.23920125276473
Score: 0.0011765574112372335


# try to visualize

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
sns.scatterplot(x='_lon_diff', y='_lat_diff', data=new_df)

In [None]:
new_df = new_df[new_df._lon_diff > -100]

In [None]:
new_df = new_df[new_df._lat_diff < 50]

In [None]:
sns.scatterplot(x='_lon_diff', y='_lat_diff', data=new_df)

In [None]:
sns.scatterplot(x='_lon_diff', y='fare_amount', data=new_df)

In [None]:
sns.scatterplot(x='_lat_diff', y='fare_amount', data=new_df)

In [None]:
sns.distplot(df['fare_amount'])

In [None]:
sns.boxplot(df['fare_amount'])

In [None]:
sns.catplot(y='fare_amount', x='_month', kind='box', data=new_df)

In [None]:
sns.catplot(y='fare_amount', x='_dayofweek', kind='box', data=new_df)

In [None]:
sns.catplot(y='fare_amount', x='_hour', kind='box', data=new_df)

In [None]:
sns.countplot(x='_hour', data=new_df)

In [None]:
new_df.info()

In [None]:
sns.countplot(x='_month', data=new_df)

In [None]:
new_df.head()

# convert all categorical variables

In [35]:
month_df = pd.get_dummies(new_df['_month'], prefix='_month', drop_first=True)

In [36]:
new_df = pd.concat([new_df, month_df], axis=1)

In [37]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 26 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
_pickup_datetime     97059 non-null datetime64[ns, UTC]
_month               97059 non-null int64
_day                 97059 non-null int64
_dayofweek           97059 non-null int64
_hour                97059 non-null int64
_lon_diff            97059 non-null float64
_lat_diff            97059 non-null float64
_month_2             97059 non-null uint8
_month_3             97059 non-null uint8
_month_4             97059 non-null uint8
_month_5             97059 non-null uint8
_month_6             97059 non-null uint8
_month_7             

In [38]:
new_df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,_pickup_datetime,_month,...,_month_3,_month_4,_month_5,_month_6,_month_7,_month_8,_month_9,_month_10,_month_11,_month_12
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15 17:26:21+00:00,6,...,0,0,0,1,0,0,0,0,0,0
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05 16:52:16+00:00,1,...,0,0,0,0,0,0,0,0,0,0
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18 00:35:00+00:00,8,...,0,0,0,0,0,1,0,0,0,0
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21 04:30:42+00:00,4,...,0,1,0,0,0,0,0,0,0,0
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09 07:51:00+00:00,3,...,1,0,0,0,0,0,0,0,0,0


In [39]:
X = new_df.drop(['fare_amount', 'key', 'pickup_datetime', '_pickup_datetime'], axis=1).values
y = new_df[['fare_amount']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
print('Score: {}'.format(linear_model.score(X_test, y_test)))

MSE: 39.195163605618085
Score: 0.0022975240191754898


# repeat for _day

In [40]:
day_df = pd.get_dummies(new_df['_day'], prefix='_day', drop_first=True)

# repeat for _dayofweek

In [41]:
dow_df = pd.get_dummies(new_df['_dayofweek'], prefix='_dayofweek', drop_first=True)

# repeat for _hour

In [42]:
hour_df = pd.get_dummies(new_df['_hour'], prefix='_hour', drop_first=True)

In [43]:
new_df = pd.concat([new_df, day_df, dow_df, hour_df], axis=1)

In [44]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 85 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
_pickup_datetime     97059 non-null datetime64[ns, UTC]
_month               97059 non-null int64
_day                 97059 non-null int64
_dayofweek           97059 non-null int64
_hour                97059 non-null int64
_lon_diff            97059 non-null float64
_lat_diff            97059 non-null float64
_month_2             97059 non-null uint8
_month_3             97059 non-null uint8
_month_4             97059 non-null uint8
_month_5             97059 non-null uint8
_month_6             97059 non-null uint8
_month_7             

# train a model again

In [45]:
X = new_df.drop(['fare_amount', 'key', 'pickup_datetime', '_pickup_datetime'], axis=1).values
y = new_df[['fare_amount']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))
print('Score: {}'.format(linear_model.score(X_test, y_test)))

MSE: 39.02413080565588
Score: 0.006651118501219467


* we have more data but the model isn't improving because it's too simple. But, we don't want to try and fit a polynomial feature, so let's try a deep neural network

In [46]:
import tensorflow as tf
import numpy as np

In [59]:
feature_columns = []

# numeric columns
pickup_lon = tf.feature_column.numeric_column('pickup_longitude', dtype=tf.dtypes.float64)
dropoff_lon = tf.feature_column.numeric_column('dropoff_longitude', dtype=tf.dtypes.float64)
pickup_lat = tf.feature_column.numeric_column('pickup_latitude', dtype=tf.dtypes.float64)
dropoff_lat = tf.feature_column.numeric_column('dropoff_latitude', dtype=tf.dtypes.float64)

lat_diff = tf.feature_column.numeric_column('_lat_diff', dtype=tf.dtypes.float64)
lon_diff = tf.feature_column.numeric_column('_lon_diff', dtype=tf.dtypes.float64)

# categorical columns
month = tf.feature_column.categorical_column_with_identity('_month', num_buckets=13)
day = tf.feature_column.categorical_column_with_identity('_day', num_buckets=32)
dayofweek = tf.feature_column.categorical_column_with_identity('_dayofweek', num_buckets=7)
hour = tf.feature_column.categorical_column_with_identity('_hour', num_buckets=24)

# bucketized columns
b_pickup_lat = tf.feature_column.bucketized_column(pickup_lat, np.linspace(38.0, 42.0, 60 * 4).tolist())
b_pickup_lon = tf.feature_column.bucketized_column(pickup_lon, np.linspace(-76.0, -72.0, 60 * 4).tolist())
b_dropoff_lat = tf.feature_column.bucketized_column(dropoff_lat, np.linspace(38.0, 42.0, 60 * 4).tolist())
b_dropoff_lon = tf.feature_column.bucketized_column(dropoff_lon, np.linspace(-76.0, -72.0, 60 * 4).tolist())

# crossed columns
dayofweek_x_hour = tf.feature_column.crossed_column([dayofweek, hour], 7 * 24)
b_p_lat_x_b_p_lon = tf.feature_column.crossed_column([b_pickup_lat, b_pickup_lon], (60 * 4) ** 2)
b_d_lat_x_b_d_lon = tf.feature_column.crossed_column([b_dropoff_lat, b_dropoff_lon], (60 * 4) ** 2)
b_pickup_x_b_dropoff = tf.feature_column.crossed_column([b_p_lat_x_b_p_lon, b_d_lat_x_b_d_lon], (60 * 4) ** 4)



feature_columns = [pickup_lon, dropoff_lon, pickup_lat, dropoff_lat, month, day, dayofweek, hour]
dense_columns = [b_pickup_lat, b_pickup_lon, b_dropoff_lat, b_dropoff_lon]
for i in [month, day, dayofweek, hour, dayofweek_x_hour, b_p_lat_x_b_p_lon, b_d_lat_x_b_d_lon]:
    dense_columns.append(tf.feature_column.embedding_column(i, 64))

In [60]:
# we need a new way of getting data into the model
def df_to_dataset(df, columns, shuffle=True, batch_size=64):
  df = df.copy()
  labels = df.pop('fare_amount')
  features_df = df[columns]
  ds = tf.data.Dataset.from_tensor_slices( (dict(features_df), labels) )
  if shuffle:
    ds = ds.shuffle(buffer_size=len(df))
  ds = ds.batch(batch_size)
  return ds

In [61]:
columns = ['pickup_longitude', 'dropoff_longitude', 'pickup_latitude', 'dropoff_latitude', '_lat_diff', '_lon_diff', '_month', '_day', '_dayofweek', '_hour']

In [63]:
# extract train and test sets
train = new_df.sample(frac=0.9,random_state=0)
val = new_df.drop(train.index)

In [64]:
train_ds = df_to_dataset(train, columns)
val_ds = df_to_dataset(val, columns)

In [65]:
train[columns].describe()

Unnamed: 0,pickup_longitude,dropoff_longitude,pickup_latitude,dropoff_latitude,_lat_diff,_lon_diff,_month,_day,_dayofweek,_hour
count,87353.0,87353.0,87353.0,87353.0,87353.0,87353.0,87353.0,87353.0,87353.0,87353.0
mean,-72.523106,-72.51966,39.92977,39.935285,-0.005515,-0.003445,6.243083,15.651117,3.041121,13.508431
std,10.643096,10.380706,6.204873,6.188968,2.341799,3.716294,3.449162,8.679536,1.948656,6.521724
min,-736.55,-84.654241,-74.00767,-74.00114,-363.934787,-662.561258,1.0,1.0,0.0,0.0
25%,-73.992177,-73.991257,40.7359,40.734955,-0.013707,-0.013531,3.0,8.0,1.0,9.0
50%,-73.981998,-73.980167,40.753122,40.753553,0.0,-0.000734,6.0,16.0,3.0,14.0
75%,-73.967913,-73.964292,40.767385,40.7683,0.013325,0.010717,9.0,23.0,5.0,19.0
max,40.787575,40.851027,401.083332,404.616667,360.304405,74.014015,12.0,31.0,6.0,23.0


# Train a DNN

In [47]:
from tensorflow import keras

In [48]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97059 entries, 0 to 99999
Data columns (total 85 columns):
key                  97059 non-null object
fare_amount          97059 non-null float64
pickup_datetime      97059 non-null object
pickup_longitude     97059 non-null float64
pickup_latitude      97059 non-null float64
dropoff_longitude    97059 non-null float64
dropoff_latitude     97059 non-null float64
passenger_count      97059 non-null int64
_pickup_datetime     97059 non-null datetime64[ns, UTC]
_month               97059 non-null int64
_day                 97059 non-null int64
_dayofweek           97059 non-null int64
_hour                97059 non-null int64
_lon_diff            97059 non-null float64
_lat_diff            97059 non-null float64
_month_2             97059 non-null uint8
_month_3             97059 non-null uint8
_month_4             97059 non-null uint8
_month_5             97059 non-null uint8
_month_6             97059 non-null uint8
_month_7             

In [54]:
X = new_df.drop(['fare_amount', 'key', 'pickup_datetime', '_pickup_datetime'], axis=1).values
y = new_df[['fare_amount']].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

#featuresLayer = keras.layers.DenseFeatures(dense_columns)
model = keras.Sequential([
    keras.layers.Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mse')

model.fit(X_train, y_train, epochs=10, validation_split=0.1)

Train on 78617 samples, validate on 8736 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a2bf4c550>

In [55]:
y_pred = model.predict(X_test)
comp = pd.DataFrame({'actuals': y_test.reshape(-1), 'predicted': y_pred.reshape(-1)})
comp.head(n=10)

Unnamed: 0,actuals,predicted
0,6.1,10.42191
1,12.0,10.281181
2,12.5,10.129665
3,7.3,10.427169
4,5.7,10.393727
5,12.5,9.742668
6,13.3,9.824142
7,7.3,9.882808
8,7.3,11.196821
9,31.5,11.032345


In [56]:
print('MSE: {}'.format(mean_squared_error(y_test, y_pred)))

MSE: 40.004712725239806


# Use Estimators

## Linear Model

In [71]:
def train_fn():
  _df = train.copy()
  labels = _df.pop('fare_amount')
  features_df = _df[columns]
  ds = tf.data.Dataset.from_tensor_slices( (dict(features_df), labels) )
  _ds = ds.shuffle(train.shape[0]).batch(64).repeat(5)
  
  return _ds

def eval_fn():
  _df = val.copy()
  labels = _df.pop('fare_amount')
  features_df = _df[columns]
  ds = tf.data.Dataset.from_tensor_slices( (dict(features_df), labels) )
  _ds = ds.batch(64).repeat(1)
  
  return _ds

In [72]:
estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpomjus_q7', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a2bfda150>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [73]:
estimator.train(input_fn=train_fn, steps=None)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpomjus_q7/model.ckpt.
INFO:tensorflow:loss = 155.76907, step = 0
INFO:tensorflow:global_step/sec: 251.042
INFO:tensorflow:lo

INFO:tensorflow:loss = 38.05324, step = 6200 (0.144 sec)
INFO:tensorflow:global_step/sec: 708.251
INFO:tensorflow:loss = 22.794987, step = 6300 (0.141 sec)
INFO:tensorflow:global_step/sec: 642.575
INFO:tensorflow:loss = 32.891815, step = 6400 (0.156 sec)
INFO:tensorflow:global_step/sec: 652.903
INFO:tensorflow:loss = 40.28315, step = 6500 (0.153 sec)
INFO:tensorflow:global_step/sec: 663.896
INFO:tensorflow:loss = 71.13335, step = 6600 (0.150 sec)
INFO:tensorflow:global_step/sec: 640.7
INFO:tensorflow:loss = 27.422268, step = 6700 (0.156 sec)
INFO:tensorflow:global_step/sec: 641.256
INFO:tensorflow:loss = 57.875298, step = 6800 (0.156 sec)
INFO:tensorflow:Saving checkpoints for 6825 into /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpomjus_q7/model.ckpt.
INFO:tensorflow:Loss for final step: 22.776964.


<tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2 at 0x1a2bfd0e90>

In [74]:
linear_evaluation = estimator.evaluate(input_fn=eval_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-04-22T15:50:25Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpomjus_q7/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-04-22-15:50:26
INFO:tensorflow:Saving dict for global step 6825: average_loss = 40.17797, global_step = 6825, label/mean = 10.078173, loss = 40.17694, prediction/mean = 9.942054
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 6825: /var/folders/xm/rvswt1sx4rdf56_wpq

In [75]:
print(linear_evaluation)

{'average_loss': 40.17797, 'label/mean': 10.078173, 'loss': 40.17694, 'prediction/mean': 9.942054, 'global_step': 6825}


In [76]:
p = estimator.predict(input_fn = eval_fn)

In [77]:
preds = np.array([item['predictions'][0] for item in p])

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpomjus_q7/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [78]:
_c = pd.DataFrame({'actual': val[['fare_amount']].values.reshape(-1), 'prediction': preds.reshape(-1)})

In [79]:
_c.head()

Unnamed: 0,actual,prediction
0,5.3,9.517053
1,7.0,9.404129
2,11.5,9.603594
3,5.5,9.598499
4,5.3,9.470545


In [80]:
from sklearn.metrics import mean_squared_error

In [81]:
print('MSE: {}'.format(mean_squared_error(val[['fare_amount']].values.reshape(-1), preds.reshape(-1))))

MSE: 40.17796967422556


## DNN

In [83]:
dense = tf.estimator.DNNRegressor(feature_columns=dense_columns, hidden_units=[64, 64])

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmp3jd07rzn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a39743a50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [84]:
dense.train(input_fn=train_fn, steps=None)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmp3jd07rzn/model.ckpt.
INFO:tensorflow:loss = 121.14745, step = 0
INFO:tensorflow:global_step/sec: 144.908
INFO:tensorflow:loss = 137.7486, step = 100 (0.691 sec)
INFO:tensorflow:global_step/sec: 282.23
INFO:tensorflow:loss = 36.71854, step = 200 (0.354 sec)
INFO:tensorflow:global_step/sec: 313.061
INFO:tensorflow:loss = 44.92758, step = 300 (0.319 sec)
INFO:tenso

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2 at 0x1a39756a90>

In [85]:
dense_evaluation = dense.evaluate(input_fn=eval_fn)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-04-22T15:51:57Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmp3jd07rzn/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-04-22-15:51:58
INFO:tensorflow:Saving dict for global step 6825: average_loss = 24.925728, global_step = 6825, label/mean = 10.078173, loss = 24.914385, prediction/mean = 10.093367
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 6825: /var/folders/xm/rvswt1sx4rdf56_

In [86]:
print(dense_evaluation)

{'average_loss': 24.925728, 'label/mean': 10.078173, 'loss': 24.914385, 'prediction/mean': 10.093367, 'global_step': 6825}


In [87]:
p = dense.predict(input_fn = eval_fn)
preds = np.array([item['predictions'][0] for item in p])
_c = pd.DataFrame({'actual': val[['fare_amount']].values.reshape(-1), 'prediction': preds.reshape(-1)})

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmp3jd07rzn/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [88]:
_c.head()

Unnamed: 0,actual,prediction
0,5.3,8.07678
1,7.0,13.595176
2,11.5,8.504304
3,5.5,7.975273
4,5.3,8.014244


In [89]:
print('MSE: {}'.format(mean_squared_error(val[['fare_amount']].values.reshape(-1), preds.reshape(-1))))

MSE: 24.925729059525974


## Combine the models

In [90]:
combined = tf.estimator.DNNLinearCombinedRegressor(
    #Linear Model also called Wide
    linear_feature_columns=feature_columns,
    
    #DNN
    dnn_feature_columns=dense_columns,
    dnn_hidden_units=[512,512]
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpd_uh8jsd', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1a395d2d90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [91]:
combined.train(input_fn=train_fn, steps=None)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpd_uh8jsd/model.ckpt.
INFO:tensorflow:loss = 160.21262, step = 0

INFO:tensorflow:global_step/sec: 81.285
INFO:tensorflow:loss = 16.588932, step = 6600 (1.230 sec)
INFO:tensorflow:global_step/sec: 77.2839
INFO:tensorflow:loss = 8.375839, step = 6700 (1.294 sec)
INFO:tensorflow:global_step/sec: 82.1491
INFO:tensorflow:loss = 10.846728, step = 6800 (1.217 sec)
INFO:tensorflow:Saving checkpoints for 6825 into /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpd_uh8jsd/model.ckpt.
INFO:tensorflow:Loss for final step: 12.000773.


<tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressorV2 at 0x1a395d2910>

In [92]:
print(combined.evaluate(input_fn=train_fn))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-04-22T15:55:03Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpd_uh8jsd/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished e

In [93]:
p = combined.predict(input_fn = eval_fn)
preds = np.array([item['predictions'][0] for item in p])
_c = pd.DataFrame({'actual': val[['fare_amount']].values.reshape(-1), 'prediction': preds.reshape(-1)})

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/xm/rvswt1sx4rdf56_wpqz5rt4c0000gn/T/tmpd_uh8jsd/model.ckpt-6825
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [94]:
_c.head()

Unnamed: 0,actual,prediction
0,5.3,5.807065
1,7.0,10.740663
2,11.5,10.835695
3,5.5,5.799201
4,5.3,6.721664


In [95]:
print('MSE: {}'.format(mean_squared_error(val[['fare_amount']].values.reshape(-1), preds.reshape(-1))))

MSE: 15.310430079120882


# number of parameters we were dealing with

In [96]:
d = keras.Sequential([
    keras.layers.Dense(512, input_shape=(116000, ), activation='relu'),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dense(1)
])
print(d.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 512)               59392512  
_________________________________________________________________
dense_9 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 513       
Total params: 59,655,681
Trainable params: 59,655,681
Non-trainable params: 0
_________________________________________________________________
None
