In [None]:
import numpy as np
import pandas as pd
import os

from tensorflow.keras import losses, models, optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, Dropout, Activation, Flatten) 
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.datasets import load_boston 
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from geopy import distance
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Kaggle notes: up to 5GB written to the current directory (/kaggle/working/) is
# preserved as output when a version is created with "Save & Run All";
# files written to /kaggle/temp/ are not saved outside of the current session.

# Read a 5000-row sample of the training file with pickup_datetime parsed as dates.
data = pd.read_csv(
    "../input/new-york-city-taxi-fare-prediction/train.csv",
    sep=',',
    nrows=5000,
    parse_dates=["pickup_datetime"],
)
# Number of missing values in each column of the sample.
is_null = data.isnull().sum()
print(is_null)

In [None]:
# Maximum number of rows read from train.csv (passed as `nrows` below).
TRAIN_DATA = 100000

In [None]:
# Load the first TRAIN_DATA rows of the training file and the complete test file.
train = pd.read_csv("../input/new-york-city-taxi-fare-prediction/train.csv", nrows = TRAIN_DATA)
test = pd.read_csv("../input/new-york-city-taxi-fare-prediction/test.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Missing values per column in the training data.
train.isnull().sum()

In [None]:
# Missing values per column in the test data.
test.isnull().sum()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Show the training rows that contain at least one missing value.
train[train.isnull().any(axis=1)]

In [None]:
# Remove rows with missing values; note that this mutates `train` in place.
train.dropna(inplace=True)

In [None]:
# Column names of the training data.
train.columns

In [None]:
# Keep only trips whose pickup AND dropoff lie inside the bounding box
# longitude [-76, -72] x latitude [38, 42] (both bounds inclusive).
lon_in_box = (
    train['pickup_longitude'].between(-76, -72)
    & train['dropoff_longitude'].between(-76, -72)
)
lat_in_box = (
    train['pickup_latitude'].between(38, 42)
    & train['dropoff_latitude'].between(38, 42)
)
train = train[lon_in_box & lat_in_box]

In [None]:
# Row/column count after the coordinate filter.
train.shape

In [None]:
# Range of the target variable.
print("Max fare value:", train['fare_amount'].max())
print("Min fare value:", train['fare_amount'].min())

In [None]:
# Number of rows with a negative fare.
len(train[train['fare_amount']<0])

In [None]:
# Keep only rows with a non-negative fare (zero fares are retained).
train = train[train['fare_amount']>=0]

In [None]:
# Row/column count after removing negative fares.
train.shape

In [None]:
import seaborn as sns

# Horizontal box plot of the fare distribution, used to spot outliers.
# The Series is passed explicitly as `x=`: a bare positional argument is
# interpreted differently across seaborn versions (x in old releases, `data`
# in newer ones), which changes the orientation of the plot.
plt.figure(figsize=(12,4))
sns.boxplot(x=train['fare_amount'])

In [None]:
# Number of rows whose fare exceeds 200.
len(train[train['fare_amount']>200])

In [None]:
# Discard those rows; fares of exactly 200 are kept.
train = train[train['fare_amount']<=200]

In [None]:
# Row/column count after removing the high-fare rows.
train.shape

In [None]:
import datetime

In [None]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime']) - datetime.timedelta(hours=4)

In [None]:
train['Year'] = train['pickup_datetime'].dt.year
train['Month'] = train['pickup_datetime'].dt.month
train['Day'] = train['pickup_datetime'].dt.day
train['Hour'] = train['pickup_datetime'].dt.hour
train['Minutes'] = train['pickup_datetime'].dt.minute
train['Day of Week'] = train['pickup_datetime'].dt.dayofweek

In [None]:
train.head(2)

In [None]:
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime']) - datetime.timedelta(hours=4)

In [None]:
test['Year'] = test['pickup_datetime'].dt.year
test['Month'] = test['pickup_datetime'].dt.month
test['Day'] = test['pickup_datetime'].dt.day
test['Hour'] = test['pickup_datetime'].dt.hour
test['Minutes'] = test['pickup_datetime'].dt.minute
test['Day of Week'] = test['pickup_datetime'].dt.dayofweek

In [None]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [None]:
def haversine(df, radius=3956):
    """
    Calculate the great circle distance between pickup and dropoff points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    df : mapping of column name -> values (e.g. a pandas DataFrame)
        Must provide "pickup_latitude", "pickup_longitude",
        "dropoff_latitude" and "dropoff_longitude".
    radius : float, optional
        Sphere radius; the result is expressed in the same unit.
        Defaults to 3956 (earth radius in miles). Use 6371 for kilometers.

    Returns
    -------
    Distance for each row, element-wise, in the unit of ``radius``.
    """
    lat1 = np.radians(df["pickup_latitude"])
    lat2 = np.radians(df["dropoff_latitude"])
    dlat = np.radians(df["dropoff_latitude"] - df["pickup_latitude"])
    dlong = np.radians(df["dropoff_longitude"] - df["pickup_longitude"])

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlong / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1], which would make
    # sqrt(1 - a) NaN for (near-)antipodal points; clamp before the square roots.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return c * radius

In [None]:
train['Total distance']=haversine(train)

test['Total distance']=haversine(test)

In [None]:
test.head(2)

In [None]:
# Fare vs distance (taking the first 1000 samples).
# head(1000) includes row 0; the former slice [1:1000] skipped it and plotted only 999 rows.
sns.lmplot(x='Total distance', y='fare_amount', data=train.head(1000))

In [None]:
def is_peak_hour(df):
    """Tell whether a single trip record falls into a weekday rush-hour window.

    ``df`` is one record (anything indexable by 'Day of Week' and 'Hour').
    Returns True only when 'Day of Week' is between 0 and 4 inclusive and
    'Hour' is in 8..10 or 16..18 inclusive; otherwise returns False.
    """
    # Days outside 0..4 are never peak.
    if not (0 <= df['Day of Week'] <= 4):
        return False
    hour = df['Hour']
    morning_rush = 8 <= hour <= 10
    evening_rush = 16 <= hour <= 18
    # bool() keeps the return type a plain Python bool even for numpy scalars.
    return bool(morning_rush or evening_rush)

In [None]:
train['Peak hours'] = train.apply(is_peak_hour, axis=1)
test['Peak hours'] = test.apply(is_peak_hour, axis=1)

In [None]:
early_late_hours = [0,1,2,3,4,5,22,23]
train['Early late hours'] = train['Hour'].apply(lambda x: x in early_late_hours)
test['Early late hours'] = test['Hour'].apply(lambda x: x in early_late_hours)

In [None]:
train.head()

In [None]:
train['Peak hours'] = train['Peak hours'].replace({True: 1, False: 0})
train['Early late hours'] = train['Early late hours'].replace({True: 1, False: 0})

In [None]:
test['Peak hours'] = test['Peak hours'].replace({True: 1, False: 0})
test['Early late hours'] = test['Early late hours'].replace({True: 1, False: 0})

In [None]:
test.head(2)

In [None]:
train.head()

In [None]:
drop_columns = ['key', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude', 
                'dropoff_longitude', 'dropoff_latitude', 'Minutes']

In [None]:
train.head(2)

In [None]:
train.drop(drop_columns, axis = 1, inplace=True)
test.drop(drop_columns, axis = 1, inplace=True)

In [None]:
def model_results(X_train, y_train, X_test, y_test, model):
    """Print RMSE and R-squared of a fitted model on training and test data.

    Parameters
    ----------
    X_train, y_train : features and targets the model was trained on.
    X_test, y_test : held-out features and targets.
    model : fitted estimator exposing ``predict``.

    Returns
    -------
    None. The four metrics are written to stdout.
    """
    # Predict both sets up front so nothing is printed if either prediction fails.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # The headers used to say "2015 data set" / "2016 data set", which does not
    # describe the splits this notebook passes in; they are now neutral.
    print("----Training Data results----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_train, y_train_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_train, y_train_pred)))

    print("----Test Data results----")
    print("RMSE: ${:.1f}".format(mean_squared_error(y_test, y_test_pred)**0.5))
    print("R2: {:.2f}\n".format(r2_score(y_test, y_test_pred)))

In [None]:
# Preview the training frame after the column drop.
train.head()

In [None]:
# Preview the test frame after the column drop.
test.head(2)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target vector (the fare) and feature matrix (every other column) as numpy arrays.
y = train['fare_amount'].to_numpy()
X = train.drop(columns=['fare_amount']).to_numpy()

In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

# Hold out 25% of the rows for testing. The seed is fixed so the printed
# metrics are reproducible across Restart & Run All (the split was unseeded before).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

# Fit each regularised linear model with its default hyper-parameters and
# report RMSE / R2 on the training and held-out data.
print('- LASSO -')
las = Lasso().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, las)

print('- RIDGE -')
ridge = Ridge().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, ridge)

print('- ELASTIC NET -')
elast = ElasticNet().fit(X_train,y_train)
model_results(X_train, y_train, X_test, y_test, elast)

In [None]:
# 80/20 train/validation split with a fixed seed.
# NOTE: this rebinds X_train / y_train from the earlier 75/25 split, while
# X_test / y_test still hold that earlier split's hold-out rows.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=4)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
from sklearn import preprocessing

In [None]:
# Scale data
# Note: Scaling is needed for DL models
scaler = preprocessing.MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test)

In [None]:
test

In [None]:
# Fully connected regression network:
# 128-unit input layer, three 256-unit hidden layers (all ReLU), one linear output.
NN_model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(1, activation='linear'),
])

In [None]:
# Compile the model: mean squared error loss, Adam optimizer, MAE reported as a metric.
NN_model.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Print the layer-by-layer architecture.
NN_model.summary()

In [None]:
# Fit the model for 50 epochs with batches of 512, shuffling the training data
# and evaluating on the scaled validation split; the returned History object
# is kept for the loss plots.
NN_history = NN_model.fit(x=X_train_scaled, y=y_train, batch_size=512, epochs=50, 
                    validation_data=(X_val_scaled, y_val), shuffle=True)

In [None]:
from keras.layers import Dropout, BatchNormalization

In [None]:
# Deeper variant of the first network, with 'normal' kernel initialisation:
# 128-unit input layer, five 256-unit hidden layers (all ReLU), one linear output.
NN_model1 = Sequential()

# Input layer
NN_model1.add(Dense(128, kernel_initializer='normal', activation='relu',
                    input_dim=X_train_scaled.shape[1]))

# Hidden layers
for _ in range(5):
    NN_model1.add(Dense(256, kernel_initializer='normal', activation='relu'))

# Output layer
NN_model1.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compile with MSE loss, the Adam optimizer and MAE as an extra metric.
NN_model1.compile(loss='mse', optimizer='adam', metrics=['mae'])

In [None]:
# Print the layer-by-layer architecture of the deeper network.
NN_model1.summary()

In [None]:
# Fit the deeper model with the same settings as the first one (50 epochs,
# batch size 512, shuffled, validated on the scaled validation split).
NN_history1 = NN_model1.fit(x=X_train_scaled, y=y_train, batch_size=512, epochs=50, 
                    validation_data=(X_val_scaled, y_val), shuffle=True)

In [None]:
# Plot the per-epoch training and validation loss recorded in NN_history.
plt.plot(NN_history.history['loss'], label='train loss')
plt.plot(NN_history.history['val_loss'], label='valdn loss')
plt.legend()
plt.show()

In [None]:
# Lowest training / validation loss reached in any epoch (first network).
print('Model:NN_history')
print('Min value of training Loss:', min(NN_history.history['loss']))
print('Min value of validation Loss:', min(NN_history.history['val_loss']))

In [None]:
# Plot the per-epoch training and validation loss recorded in NN_history1.
plt.plot(NN_history1.history['loss'], label='train loss')
plt.plot(NN_history1.history['val_loss'], label='valdn loss')
plt.legend()
plt.show()

In [None]:
# Lowest training / validation loss reached in any epoch (deeper network).
print('Model:NN_history1')
print('Min value of training Loss:', min(NN_history1.history['loss']))
print('Min value of validation Loss:', min(NN_history1.history['val_loss']))

In [None]:
# Predict fares for the scaled test set with NN_model1.
# NOTE(review): NN_model1 is used unconditionally; confirm it is the model with
# the better validation loss before relying on these predictions.
NN_prediction = NN_model1.predict(test_scaled, verbose=1)

In [None]:
# Display the raw prediction array.
NN_prediction

In [None]:
# Helper for analysing prediction quality: a scatter plot of true vs. predicted values plus a histogram of the errors.
from sklearn.metrics import mean_squared_error, explained_variance_score

def plot_prediction_analysis(y, y_pred, figsize=(10,4), title=''):
    """Draw a two-panel diagnostic figure for a single-target regression.

    Left panel: scatter of true values against predictions with the identity
    line in red; its title shows RMSE and explained variance score.
    Right panel: 50-bin histogram of the errors ``y - y_pred``; its title shows
    their mean and standard deviation.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, one per element of ``y``.
    figsize : tuple, optional
        Figure size forwarded to ``plt.subplots``.
    title : str, optional
        Figure-level title; omitted when empty.

    Returns
    -------
    None. The figure is created on the current matplotlib backend.
    """
    # Flatten both inputs so a column vector of predictions, shape (n, 1),
    # cannot broadcast against y of shape (n,) into an (n, n) error matrix.
    y = np.ravel(y)
    y_pred = np.ravel(y_pred)
    errors = y - y_pred

    fig, axs = plt.subplots(1, 2, figsize=figsize)

    axs[0].scatter(y, y_pred)
    mn = min(np.min(y), np.min(y_pred))
    mx = max(np.max(y), np.max(y_pred))
    axs[0].plot([mn, mx], [mn, mx], c='red')
    # Raw strings: "\h", "\m" and "\s" are invalid escape sequences in normal literals.
    axs[0].set_xlabel(r'$y$')
    axs[0].set_ylabel(r'$\hat{y}$')
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    evs = explained_variance_score(y, y_pred)
    axs[0].set_title('rmse = {:.2f}, evs = {:.2f}'.format(rmse, evs))

    axs[1].hist(errors, bins=50)
    avg = np.mean(errors)
    std = np.std(errors)
    axs[1].set_xlabel(r'$y - \hat{y}$')
    axs[1].set_title(r'Histogram prediction error, $\mu$ = {:.2f}, $\sigma$ = {:.2f}'.format(avg, std))

    if title!='':
        fig.suptitle(title)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit an ordinary least squares model.
# Pipeline steps are given as a list, the documented form of the argument.
model_lin = Pipeline([
        ("standard_scaler", StandardScaler()),
        ("lin_reg", LinearRegression()),
    ])
model_lin.fit(X_train, y_train)

y_train_pred = model_lin.predict(X_train)
plot_prediction_analysis(y_train, y_train_pred, title='Linear Model - Trainingset')

# Evaluate on X_val / y_val, the hold-out that belongs to the current X_train.
# X_test / y_test come from an earlier, different split of the same rows and
# overlap with X_train, so scoring on them would leak training data.
y_val_pred = model_lin.predict(X_val)
plot_prediction_analysis(y_val, y_val_pred, title='Linear Model - Validation set')

In [None]:
# Display the processed training frame.
train

In [None]:
# Display the 5000-row sample loaded in the first cell.
data

In [None]:
from sklearn.datasets import load_boston

# `boston` is not needed for this regression; it is loaded here because the
# multiple-regression cell further down reads `boston.data` / `boston.target`.
boston = load_boston()

# Simple linear regression of fare_amount on passenger_count (5000-row sample).
lr = LinearRegression()
# Plain numpy arrays: KFold yields positional indices, which must select rows
# by position rather than by pandas index label.
x = data.passenger_count.to_numpy().reshape(-1, 1)
y = data.fare_amount.to_numpy()

lr.fit(x,y)
y_pred = lr.predict(x)
mse_lin_rm = mean_squared_error(y, y_pred)
rmse_lin_rm = np.sqrt(mse_lin_rm)
r2_lin_rm = r2_score(y, y_pred)

# Cross-validation (5 consecutive, unshuffled folds)
kf = KFold(n_splits=5, random_state=None, shuffle=False)
mse_lin_rm_kf = []
r2_lin_rm_kf = []
for train_index, test_index in kf.split(x):
    lr.fit(x[train_index],y[train_index])
    fold_pred = lr.predict(x[test_index])
    mse_lin_rm_kf.append(mean_squared_error(y[test_index], fold_pred))
    r2_lin_rm_kf.append(r2_score(y[test_index], fold_pred))
# The reported figure is the mean squared error across folds, so label it as MSE.
print("MSE: %0.2f (+/- %0.2f)" % (np.mean(mse_lin_rm_kf), np.std(mse_lin_rm_kf) * 2))
print("Mean R^2: %0.2f" % (np.mean(r2_lin_rm_kf)))

fig=plt.figure()
ax5=fig.add_subplot(1,1,1)
ax5.scatter(data.passenger_count, data.fare_amount,color='r')
# y_pred holds the predictions of the full-sample fit made before cross-validation.
ax5.plot(x,y_pred)
# Chart title: name the feature actually plotted rather than a Boston feature name.
ax5.set_title('Linear Regression passenger_count')

In [None]:
# Display the current training feature matrix.
X_train

In [None]:
# Display the hold-out feature matrix left over from the earlier 75/25 split.
X_test

In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet

# X_train / y_train now come from the 80/20 split, so the models are scored on
# that split's own hold-out (X_val / y_val). X_test / y_test belong to an
# earlier split and overlap with the current training rows.
print('- LASSO -')
las = Lasso().fit(X_train,y_train)
model_results(X_train, y_train, X_val, y_val, las)

print('- RIDGE -')
ridge = Ridge().fit(X_train,y_train)
model_results(X_train, y_train, X_val, y_val, ridge)

print('- ELASTIC NET -')
elast = ElasticNet().fit(X_train,y_train)
# This line used to be a bare tuple, so the ElasticNet results were never printed.
model_results(X_train, y_train, X_val, y_val, elast)

In [None]:
# Multiple linear regression on the Boston housing data (`boston` is loaded in an earlier cell).

x_mul = boston.data
y = boston.target

# Splitting: 60% train / 40% test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(x_mul, y,
                                                    test_size=0.4, random_state=0)

# Fit on the full data set and plot predictions against the true targets.
lr_mul = LinearRegression()
lr_mul.fit(x_mul,y)
p = lr_mul.predict(x_mul)
mse_lin_mul = mean_squared_error(y, p)
fig=plt.figure()
mulreg=fig.add_subplot(1,1,1)
mulreg.scatter(p, y, color='r')
mulreg.set_title('multiLinear Regression ')

# Validation: refit on the training split and score on the held-out split.
lr_mul = LinearRegression()
lr_mul.fit(X_train, y_train)
y_pred = lr_mul.predict(X_test)
mse_lin_mul = mean_squared_error(y_test, y_pred)
# r2_score expects (y_true, y_pred); it is not symmetric, so the order matters.
r2_mul = r2_score(y_test, y_pred)
print('r2_linear', r2_mul)

# 5-fold cross-validation over consecutive, unshuffled folds.
kf = KFold(n_splits=5, random_state=None, shuffle=False)
mse_lin_mul_kf = []
r2_lin_mul_kf = []
for train_index, test_index in kf.split(x_mul):
    lr_mul.fit(x_mul[train_index],y[train_index])
    fold_pred = lr_mul.predict(x_mul[test_index])
    mse_lin_mul_kf.append(mean_squared_error(y[test_index], fold_pred))
    r2_lin_mul_kf.append(r2_score(y[test_index], fold_pred))
# The reported figure is the mean squared error across folds, so label it as MSE.
print("MSE: %0.2f (+/- %0.2f)" % (np.mean(mse_lin_mul_kf), np.std(mse_lin_mul_kf) * 2))
print("Mean R^2: %0.2f" % (np.mean(r2_lin_mul_kf)))