In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# # Initial Python environment setup...
# import numpy as np # linear algebra
# import pandas as pd # CSV file I/O (e.g. pd.read_csv)
# import os # reading the input files we have access to

# print(os.listdir('../input'))

In [None]:
# Read train data
taxi_train = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows = 10_000_000)

In [None]:
taxi_train.columns.to_list()

In [None]:
taxi_train.dtypes

In [None]:
import matplotlib.pyplot as plt
# Plot a histogram
taxi_train.fare_amount.hist(bins=30, alpha=0.5)
plt.show() 

In [None]:
def add_travel_vector_features(df):
    """Attach the absolute pickup-to-dropoff coordinate deltas to ``df``.

    Two columns are written in place, 'abs_diff_longitude' and
    'abs_diff_latitude', representing the "Manhattan vector" from the
    pickup location to the dropoff location. Nothing is returned.
    """
    for axis in ('longitude', 'latitude'):
        delta = df['dropoff_' + axis] - df['pickup_' + axis]
        df['abs_diff_' + axis] = delta.abs()

add_travel_vector_features(taxi_train)

In [None]:
print(taxi_train.isnull().sum())

In [None]:
print('Old size: %d' % len(taxi_train))
taxi_train = taxi_train.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(taxi_train))

In [None]:
plot = taxi_train.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')

In [None]:
print('Old size: %d' % len(taxi_train))
taxi_train = taxi_train[(taxi_train.abs_diff_longitude < 5.0) & (taxi_train.abs_diff_latitude < 5.0)]
print('New size: %d' % len(taxi_train))

### Train our model
Our model will take the form $X \cdot w = y$, where $X$ is a matrix of input features and $y$ is a column of the target variable, fare_amount, for each row. The weight column $w$ is what we will "learn".

First let's set up our input matrix $X$ and target column $y$ from our training set. The matrix $X$ should consist of the two GPS coordinate differences, plus a third term of 1 to allow the model to learn a constant bias term. The column $y$ should consist of the target fare_amount values.

In [None]:
def get_input_matrix(df):
    """Build the N x 3 design matrix for the linear model.

    The columns are, in order, 'abs_diff_longitude', 'abs_diff_latitude'
    and a constant 1.0 that lets the model learn a bias term.
    """
    n_rows = len(df)
    bias = np.ones(n_rows)
    columns = [df['abs_diff_longitude'], df['abs_diff_latitude'], bias]
    return np.column_stack(columns)

taxi_train_X = get_input_matrix(taxi_train)
taxi_train_y = np.array(taxi_train['fare_amount'])

print(taxi_train_X.shape)
print(taxi_train_y.shape)

Now let's use numpy's lstsq library function to find the optimal weight column  w .

In [None]:
# The lstsq function returns several things, and we only care about the actual weight vector w.
(w, _, _, _) = np.linalg.lstsq(taxi_train_X, taxi_train_y, rcond = None)
print(w)

These weights pass a quick sanity check, since we'd expect the first two values -- the weights for the absolute longitude and latitude differences -- to be positive, as more distance should imply a higher fare, and we'd expect the bias term to loosely represent the cost of a very short ride.

Sidenote: we can actually calculate the weight column $w$ directly using the Ordinary Least Squares method: $w = (X^T \cdot X)^{-1} \cdot X^T \cdot y$

In [None]:
# Solve the normal equations (X^T X) w = X^T y. This yields the same OLS
# estimate as w = (X^T X)^-1 X^T y, but np.linalg.solve avoids forming the
# explicit inverse, and reducing X^T y first avoids building a 3 x N
# intermediate matrix.
w_OLS = np.linalg.solve(
    taxi_train_X.T @ taxi_train_X,
    taxi_train_X.T @ taxi_train_y,
)
print(w_OLS)

### Make predictions on the test set
Now let's load up our test inputs and predict the fare_amounts for them using our learned weights!

In [None]:
# Read test data
taxi_test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')
taxi_test.columns.to_list()

In [None]:
# Reuse the above helper functions to add our features and generate the input matrix.
add_travel_vector_features(taxi_test)
taxi_test_X = get_input_matrix(taxi_test)
# Predict fare_amount on the test set using our model (w) trained on the training set.
taxi_test_y_predictions = np.matmul(taxi_test_X, w).round(decimals = 2)

# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': taxi_test.key, 'fare_amount': taxi_test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv('submission.csv', index = False)

print(os.listdir('.'))

### Ideas for Improvement
The output here will score an RMSE of $5.74, but you can do better than that! Here are some suggestions:

* Use more columns from the input data. Here we're only using the start/end GPS points from columns [pickup|dropoff]_[latitude|longitude]. Try to see if the other columns -- pickup_datetime and passenger_count -- can help improve your results.
* Use absolute location data rather than relative. Here we're only looking at the difference between the start and end points, but maybe the actual values -- indicating where in NYC the taxi is traveling -- would be useful.
* Use a non-linear model to capture more intricacies within the data.
* Try to find more outliers to prune, or construct useful feature crosses.
* Use the entire dataset -- here we're only using about 20% of the training data!

Special thanks to Dan Becker, Will Cukierski, and Julia Elliot for reviewing this Kernel and providing suggestions!

In [None]:
from sklearn.linear_model import LinearRegression
# Create a LinearRegression object
lr = LinearRegression()

In [None]:
# Fit the model on the train data
lr.fit(X=taxi_train[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']],
y=taxi_train['fare_amount']) 

In [None]:
# Select features
features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']

In [None]:
# Make predictions on the test data
taxi_test['fare_amount'] = lr.predict(taxi_test[features]) 

In [None]:
# Read sample submission
taxi_sample_sub = pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')
taxi_sample_sub.head()

In [None]:
# Read a sample submission file.
# The file ships with the competition dataset; no copy named
# 'taxi_sample_submission.csv' is written to the working directory by this notebook.
taxi_sample_sub = pd.read_csv('../input/new-york-city-taxi-fare-prediction/sample_submission.csv')
taxi_sample_sub.head(1)

In [None]:
# Prepare a submission file
taxi_submission = taxi_test[['key', 'fare_amount']]
# Save the submission file as .csv
taxi_submission.to_csv('first_sub.csv', index=False) 

In [None]:
# # Write a submission file to the disk
# submission[['id', 'target']].to_csv('submission_1.csv', index=False)

In [None]:
# Some classification and regression metrics
from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error

In [None]:
import numpy as np
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error.

    Computes sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)).

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions (lists, numpy arrays or pandas Series) with
        matching or broadcastable shapes. Values are compared positionally;
        any pandas index is ignored.

    Returns
    -------
    float
        The error; NaN if any value is below -1, where log(1 + x) is undefined.
    """
    # Coerce first so plain Python lists work (list + 1 raises TypeError).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # log1p is the numerically accurate form of log(1 + x) for small x.
    diffs = np.log1p(y_true) - np.log1p(y_pred)
    return np.sqrt(np.mean(np.square(diffs)))

# Another Prediction

In [None]:
import numpy as np 
import pandas as pd
import datetime as dt
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os

print(os.listdir("../input"))


#### Load dataset

First we will load the train.csv dataset. Since this dataset has 55M rows, we will only use the first 6M to build our model to prevent memory issues and speed up preprocessing and model building.

In [None]:
train_df =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv', nrows = 6_000_000) #1M to test models
train_df.dtypes

### Data exploration

Now we will explore the loaded data to identify outliers and other problems that might need fixing such as null values.

In [None]:
#Identify null values
print(train_df.isnull().sum())

We have a few rows with null values so it is safe to remove them.

In [None]:
#Drop rows with null values
train_df = train_df.dropna(how = 'any', axis = 'rows')

Now let's explore the variables in the dataset. First we will look at the first rows to get an idea of the format of the values and then we will plot them to get a sense of their distribution and identify outliers.

In [None]:
#Look at the first rows
train_df.head()

In [None]:
#Plot variables using only 1000 rows for efficiency
train_df.iloc[:1000].plot.scatter('pickup_longitude', 'pickup_latitude')
train_df.iloc[:1000].plot.scatter('dropoff_longitude', 'dropoff_latitude')

#Get distribution of values
train_df.describe()

Okay, that was interesting. We learned a few things about the dataset:

* Fare_amount has negative values. We will remove those.
* Latitudes and longitudes have values near 0 that cannot be correct, since NYC is at approximately (40, -74). We will remove points that are not near these coordinates.
* Passenger_count has values of 0 and as high as 200, which are also unrealistic. We will remove those.

In [None]:
#Clean dataset
def clean_df(df):
    """Return the rows with plausible fares, coordinates and passenger counts.

    A row is kept only when all of the following hold (strict inequalities):

    * fare_amount > 0
    * pickup and dropoff longitude in (-80, -70)
    * pickup and dropoff latitude in (35, 45)
    * passenger_count in (0, 10)

    The input frame is not modified. The result is an independent copy, so
    adding columns to it later does not raise SettingWithCopyWarning.
    """
    valid_fare = df.fare_amount > 0
    valid_pickup = (
        (df.pickup_longitude > -80) & (df.pickup_longitude < -70) &
        (df.pickup_latitude > 35) & (df.pickup_latitude < 45)
    )
    valid_dropoff = (
        (df.dropoff_longitude > -80) & (df.dropoff_longitude < -70) &
        (df.dropoff_latitude > 35) & (df.dropoff_latitude < 45)
    )
    valid_passengers = (df.passenger_count > 0) & (df.passenger_count < 10)

    keep = valid_fare & valid_pickup & valid_dropoff & valid_passengers
    return df[keep].copy()

train_df = clean_df(train_df)
print(len(train_df))

#### Feature engineering

Now that we have cleaned some extreme values, we will add some interesting features in the dataset.

* distance: great-circle distance from pickup to dropoff. The longer the trip, the more expensive.
* Extract information from datetime (day of week, month, hour, day). Taxi fares change day/night or on weekdays/holidays.
* Add columns indicating distance from pickup or dropoff coordinates to airports. Trips from/to an airport have a fixed fee.

In [None]:
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the great-circle (haversine) distance, in kilometres, between the
    pickup and dropoff coordinates.

    Coordinates are given in decimal degrees. Each argument may be a scalar,
    a numpy array or a pandas Series; the computation is element-wise.
    """
    #Define earth radius (km)
    R_earth = 6371
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    #Compute distances along lat, lon dimensions
    dlat = dropoff_lat - pickup_lat
    dlon = dropoff_lon - pickup_lon

    #Compute haversine distance
    a = np.sin(dlat/2.0)**2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * np.sin(dlon/2.0)**2
    #Rounding can push `a` marginally above 1 for (near-)antipodal points,
    #which would make arcsin return NaN; cap it at 1.
    a = np.minimum(a, 1.0)

    return 2 * R_earth * np.arcsin(np.sqrt(a))

def add_airport_dist(dataset):
    """
    Add one column per airport holding the smaller of the pickup-to-airport
    and airport-to-dropoff distances, as computed by sphere_dist.

    Columns written, in order: 'jfk_dist', 'ewr_dist', 'lga_dist'.
    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport

    The dataset is modified in place and also returned.
    """
    #(column prefix, latitude, longitude) for each airport
    airports = (
        ('jfk', 40.639722, -73.778889),
        ('ewr', 40.6925, -74.168611),
        ('lga', 40.77725, -73.872611),
    )

    start_lat, start_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    end_lat, end_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    for code, airport_lat, airport_lon in airports:
        from_pickup = sphere_dist(start_lat, start_lon, airport_lat, airport_lon)
        to_dropoff = sphere_dist(airport_lat, airport_lon, end_lat, end_lon)
        #Row-wise minimum of the two distances
        nearest = pd.concat([from_pickup, to_dropoff], axis=1).min(axis=1)
        dataset[code + '_dist'] = nearest

    return dataset
    
def add_datetime_info(dataset):
    """
    Parse 'pickup_datetime' and append calendar features.

    The column is converted from strings of the form
    'YYYY-MM-DD HH:MM:SS UTC' to datetimes, then 'hour', 'day', 'month',
    'weekday' and 'year' columns are added. The dataset is modified in place
    and also returned.
    """
    parsed = pd.to_datetime(dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = parsed

    #Copy each calendar component off the .dt accessor into its own column
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(parsed.dt, part)

    return dataset

train_df = add_datetime_info(train_df)
train_df = add_airport_dist(train_df)
train_df['distance'] = sphere_dist(train_df['pickup_latitude'], train_df['pickup_longitude'], 
                                   train_df['dropoff_latitude'] , train_df['dropoff_longitude'])

train_df.head()

Now we need to drop the columns that we will not use to train our model.

* key
* pickup_datetime

In [None]:
train_df.drop(columns=['key', 'pickup_datetime'], inplace=True)
train_df.head()

### Model training

Now that we have the dataframe that we wanted we can start to train the XGBoost model. First we will split the dataset into train (99%) and test (1%). With this amount of data 1% should be enough to test performance.

In [None]:
y = train_df['fare_amount']
train = train_df.drop(columns=['fare_amount'])

x_train,x_test,y_train,y_test = train_test_split(train,y,random_state=0,test_size=0.01)

In [None]:
#Cross-validation
# `params` is the XGBoost configuration shared by the optional CV sweep below
# and by the final training run in a later cell.
params = {
    # Parameters that we are going to tune.
    'max_depth': 8, #Result of tuning with CV
    'eta':.03, #Result of tuning with CV
    'subsample': 1, #Result of tuning with CV
    'colsample_bytree': 0.8, #Result of tuning with CV
    # Other parameters
    # NOTE(review): 'reg:linear' and 'silent' are reported as deprecated by
    # newer XGBoost releases (in favour of 'reg:squarederror' and 'verbosity')
    # -- confirm against the installed version.
    'objective':'reg:linear',
    'eval_metric':'rmse',
    'silent': 1
}

#Block of code used for hypertuning parameters. Adapt to each round of parameter tuning.
#Turn off CV in submission
CV=False
if CV:
    # The sweep runs on the full (unsplit) feature frame `train` and target `y`.
    dtrain = xgb.DMatrix(train,label=y)
    # Candidate learning rates: from 0.04 in steps of 0.02, stopping below 0.12.
    gridsearch_params = [
        (eta)
        for eta in np.arange(.04, 0.12, .02)
    ]

    # Define initial best params and RMSE
    min_rmse = float("Inf")
    best_params = None
    for (eta) in gridsearch_params:
        print("CV with eta={} ".format(
                                 eta))

        # Update our parameters
        # NOTE(review): this mutates the shared `params` dict, and nothing
        # resets it after the loop, so `params['eta']` ends up holding the
        # last candidate tried rather than `best_params`.
        params['eta'] = eta

        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=1000,
            nfold=3,
            metrics={'rmse'},
            early_stopping_rounds=10
        )

        # Update best RMSE
        # Lowest mean test RMSE across rounds, and the position of that round.
        mean_rmse = cv_results['test-rmse-mean'].min()
        boost_rounds = cv_results['test-rmse-mean'].argmin()
        print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
        if mean_rmse < min_rmse:
            min_rmse = mean_rmse
            best_params = (eta)

    print("Best params: {}, RMSE: {}".format(best_params, min_rmse))
else:
    #Print final params to use for the model
    params['silent'] = 0 #Turn on output
    print(params)

In [None]:
def XGBmodel(x_train,x_test,y_train,y_test,params):
    """Train and return an XGBoost booster, evaluated on a held-out set.

    The train and test splits are wrapped in DMatrix objects. Training is
    requested for up to 5000 boosting rounds with early_stopping_rounds=10,
    monitored on the held-out data under the name 'test'.
    """
    fit_data = xgb.DMatrix(x_train, label=y_train)
    holdout_data = xgb.DMatrix(x_test, label=y_test)
    watchlist = [(holdout_data, 'test')]
    return xgb.train(
        params=params,
        dtrain=fit_data,
        num_boost_round=5000,
        early_stopping_rounds=10,
        evals=watchlist,
    )

model = XGBmodel(x_train,x_test,y_train,y_test,params)

#### Prediction

Finally we can use our trained model to predict the submission. First we will need to load and preprocess the test dataset just like we did for the training dataset.

In [None]:
#Read and preprocess test set
# The competition files live in the dataset sub-directory of ../input (the
# same place train.csv was read from), not directly under ../input.
test_df =  pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')
test_df = add_datetime_info(test_df)
test_df = add_airport_dist(test_df)
test_df['distance'] = sphere_dist(test_df['pickup_latitude'], test_df['pickup_longitude'], 
                                   test_df['dropoff_latitude'] , test_df['dropoff_longitude'])

# Keep the keys for the submission file; the model was trained without
# 'key' and 'pickup_datetime', so drop them from the prediction features too.
test_key = test_df['key']
x_pred = test_df.drop(columns=['key', 'pickup_datetime'])

#Predict from test set
# Use only the boosting rounds up to the best early-stopping iteration.
# iteration_range replaces the ntree_limit / best_ntree_limit pair, which
# newer XGBoost releases no longer accept.
prediction = model.predict(xgb.DMatrix(x_pred), iteration_range=(0, model.best_iteration + 1))

## Create submission file

In [None]:
submission = pd.DataFrame({
        "key": test_key,
        "fare_amount": prediction.round(2)
})

submission.to_csv('taxi_fare_submission.csv',index=False)
submission.head()