In [1]:
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files
import matplotlib.pyplot as plt
import ssl

In [2]:
def plot_on_map(df, BB, nyc_map, s=10, alpha=0.2):
    """Scatter pickup and dropoff points side by side over a map image.

    Parameters
    ----------
    df : DataFrame-like
        Must expose ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.
    BB : sequence of four numbers
        ``BB[0]``/``BB[1]`` become the x-axis (longitude) limits and
        ``BB[2]``/``BB[3]`` the y-axis (latitude) limits; the whole sequence
        is also used as the image extent.
    nyc_map : image array
        Background image handed to ``imshow``.
    s, alpha :
        Marker size and transparency forwarded to ``scatter``.
    """
    _, panels = plt.subplots(1, 2, figsize=(16, 10))
    layers = (
        (df.pickup_longitude, df.pickup_latitude, 'Pickup locations'),
        (df.dropoff_longitude, df.dropoff_latitude, 'Dropoff locations'),
    )
    for panel, (lons, lats, heading) in zip(panels, layers):
        # Points are drawn above the background image (zorder 1 vs. 0).
        panel.scatter(lons, lats, zorder=1, alpha=alpha, c='r', s=s)
        panel.set_xlim((BB[0], BB[1]))
        panel.set_ylim((BB[2], BB[3]))
        panel.set_title(heading)
        panel.imshow(nyc_map, zorder=0, extent=BB)

In [3]:
def add_travel_vector_features(df):
    """Add the absolute longitude/latitude travel distances to ``df`` in place.

    Two columns are written: ``abs_diff_longitude`` and ``abs_diff_latitude``,
    each the absolute difference between the dropoff and pickup coordinate.
    Nothing is returned; the frame passed in is modified.
    """
    column_specs = (
        ('abs_diff_longitude', 'pickup_longitude', 'dropoff_longitude'),
        ('abs_diff_latitude', 'pickup_latitude', 'dropoff_latitude'),
    )
    for target, start, end in column_specs:
        df[target] = (df[end] - df[start]).abs()

In [4]:
def select_within_boundingbox(df, BB):
    """Build a boolean mask of trips whose pickup and dropoff lie inside ``BB``.

    ``BB[0]``/``BB[1]`` bound the longitudes and ``BB[2]``/``BB[3]`` bound the
    latitudes. Both ends of each range are inclusive. The mask is True only
    when all four coordinates of a row are inside their range.
    """
    lon_low, lon_high = BB[0], BB[1]
    lat_low, lat_high = BB[2], BB[3]

    def _inside(values, low, high):
        # Inclusive range check, element-wise.
        return (values >= low) & (values <= high)

    pickup_ok = (_inside(df.pickup_longitude, lon_low, lon_high)
                 & _inside(df.pickup_latitude, lat_low, lat_high))
    dropoff_ok = (_inside(df.dropoff_longitude, lon_low, lon_high)
                  & _inside(df.dropoff_latitude, lat_low, lat_high))
    return pickup_ok & dropoff_ok

In [6]:
def lonlat_to_xy(longitude, latitude, dx, dy, BB):
    """Convert longitude/latitude values to integer pixel indices of a dx-by-dy grid.

    Parameters
    ----------
    longitude, latitude : array-like with ``astype`` and ``clip`` (numpy array, pandas Series)
        Coordinates to convert.
    dx, dy : int
        Width and height of the target grid in pixels.
    BB : sequence of four numbers
        ``BB[0]``/``BB[1]`` are the longitudes mapped to x = 0 and x = dx;
        ``BB[2]``/``BB[3]`` are the latitudes mapped to y = dy and y = 0
        (the y axis is flipped, so ``BB[3]`` lands on row 0).

    Returns
    -------
    (x, y) : pair of integer arrays, each value within ``[0, dx - 1]`` / ``[0, dy - 1]``.
    """
    x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int')
    y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int')
    # A coordinate exactly on the BB[1] longitude edge or the BB[2] latitude
    # edge evaluates to dx / dy, which is one past the last valid index, and a
    # coordinate outside BB can go negative (which numpy would wrap around).
    # Clamp to the nearest edge pixel so the result is always a safe index.
    return x.clip(0, dx - 1), y.clip(0, dy - 1)


In [7]:
def remove_datapoints_from_water(df, BB=(-74.5, -72.8, 40.5, 41.8), nyc_mask=None):
    """Return only the rows of ``df`` whose pickup and dropoff are both on land.

    Parameters
    ----------
    df : DataFrame
        Must expose ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.
    BB : sequence of four numbers, optional
        Bounding box covered by the mask: ``BB[0]``/``BB[1]`` are the
        longitude edges, ``BB[2]``/``BB[3]`` the latitude edges. The default
        matches the mask image downloaded when ``nyc_mask`` is omitted.
    nyc_mask : 2-D boolean array, optional
        Land/water map with land = True, water = False. When omitted the mask
        image is downloaded on every call; pass a preloaded mask to avoid that.
        If a custom ``BB`` is given, a matching ``nyc_mask`` should be given too.

    Returns
    -------
    DataFrame
        The subset of ``df`` with both endpoints on a land pixel.
    """
    def lonlat_to_xy(longitude, latitude, dx, dy, BB):
        x = (dx * (longitude - BB[0]) / (BB[1] - BB[0])).astype('int')
        y = (dy - dy * (latitude - BB[2]) / (BB[3] - BB[2])).astype('int')
        # Coordinates exactly on the BB[1] / BB[2] edge evaluate to dx / dy,
        # one past the last valid index; clamp so the mask lookup cannot fail.
        return x.clip(0, dx - 1), y.clip(0, dy - 1)

    if nyc_mask is None:
        # read nyc mask and turn into boolean map with
        # land = True, water = False
        # NOTE(review): newer matplotlib releases may reject URL arguments to
        # imread -- confirm against the installed version.
        nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:, :, 0] > 0.9

    # calculate for each lon,lat coordinate the xy coordinate in the mask map
    pickup_x, pickup_y = lonlat_to_xy(df.pickup_longitude, df.pickup_latitude,
                                      nyc_mask.shape[1], nyc_mask.shape[0], BB)
    dropoff_x, dropoff_y = lonlat_to_xy(df.dropoff_longitude, df.dropoff_latitude,
                                        nyc_mask.shape[1], nyc_mask.shape[0], BB)
    # calculate boolean index: True where both endpoints hit a land pixel
    idx = nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x]

    # return only datapoints on land
    return df[idx]

In [8]:
def get_input_matrix(df):
    """Assemble the model's input matrix from ``df``.

    The result has one row per row of ``df`` and three columns, in this order:
    ``abs_diff_longitude``, ``abs_diff_latitude`` and a constant column of ones.
    """
    n_rows = len(df)
    feature_columns = [
        df.abs_diff_longitude,
        df.abs_diff_latitude,
        np.ones(n_rows),  # constant term
    ]
    return np.column_stack(feature_columns)

In [10]:
# SECURITY NOTE: this swaps the default HTTPS context factory for the
# unverified one, so certificate checks are skipped for every request in this
# process that relies on the default context. Presumably done so the image
# downloads further down succeed -- confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context
# print(os.listdir('./'))
# NOTE(review): the 'seaborn-whitegrid' style name may not exist in newer
# matplotlib releases -- confirm against the installed version.
plt.style.use('seaborn-whitegrid')
# Read at most the first 10 million rows of the training file.
train_df =  pd.read_csv('./train.csv', nrows = 10_000_000)
print(train_df.dtypes)
# Adds the abs_diff_longitude / abs_diff_latitude columns to train_df in place.
add_travel_vector_features(train_df)

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object


In [11]:
# Show the number of missing values per column, then drop every row that
# contains at least one missing value.
print(train_df.isna().sum())
print('Old size: %d' % train_df.shape[0])
train_df = train_df.dropna(axis=0, how='any')
print('New size(without NaN): %d' % train_df.shape[0])

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude     69
dropoff_latitude      69
passenger_count        0
abs_diff_longitude    69
abs_diff_latitude     69
dtype: int64
Old size: 10000000
New size(without NaN): 9999931


In [12]:
# Discard trips whose longitude or latitude displacement is 5 degrees or more;
# a jump that large is impossible within NYC.
print('Old size: %d' % train_df.shape[0])
train_df = train_df.loc[
    train_df['abs_diff_longitude'].lt(5.0) & train_df['abs_diff_latitude'].lt(5.0)
]
print('New size(diff longitude & latitude not to large): %d' % train_df.shape[0])

Old size: 9999931
New size(diff longitude & latitude not to large): 9979187


In [13]:
# Drop rows with a negative fare. Zero fares are kept, even though the label
# printed below reads "> 0".
print('Old size: %d' % train_df.shape[0])
train_df = train_df.loc[train_df['fare_amount'].ge(0)]
print('New size(fare_amount > 0): %d' % train_df.shape[0])

Old size: 9979187
New size(fare_amount > 0): 9978783


In [15]:
# Load the image of the NYC map. BB holds the longitude range (BB[0], BB[1])
# and latitude range (BB[2], BB[3]) that the image file name refers to.
BB = (-74.5, -72.8, 40.5, 41.8)
nyc_map = plt.imread('https://aiblog.nl/download/nyc_-74.5_-72.8_40.5_41.8.png')
# NOTE(review): no figure is created in this cell before this call --
# presumably a leftover; confirm it can go.
plt.show()

# Load an extra image to zoom in on NYC, with its own bounding box.
BB_zoom = (-74.3, -73.7, 40.5, 40.9)
nyc_map_zoom = plt.imread('https://aiblog.nl/download/nyc_-74.3_-73.7_40.5_40.9.png')
# Remove trips whose pickup or dropoff lies outside BB (edges inclusive).
# This reassigns train_df, so re-running the cell reports the already
# filtered size as "Old size".
print('Old size: %d' % len(train_df))
train_df = train_df[select_within_boundingbox(train_df, BB)]
print('New size(in NYC): %d' % len(train_df))

# Boolean mask built from the first channel of the mask image: True where the
# value is above 0.9 (used downstream as land = True, water = False).
nyc_mask = plt.imread('https://aiblog.nl/download/nyc_mask-74.5_-72.8_40.5_41.8.png')[:,:,0] > 0.9


Old size: 9787199
New size(in NYC): 9787199


In [16]:
# Map every pickup and dropoff coordinate to its pixel in the land/water mask.
pickup_x, pickup_y = lonlat_to_xy(train_df.pickup_longitude, train_df.pickup_latitude,
                                  nyc_mask.shape[1], nyc_mask.shape[0], BB)
dropoff_x, dropoff_y = lonlat_to_xy(train_df.dropoff_longitude, train_df.dropoff_latitude,
                                    nyc_mask.shape[1], nyc_mask.shape[0], BB)
# True where both endpoints of the trip fall on a land pixel.
idx = (nyc_mask[pickup_y, pickup_x] & nyc_mask[dropoff_y, dropoff_x])
print("Number of trips in water: {}".format(np.sum(~idx)))
# Remove trips with an endpoint in water. The land index computed above is
# reused directly; calling remove_datapoints_from_water here would download
# the mask a second time and recompute the very same index.
print('Old size: %d' % len(train_df))
train_df = train_df[idx]
print('New size(Not in Water): %d' % len(train_df))

Number of trips in water: 2025
Old size: 9787199
New size(Not in Water): 9785174


In [17]:
# Keep only trips with a plausible passenger count: more than 0 and fewer than 7.
print('Old size: %d' % train_df.shape[0])
train_df = train_df.loc[
    train_df['passenger_count'].gt(0.0) & train_df['passenger_count'].lt(7.0)
]
print('New size(reasonable passenger count): %d' % train_df.shape[0])

Old size: 9785174
New size(reasonable passenger count): 9750645


In [18]:
# Save the processed training data (the DataFrame index is written as well).
train_df.to_csv('Train_Processed.csv')

train_X = get_input_matrix(train_df)
train_y = np.array(train_df['fare_amount'])
print(train_X.shape)
print(train_y.shape)

# Fit the linear model with numpy's least-squares solver.
(w, _, _, _) = np.linalg.lstsq(train_X, train_y, rcond = None)
print(w)
# Cross-check with the normal equations (X^T X) w = X^T y. Solving the 3x3
# system directly avoids forming an explicit inverse and the 3 x N temporary
# that inv(X^T X) @ X^T would allocate.
w_OLS = np.linalg.solve(np.matmul(train_X.T, train_X), np.matmul(train_X.T, train_y))
print(w_OLS)
# NOTE(review): looks like a leftover debug marker -- confirm it can be removed.
print('ww')

(9750645, 3)
(9750645,)
[167.54495465 118.43263208   4.98442274]
[167.54495465 118.43263208   4.98442274]
ww


In [19]:
# Load the test set and derive the same features and input matrix that were
# used for training.
test_df = pd.read_csv('./test_copy.csv')
add_travel_vector_features(test_df)
test_X = get_input_matrix(test_df)
print(test_X)

# Apply the fitted weights (w) to the test matrix and round the predicted
# fares to two decimals.
test_y_predictions = (test_X @ w).round(2)

# Assemble the two-column submission and write it without the index column.
submission = pd.DataFrame(
    {'key': test_df['key'], 'fare_amount': test_y_predictions},
    columns=['key', 'fare_amount'],
)
submission.to_csv('submission.csv', index=False)
print(os.listdir('.'))

[[0.00811005 0.01996994 1.        ]
 [0.01202393 0.01981735 1.        ]
 [0.00287    0.005121   1.        ]
 ...
 [0.20185852 0.07959747 1.        ]
 [0.04639435 0.06629944 1.        ]
 [0.01226044 0.00514984 1.        ]]
['XgboostWithZhibinData.pdf', '1_alldata_allfeatures.pkl', 'Untitled1.ipynb', 'add_features.ipynb', 'preprocess_and_linear_model.py', '.DS_Store', 'process_test.py', 'Untitled.ipynb', 'record.txt', 'Record.docx', 'XgboostWithZhibinData.ipynb', 'test.csv', 'submission.csv', 'Preprocess_data_4.23.py', 'pre.txt', 'Train_Processed_4.23.csv.zip', '__pycache__', 'test.py', 'new-york-city-taxi-fare-prediction', 'GCP-Coupons-Instructions.rtf', 'test_new_feature.csv', 'test_copy.csv', 'pickle_model.pkl', 'Train_Processed_4.23.csv', 'train.csv', 'test_new_feature.csv.zip', '.ipynb_checkpoints', 'venv', 'Zhibin_xgboost.py', '1_alldata_allfeatures.csv', 'sample_submission.csv', 'nyc-taxi-fare-data-exploration.ipynb', '.idea']
