In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import datetime as dt
from pandas.tseries.holiday import USFederalHolidayCalendar
import calendar
plt.style.use('ggplot') 
import xgboost as xgb

In [None]:
# Format of the leading 19 characters ("YYYY-MM-DD HH:MM:SS") of each timestamp;
# anything after that prefix (e.g. a timezone suffix) is ignored, so the parsed
# values are timezone-naive.
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Kept because a later cell still passes `parser` to read_csv. The `pd.datetime`
# alias no longer exists in current pandas, so the stdlib class is used instead.
parser = lambda x: dt.datetime.strptime(x[:19], DATETIME_FORMAT)

# Load the first 2M training rows, skipping column 0 (presumably the `key` id
# column -- confirm against the CSV header).
df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/train.csv', nrows=2_000_000, usecols=[1,2,3,4,5,6,7])

# Parse the timestamps in one vectorised call instead of the deprecated
# `date_parser=` hook, which runs a Python-level strptime once per row.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'].str[:19], format=DATETIME_FORMAT)

In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
df.info()

In [None]:
# Summary statistics of the raw columns, useful for spotting out-of-range values
# before any filtering is applied.
df.describe()

In [None]:
def clean(df):
    """Return a copy of ``df`` restricted to plausible taxi trips.

    A row is kept only when all of the following hold:

    * pickup and dropoff longitude lie in [-76, -72] and latitude in [38, 42];
    * ``fare_amount`` is in (0, 250];
    * pickup and dropoff are not the exact same point;
    * ``passenger_count`` is in [1, 7].

    Rows with a missing value in any of these columns fail the range checks
    and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude``, ``dropoff_latitude``, ``fare_amount`` and
        ``passenger_count``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index labels are
        kept). The input frame is not modified.
    """
    pickup_lon, pickup_lat = df['pickup_longitude'], df['pickup_latitude']
    dropoff_lon, dropoff_lat = df['dropoff_longitude'], df['dropoff_latitude']

    # Series.between is inclusive on both ends and evaluates to False for NaN.
    in_bounds = (
        pickup_lon.between(-76, -72)
        & dropoff_lon.between(-76, -72)
        & pickup_lat.between(38, 42)
        & dropoff_lat.between(38, 42)
    )

    valid_fare = (df['fare_amount'] > 0) & (df['fare_amount'] <= 250)

    # A trip is degenerate only when BOTH coordinates are unchanged. Requiring
    # each coordinate to differ on its own would also discard real trips that
    # happen to run exactly along one longitude or latitude.
    moved = (dropoff_lon != pickup_lon) | (dropoff_lat != pickup_lat)

    valid_passengers = df['passenger_count'].between(1, 7)

    # Explicit copy so that adding feature columns to the result later does not
    # raise SettingWithCopyWarning.
    return df.loc[in_bounds & valid_fare & moved & valid_passengers].copy()

In [None]:
def add_coordinate_features(df):
    """Add pickup-to-dropoff displacement columns to ``df`` and return it.

    Four columns are written onto the frame that was passed in (it is modified
    in place), in this order:

    * ``latdiff``   -- absolute latitude difference
    * ``londiff``   -- absolute longitude difference
    * ``manhattan`` -- ``latdiff + londiff``
    * ``euclidean`` -- ``sqrt(latdiff**2 + londiff**2)``

    All values are in the same unit as the coordinate columns; no conversion
    to a physical distance is performed.
    """
    lat_gap = (df['pickup_latitude'] - df['dropoff_latitude']).abs()
    lon_gap = (df['pickup_longitude'] - df['dropoff_longitude']).abs()

    df['latdiff'] = lat_gap
    df['londiff'] = lon_gap

    # Both distance proxies are derived from the two gaps computed above.
    df['manhattan'] = lat_gap + lon_gap
    df['euclidean'] = (lat_gap ** 2 + lon_gap ** 2) ** 0.5
    return df

In [None]:
def add_datatime_features(df):
    """Derive calendar features from ``pickup_datetime`` and drop that column.

    Columns added (written onto the frame that was passed in):

    * ``year``, ``month``, ``hour`` -- components of the pickup timestamp
    * ``pickup_day_of_week``        -- Monday=0 ... Sunday=6
    * ``holidat_or_not``            -- 1 if the pickup *date* is a US federal
      holiday between 2009-01-01 and 2015-12-31, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``pickup_datetime`` column of timestamps.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``pickup_datetime``. Because that column is
        removed, the function cannot be applied twice to its own result.
    """
    pickup = pd.to_datetime(df['pickup_datetime'])
    if pickup.dt.tz is not None:
        # Holiday dates are timezone-naive; compare on wall-clock time.
        pickup = pickup.dt.tz_localize(None)

    # Vectorised accessors instead of a Python-level lambda per row.
    df['year'] = pickup.dt.year
    df['month'] = pickup.dt.month
    df['hour'] = pickup.dt.hour
    df['pickup_day_of_week'] = pickup.dt.weekday

    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start='2009-01-01', end='2015-12-31')

    # Holidays are midnight timestamps, so the time of day must be stripped
    # before the membership test; otherwise only pickups at exactly 00:00:00
    # on a holiday would ever be flagged.
    df['holidat_or_not'] = pickup.dt.normalize().isin(holidays).astype(int)
    df = df.drop('pickup_datetime', axis=1)
    return df

In [None]:
# Drop implausible rows. `df` is rebound, so the unfiltered frame is no longer
# reachable after this cell has run.
df = clean(df)

In [None]:
# Add the latdiff / londiff / manhattan / euclidean displacement columns.
df = add_coordinate_features(df)

In [None]:
# Add calendar features and drop `pickup_datetime`.
# NOTE: this cell is not re-runnable on its own output -- the function reads the
# `pickup_datetime` column that it removes, so a second run fails.
df = add_datatime_features(df)

In [None]:
# Summary statistics again, now covering the cleaned rows and engineered columns.
df.describe()

In [None]:
# Pairwise correlation between all columns (target included), drawn as a heatmap.
matrix = df.corr()
sns.heatmap(matrix)

In [None]:
# Fare aggregated per pickup year (seaborn's default bar estimator is the mean).
sns.barplot(x = 'year', y = 'fare_amount',  data = df)

In [None]:
# Fare aggregated per pickup hour of day.
sns.barplot(x = 'hour', y = 'fare_amount',  data = df)

In [None]:
# Fare aggregated by the holiday flag (0 = not flagged, 1 = flagged as holiday).
sns.barplot(x = 'holidat_or_not', y = 'fare_amount',  data = df)

In [None]:
# Fare aggregated per weekday (0 = Monday ... 6 = Sunday).
sns.barplot(x = 'pickup_day_of_week', y = 'fare_amount',  data = df)

In [None]:
#info = df[['fare_amount', 'hour']].copy()
#holiday_count = info.groupby('hour').aggregate(np.mean)
#holiday_count.plot(kind='bar')

In [None]:
# Number of trips per passenger count. The column is passed by keyword, matching
# the bar plots in this notebook: seaborn 0.11 warns when the vector is passed
# positionally, and from 0.12 on the first positional argument is `data`, not `x`.
sns.countplot(x='passenger_count', data=df)

In [None]:
# Kernel density estimate of the target (fare) distribution.
sns.kdeplot(df["fare_amount"])

In [None]:
# Training matrix for XGBoost: every column except the target is a feature,
# and `fare_amount` is the label to predict.
dtrain = xgb.DMatrix(
    data=df.drop(columns='fare_amount'),
    label=df['fare_amount'],
)

In [None]:
 params = {'eval_metric': 'rmse',
              'max_depth': 7,
              'subsample': 0.8,
              'eta': 0.1,
              'gamma': 1.0,
              'colsample_bytree': 0.9}

In [None]:
# Fit the booster on `dtrain` with `params` for a fixed 250 boosting rounds
# (no validation set or early stopping is used here).
model2 = xgb.train(params, dtrain, num_boost_round=250)

In [None]:
# Feature importance of the trained booster, drawn on an explicit 10x7 axes.
fig, ax = plt.subplots(figsize=(10,7))
xgb.plot_importance(model2, height=0.6, ax=ax) 
plt.show()

In [None]:
# Load the competition test set, indexed by `key` (needed for the submission).
test = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv').set_index('key')

# Parse the first 19 characters ("YYYY-MM-DD HH:MM:SS") of each timestamp in one
# vectorised call. This avoids the deprecated `date_parser=` hook and any
# dependency on the removed `pd.datetime` alias; the result is timezone-naive.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'].str[:19], format='%Y-%m-%d %H:%M:%S')

# Same feature engineering as for the training frame. clean() is not applied
# here, so every test row receives a prediction.
test = add_coordinate_features(test)
test = add_datatime_features(test)

dtest = xgb.DMatrix(test)
y_pred_test = model2.predict(dtest)

In [None]:
# Assemble the submission: one predicted fare per test `key`, written without
# the DataFrame's own integer index.
holdout = pd.DataFrame({'key': test.index, 'fare_amount': y_pred_test})
holdout.to_csv('submission.csv', index=False)