In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import the packages that we will be using
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# # Set columns to most suitable type to optimize for memory usage
# traintypes = {'fare_amount': 'float32',
#               'pickup_datetime': 'str', 
#               'pickup_longitude': 'float32',
#               'pickup_latitude': 'float32',
#               'dropoff_longitude': 'float32',
#               'dropoff_latitude': 'float32',
#               'passenger_count': 'uint8'}

In [None]:
# Read the first 10 million training rows and the complete test set
# from the competition's input folder.
train = pd.read_csv(
    "/kaggle/input/new-york-city-taxi-fare-prediction/train.csv",
    nrows=10_000_000,
)
test = pd.read_csv(
    "/kaggle/input/new-york-city-taxi-fare-prediction/test.csv",
)

In [None]:
# (rows, columns) of the training frame right after loading.
train.shape

In [None]:
# (rows, columns) of the test frame right after loading.
test.shape

In [None]:
# Preview the first 10 training rows.
train.head(10)

In [None]:
# Summary statistics of the numeric training columns (useful for spotting outliers).
train.describe()

In [None]:
# Count the missing values in each training column, largest count first.
train_missing = train.isna().sum()
train_missing.sort_values(ascending=False)

In [None]:
# Count the missing values in each test column, largest count first.
test_missing = test.isna().sum()
test_missing.sort_values(ascending=False)

In [None]:
# Drop every training row that contains at least one missing value.
# dropna() replaces the original `train.isnull().any(1)` lookup: passing the
# axis positionally to DataFrame.any() is rejected by pandas >= 2.0, where
# the argument is keyword-only.
train = train.dropna(axis=0, how='any')

In [None]:
# Shape of the training frame after rows with missing values were dropped.
train.shape

In [None]:
# Summary statistics of the target column (fare_amount).
train['fare_amount'].describe()

In [None]:
# Tally how many fares are negative (True) versus non-negative (False).
# The original author's run reported 38 negative fare_amount values.
from collections import Counter
is_negative_fare = train['fare_amount'].lt(0)
Counter(is_negative_fare)

In [None]:
# Keep only the rows whose fare is not negative, then show the resulting shape.
negative_fare = train['fare_amount'] < 0
train = train[~negative_fare]
train.shape

In [None]:
# Re-check the target: after the drop above the minimum fare should no longer be negative.
train['fare_amount'].describe()

In [None]:
# Column names of the training frame.
train.columns

In [None]:
# Column names of the test frame.
test.columns

In [None]:
# Data type of every training column.
train.dtypes

Next check the passenger_count variable

In [None]:
# Summary statistics of passenger_count.
train['passenger_count'].describe()

In [None]:
# Show the rows with more than 6 passengers. The original note reports a
# passenger_count of 208 among them; such rows are to be dropped.
train[train['passenger_count']>6]

In [None]:
# Discard every trip that reports more than six passengers.
train = train.loc[train["passenger_count"].le(6)]

In [None]:
# Re-check passenger_count after removing rows with more than 6 passengers.
train['passenger_count'].describe()

In [None]:
# Next, explore the pickup coordinates, starting with the latitude.
train['pickup_latitude'].describe()

緯度は-90から90まで。
経度は-180から180まで。
異常値をフィルタリングする。

In [None]:
# Rows whose pickup latitude is below -90 degrees.
train[train['pickup_latitude']<-90]

In [None]:
# Rows whose pickup latitude is above 90 degrees.
train[train['pickup_latitude']>90]

In [None]:
# Remove rows whose pickup latitude lies below -90 or above 90 degrees.
pickup_lat = train['pickup_latitude']
pickup_lat_invalid = (pickup_lat < -90) | (pickup_lat > 90)
train = train[~pickup_lat_invalid]

In [None]:
# Shape after removing rows with an out-of-range pickup latitude.
train.shape

In [None]:
# Same inspection for the pickup longitude.
train['pickup_longitude'].describe()

In [None]:
# Rows whose pickup longitude is below -180 degrees.
train[train['pickup_longitude']<-180]

In [None]:
# Rows whose pickup longitude is above 180 degrees.
train[train['pickup_longitude']>180]

In [None]:
# Remove rows whose pickup longitude lies below -180 or above 180 degrees.
pickup_lon = train['pickup_longitude']
pickup_lon_invalid = (pickup_lon < -180) | (pickup_lon > 180)
train = train[~pickup_lon_invalid]

In [None]:
# Shape after removing rows with an out-of-range pickup longitude.
train.shape

In [None]:
# Same checks for the dropoff coordinates: rows whose dropoff latitude is below -90 degrees.
train[train['dropoff_latitude']<-90]

In [None]:
# Rows whose dropoff latitude is above 90 degrees.
train[train['dropoff_latitude']>90]

In [None]:
# Remove rows whose dropoff latitude lies below -90 or above 90 degrees.
dropoff_lat = train['dropoff_latitude']
dropoff_lat_invalid = (dropoff_lat < -90) | (dropoff_lat > 90)
train = train[~dropoff_lat_invalid]

In [None]:
# Shape after removing rows with an out-of-range dropoff latitude.
train.shape

In [None]:
# Show rows whose dropoff longitude lies outside the valid -180..180 degree range.
# The original compared dropoff_latitude against +/-180 (latitude is already
# bounded by +/-90 above) and applied `|` to two DataFrames instead of to two
# boolean masks; the masks are combined first and the frame is indexed once.
dropoff_lon = train['dropoff_longitude']
train[(dropoff_lon < -180) | (dropoff_lon > 180)]

ニューヨーク市外の緯度、経度でのピックアップとドロップオフを除外する

In [None]:
# Drop any records with zero longitude/latitude or with coordinates outside a
# rough bounding box around New York (longitude between -83 and -70, latitude
# between 36 and 46). Remember that longitude for NYC is always negative.
# This is a coarse cut; a significant number of outliers remains to be discussed.
inside_box = (
    train["pickup_longitude"].gt(-83) & train["pickup_longitude"].lt(-70)
    & train["pickup_latitude"].gt(36) & train["pickup_latitude"].lt(46)
    & train["dropoff_longitude"].gt(-83) & train["dropoff_longitude"].lt(-70)
    & train["dropoff_latitude"].gt(36) & train["dropoff_latitude"].lt(46)
)
train = train[inside_box]

In [None]:
# Confirm via the min/max rows that all longitude and latitude values now
# fall within the bounding box applied above.
train.describe()

In [None]:
# Print the shape of the training data after dropping the outlier coordinates.
print("Train data shape: ", train.shape)

key と pickup_datetime を日付型に変換する

In [None]:
# `key` and `pickup_datetime` hold timestamps but are loaded as object (string)
# columns; parse both into pandas datetimes.
for time_column in ('key', 'pickup_datetime'):
    train[time_column] = pd.to_datetime(train[time_column])

In [None]:
# Apply the same datetime parsing to the test data.
for time_column in ('key', 'pickup_datetime'):
    test[time_column] = pd.to_datetime(test[time_column])

create columns for the following -
* year
* month
* date
* hour
* day of week

In [None]:
data = [train, test]
# New column name -> attribute of the `.dt` accessor that supplies its values.
# Insertion order fixes the order in which the columns are appended.
calendar_parts = {
    'Year': 'year',
    'Month': 'month',
    'Date': 'day',
    'Day of Week': 'dayofweek',
    'Hour': 'hour',
}
for frame in data:
    pickup_time = frame['pickup_datetime'].dt
    for column, part in calendar_parts.items():
        frame[column] = getattr(pickup_time, part)

In [None]:
# Raw datetime columns cannot be fed to the model directly; the calendar
# features extracted from pickup_datetime are used instead, so both
# timestamp columns are removed from train and test.
raw_time_columns = ['key', 'pickup_datetime']
train = train.drop(columns=raw_time_columns)
test = test.drop(columns=raw_time_columns)

In [None]:
# Check the training dtypes after the datetime columns were replaced by calendar features.
train.dtypes

In [None]:
# Same dtype check for the test frame.
test.dtypes

In [None]:
# Preview the first training rows with the new calendar columns.
train.head()

In [None]:
# Preview the first test rows with the new calendar columns.
test.head()

緯度と経度が与えられているとき、球面上の2点間の距離はHaversineの式で計算することができる。

haversine(θ) = sin²(θ/2)

結局、φは緯度、λは経度、Rは地球の半径（平均半径＝6,371km）で、緯度と経度の座標（ここではA、B）を含めると、以下の形式に落ち着く。

a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)

c = 2 * atan2( √a, √(1−a) )

d = R ⋅ c

d = Haversine distance

In [None]:
def haversine_distance(lat1, long1, lat2, long2, frames=None, radius=6371):
    """Add a haversine (great-circle) ``H_Distance`` column to each frame.

    Parameters
    ----------
    lat1, long1 : str
        Names of the columns holding the start latitude / longitude in degrees.
    lat2, long2 : str
        Names of the columns holding the end latitude / longitude in degrees.
    frames : iterable of DataFrame, optional
        Frames to update in place. When omitted, the notebook-level ``train``
        and ``test`` frames are used, which is what the function always did.
    radius : float, optional
        Radius of the sphere; the distance is expressed in the same unit.
        Defaults to 6371 (mean Earth radius in kilometers); use 3959 for miles.

    Returns
    -------
    The distance values computed for the last frame processed, or ``None``
    when ``frames`` is empty.
    """
    if frames is None:
        # Resolved at call time so the current notebook globals are used.
        frames = [train, test]

    d = None
    for frame in frames:
        phi1 = np.radians(frame[lat1])
        phi2 = np.radians(frame[lat2])

        delta_phi = np.radians(frame[lat2] - frame[lat1])
        delta_lambda = np.radians(frame[long2] - frame[long1])

        # a = sin^2((phiB - phiA)/2) + cos(phiA) * cos(phiB) * sin^2((lambdaB - lambdaA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
        # Floating-point rounding can push `a` marginally outside [0, 1], which
        # would make sqrt(1 - a) NaN for (near-)antipodal points; clamp it.
        a = a.clip(lower=0.0, upper=1.0)

        # c = 2 * atan2(sqrt(a), sqrt(1 - a))
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        # d = R * c, in the unit of `radius`
        d = radius * c
        frame['H_Distance'] = d
    return d

Haversine distanceを計算し、H_Distance列を作成する

In [None]:
# Compute the pickup-to-dropoff haversine distance. The function stores the
# result as an H_Distance column in both train and test; the value shown
# below the cell is the function's return value.
haversine_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [None]:
# First 10 haversine distances in the training frame.
train['H_Distance'].head(10)

In [None]:
# First 10 haversine distances in the test frame.
test['H_Distance'].head(10)

In [None]:
# Preview the training frame including the new H_Distance column.
train.head(10)

In [None]:
# Preview the test frame including the new H_Distance column.
test.head(10)

In [None]:
# x_train: every column of train except the target (fare_amount)
# y_train: the target column (fare_amount) as a NumPy array
# x_test:  the test frame itself (same object, not a copy)
is_feature = train.columns != 'fare_amount'
x_train = train.loc[:, is_feature]
y_train = train['fare_amount'].to_numpy()
x_test = test

In [None]:
# Shape of the training feature matrix.
x_train.shape

In [None]:
# Feature names used for training.
x_train.columns

In [None]:
# Shape of the target array; its length should match the row count of x_train.
y_train.shape

In [None]:
# Shape of the test feature matrix.
x_test.shape

In [None]:
# Feature names of the test set; they should line up with x_train.columns.
x_test.columns

lightgbmを利用して推定


In [None]:
import lightgbm as lgbm

In [None]:
# Hold out 30% of the training data as validation data for model training.
# Note: this rebinds x_train / y_train to the remaining 70%.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.3, random_state=10
)

# Wrap both splits in the Dataset format LightGBM expects; the validation
# set references the training set.
lgb_train = lgbm.Dataset(data=x_train, label=y_train)
lgb_eval = lgbm.Dataset(data=x_valid, label=y_valid, reference=lgb_train)

In [None]:
# LightGBM training parameters for the fare regression model.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
    'verbose': 0,
    'num_leaves': 256,
    'learning_rate': 0.05,
    'max_depth': -1,          # no explicit depth limit; tree size is bounded by num_leaves
    # Was misspelled 'reg_aplha', so the L1 regularisation term was never applied.
    'reg_alpha': 1,
    'metric': 'rmse',
    # NOTE(review): scale_pos_weight is documented for classification
    # objectives; it presumably has no effect on this regression task.
    'scale_pos_weight': 1,
    'min_child_samples': 20
}

In [None]:
# Allocate a zero-filled prediction array with one entry per test row and show its shape.
n_test_rows = len(x_test)
pred_test_y = np.zeros(n_test_rows)
pred_test_y.shape

In [None]:
# Build the LightGBM training Dataset from the 70% training split.
# The `silent` argument was dropped: it is deprecated in LightGBM 3.3 and
# removed in 4.0 (where passing it raises TypeError). Log verbosity is
# controlled through the 'verbose' entry of the training params instead.
train_set = lgbm.Dataset(x_train, y_train)
train_set

In [None]:
# Train for at most 300 boosting rounds while scoring RMSE on the held-out
# validation split (lgb_eval). Previously the validation data was never passed
# to training, so 30% of the data went unused and model.best_iteration carried
# no information. Early stopping halts training once the validation metric has
# not improved for 50 rounds and records the best iteration.
model = lgbm.train(
    params,
    train_set=train_set,
    num_boost_round=300,
    valid_sets=[lgb_eval],
    valid_names=['valid'],
    callbacks=[lgbm.early_stopping(stopping_rounds=50)],
)

In [None]:
# Print the trained booster object (sanity check that training produced a model).
print(model)

In [None]:
# Predict fares for the test set, limiting the trees used to model.best_iteration.
# NOTE(review): best_iteration is presumably only informative when training ran
# with a validation set and early stopping -- confirm against the training cell.
pred_test_y = model.predict(x_test, num_iteration = model.best_iteration)

In [None]:
# Print the predicted fares (NumPy abbreviates long arrays when printing).
print(pred_test_y)

In [None]:
# Load the sample submission as a template, replace its fare_amount column
# with the model predictions, write the result to the working directory and
# preview the first 20 rows.
submission = pd.read_csv(
    '/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv'
).assign(fare_amount=pred_test_y)
submission.to_csv('submission_LGB.csv', index=False)
submission.head(20)