In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
# Any results you write to the current directory are saved as output.

 # Dataset

In [None]:
import numpy as np
import pandas as pd

In [None]:
train = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/train.csv", nrows = 3000000)
test = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/test.csv")

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.head(10)

In [None]:
train.describe()

In [None]:
#check for missing values in train data
train.isnull().sum().sort_values(ascending=False)

# Dealing with missing data

In [None]:

test.isnull().sum().sort_values(ascending=False)

 # drop the missing values


In [None]:
train = train.dropna(subset=['dropoff_latitude'])

In [None]:
train.isnull().sum().sort_values(ascending=False)

# Handling the target column fare amount 

In [None]:
train['fare_amount'].describe()


Fare amount has negative values, which don't make sense. Remove these rows.

In [None]:
#38 fields have negative fare_amount values.
from collections import Counter
Counter(train['fare_amount']<0)


In [None]:
# Keep every row whose fare is not negative
train = train.loc[~(train['fare_amount'] < 0)]

In [None]:
#no more negative values in the fare field
train['fare_amount'].describe()

In [None]:
#highest fare is $500
train['fare_amount'].sort_values(ascending=False)

In [None]:
train.describe()

# Handling Passenger Count

In [None]:
train['passenger_count'].describe()

Next check the passenger_count variable

In [None]:
train[train['passenger_count']>6]

In [None]:
# Keep every row that does not report more than 6 passengers
train = train.loc[~(train['passenger_count'] > 6)]

In [None]:
train[train['passenger_count']>6]

In [None]:
#much neater now! The maximum number of passengers is 6, which makes sense if the cab is an SUV :)
train['passenger_count'].sort_values(ascending=False)

### Dealing with longitude and latitude

Quick Googling gave me this info

   - Latitudes range from -90 to 90.
   - Longitudes range from -180 to 180.

The `describe()` output above clearly shows some outliers. Let's filter them out.


In [None]:
test['passenger_count'].sort_values(ascending= False)

# Handling Latitude and Longitudes

In [None]:
print(f'Rows before removing coordinate outliers - {train.shape[0]}')

# Keep only training rows whose coordinates lie inside the range covered by the test set,
# one coordinate column at a time (inclusive bounds).
for coord_col in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'):
    lower, upper = test[coord_col].min(), test[coord_col].max()
    train = train[train[coord_col].between(lower, upper)]

print(f'Rows after removing coordinate outliers - {train.shape[0]}')

In [None]:
train.describe()

# Feature Engineering 
- total_distance: distance from pickup to dropoff. The longer the trip, the more expensive.
- Extract information from datetime (day of week, month, hour, day). Taxi fares change day/night or on weekdays/holidays.
- Add columns indicating distance from pickup or dropoff coordinates to airports. Trips from/to an airport have a fixed fee.


In [None]:
def distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Haversine (great-circle) distance between pickup and dropoff coordinates.

    All four arguments are coordinates in degrees. They are only passed
    through numpy ufuncs and arithmetic, so scalars and array-likes such as
    pandas Series can be mixed.

    Returns the distance in kilometres, using an Earth radius of 6371 km.
    """
    earth_radius_km = 6371

    # Work in radians from here on
    lat1 = np.radians(pickup_lat)
    lon1 = np.radians(pickup_lon)
    lat2 = np.radians(dropoff_lat)
    lon2 = np.radians(dropoff_lon)

    # Half of the latitude / longitude differences, as used by the haversine formula
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    hav = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def add_airport_dist(dataset):
    """
    Add, for each airport, the smaller of the pickup and dropoff distances to it.

    The distances are computed with ``distance`` and stored in the columns
    ``jfk_dist``, ``ewr_dist`` and ``lga_dist``. The columns are written onto
    ``dataset`` itself, which is also returned.

    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    """
    # (latitude, longitude) of each airport, keyed by the output column prefix
    airports = {
        'jfk': (40.639722, -73.778889),
        'ewr': (40.6925, -74.168611),
        'lga': (40.77725, -73.872611),
    }

    pickup_lat, pickup_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    dropoff_lat, dropoff_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    # Compute all distances first, then attach the new columns
    nearest = {}
    for code, (airport_lat, airport_lon) in airports.items():
        from_pickup = distance(pickup_lat, pickup_lon, airport_lat, airport_lon)
        to_dropoff = distance(airport_lat, airport_lon, dropoff_lat, dropoff_lon)
        # Row-wise minimum of the two distances
        nearest[f'{code}_dist'] = pd.concat([from_pickup, to_dropoff], axis=1).min(axis=1)

    for column, values in nearest.items():
        dataset[column] = values

    return dataset
    
def add_datetime_info(dataset):
    """
    Parse ``pickup_datetime`` and derive calendar features from it.

    The ``pickup_datetime`` column (strings in the form
    ``"%Y-%m-%d %H:%M:%S UTC"``) is replaced by its parsed datetime values, and
    the columns ``hour``, ``day``, ``month``, ``weekday`` and ``year`` are added.
    ``dataset`` is modified in place and returned.
    """
    dataset['pickup_datetime'] = pd.to_datetime(
        dataset['pickup_datetime'], format="%Y-%m-%d %H:%M:%S UTC"
    )

    # Each feature column is named after the ``.dt`` accessor attribute it is read from
    for part in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[part] = getattr(dataset.pickup_datetime.dt, part)

    return dataset



In [None]:
train = add_datetime_info(train)
train = add_airport_dist(train)
train['distance'] = distance(train['pickup_latitude'], train['pickup_longitude'], 
                                   train['dropoff_latitude'] , train['dropoff_longitude'])

train.head()

In [None]:
train.shape

In [None]:
train.sort_values(by = 'distance',ascending =False).head(100)

In [None]:
train.distance[(train.distance==0)].count()

In [None]:
# Count rows whose pickup and dropoff coordinates differ but whose computed distance is 0.
# Pickup longitude must be compared with dropoff *longitude* (it was previously compared
# with dropoff latitude, which made the condition true for virtually every row).
train[(train.pickup_latitude != train.dropoff_latitude) &
      (train.pickup_longitude != train.dropoff_longitude) &
      (train.distance == 0)].count()

### Let's look at rows where fare and distance are both 0. Based on the table above, we will delete them, as they give us no useful information about the data.

In [None]:
train[(train['distance']==0)&(train['fare_amount']==0)]

In [None]:
# Keep every row except those where both the distance and the fare are 0
train = train.loc[~((train['distance'] == 0) & (train['fare_amount'] == 0))]

In [None]:
train[(train['distance']==0)&(train['fare_amount']==0)]

In [None]:
# good

In [None]:
sns.distplot(a=train.fare_amount)

In [None]:
train['fare_amount'].skew()

In [None]:
# lets create a copy of train set
train_data = train.copy()

In [None]:
train_data.shape

### Let's drop `key` and `pickup_datetime`; we don't need them now that we have extracted the datetime features

In [None]:
train_data.drop(columns=['key', 'pickup_datetime'],inplace=True)

In [None]:
train_data.head()

In [None]:
train_data.sort_values(by = 'fare_amount',ascending =False).head(100)

# Let's look for correlation with fare_amount

In [None]:
corr_matrix = train_data.corr()

In [None]:
corr_matrix['fare_amount']

In [None]:
train_data.sort_values(by = 'passenger_count',ascending =True).head(10)

### Let's do EDA (Exploratory Data Analysis). The following are my considerations -
- Does the number of passengers affect the fare?
- Does the date and time of pickup affect the fare?
- Does the day of the week affect the fare?
- Does the distance travelled affect the fare?


In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10,7))
plt.hist(train_data['passenger_count'],bins=15)
plt.xlabel('No of Passanger')
plt.ylabel('Frequency')

In [None]:
plt.figure(figsize=(10,7))
plt.scatter(x= train_data['passenger_count'],y = train_data['fare_amount'],s = 1.5)
plt.xlabel('No of Passengers')
plt.ylabel('fare_amount')

- From the plots above I can see that most cabs are booked by a single passenger
- I can also see that the highest fares occur for single-passenger trips
- There are also fares recorded for 0 passengers, which is very unusual; we can check whether the test set also contains empty-cab trips

In [None]:
# lets drop it where there are fares for 0 passanger
print('Train_data befor ',train_data.shape)

In [None]:
train_data[(train_data['passenger_count']==0)&(train_data['fare_amount']>0)].sort_values(by = 'fare_amount',ascending=False)

### Lets Drop it 

In [None]:
# Keep every row except those that report 0 passengers together with a positive fare
train_data = train_data.loc[~((train_data['passenger_count'] == 0) & (train_data['fare_amount'] > 0))]

In [None]:
train_data.shape

### Do pickup date and time affect the price?

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train_data['day'], y=train_data['fare_amount'], s=1.5)
plt.xlabel('Date')
plt.ylabel('Fare')

### The highest fare occurs on the 12th day of the month

In [None]:
plt.figure(figsize=(10,7))
plt.hist(train_data['hour'],bins=50)
plt.xlabel('Hour')
plt.ylabel('Frequency')

### There are fewer cab rides between hours 2 and 6

In [None]:
plt.figure(figsize=(15,7))
plt.scatter(x=train_data['distance'], y=train_data['fare_amount'], s=1.5)
plt.xlabel('Distance')
plt.ylabel('Fare')

In [None]:
train_data.head()

In [None]:
train_data.shape

In [None]:
train_data_org = train_data.copy()

In [None]:
del train_data

In [None]:
y = train_data_org['fare_amount']
train_data = train_data_org.drop(columns=['fare_amount'])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

In [None]:
data_pipeline = Pipeline([('rob_scale',RobustScaler())])

In [None]:
traindata_scaled = data_pipeline.fit_transform(train_data)

In [None]:
traindata_scaled

In [None]:
train_data.head()

In [None]:
traindata_scaled = pd.DataFrame(traindata_scaled,columns=train_data.columns,index=train_data.index)

In [None]:
traindata_scaled.head()

# Data cooked, now feed the food to machine learning algorithms

In [None]:
traindata_scaled.shape

# Test set

In [None]:
test = add_datetime_info(test)
test = add_airport_dist(test)
test['distance'] = distance(test['pickup_latitude'], test['pickup_longitude'], 
                                   test['dropoff_latitude'] , test['dropoff_longitude'])

test.head()

In [None]:
test.drop(columns=['key', 'pickup_datetime'],inplace=True)

In [None]:
test.head()

In [None]:
test.shape

In [None]:
# Scale the test features with the scaler that was already fitted on the training data.
# Re-fitting here (fit_transform) would centre/scale the test set with its own statistics,
# so train and test features would no longer be on the same scale for the model.
testdata_scaled = data_pipeline.transform(test)

In [None]:
testdata_scaled

In [None]:
testdata_scaled = pd.DataFrame(testdata_scaled,columns=test.columns,index=test.index)

In [None]:
testdata_scaled.head()

In [None]:
print(train_data.shape,traindata_scaled.shape)

In [None]:
print(test.shape,testdata_scaled.shape)

In [None]:
traindata_scaled.to_csv('trained_scaled.csv',index=False)

In [None]:
train_data.to_csv('trained_data.csv',index=False)

In [None]:
testdata_scaled.to_csv('testdata_scaled.csv',index=False)
test.to_csv('test.csv',index=False)

In [None]:
y.to_csv('train_labels.csv',index=False)

In [None]:
train_data_org.to_csv('train_data_org.csv',index= False)

In [None]:
train_data_org.head()

In [None]:
train_data.head()

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgbm

In [None]:
x_train,x_test,y_train,y_test = train_test_split(traindata_scaled,y,random_state=123,test_size=0.10)

In [None]:
# LightGBM training parameters.
# NOTE(review): LightGBM's parameter docs list `subsample` as an alias of
#   `bagging_fraction`; both are set here with different values (0.8 vs 1) —
#   confirm which one is intended.
# NOTE(review): `num_rounds` is listed as an alias of `num_iterations`, while the
#   commented-out `lgbm.train` call further down passes num_boost_round=10000 —
#   confirm which round count should apply.
# NOTE(review): `zero_as_missing=True` is combined with RobustScaler-scaled
#   features, where 0 looks like a legitimate value — verify this is wanted.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    nthread=4,
    num_leaves=31,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    bagging_fraction=1,
    max_bin=5000,
    bagging_freq=20,
    colsample_bytree=0.6,
    metric='rmse',
    min_split_gain=0.5,
    min_child_weight=1,
    min_child_samples=10,
    scale_pos_weight=1,
    zero_as_missing=True,
    seed=0,
    num_rounds=50000,
)



In [None]:
testdata_scaled.head()

In [None]:

# train_set = lgbm.Dataset(x_train, y_train, silent=False,categorical_feature=['year','month','day','weekday'])
# valid_set = lgbm.Dataset(x_test, y_test, silent=False,categorical_feature=['year','month','day','weekday'])
# model = lgbm.train(params, train_set = train_set, num_boost_round=10000,early_stopping_rounds=500,verbose_eval=500, valid_sets=valid_set)

# prediction = model.predict(testdata_scaled, num_iteration = model.best_iteration)      



# submission = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv")
# submission['fare_amount'] = prediction
# submission.to_csv('lgbm_taxi_fare1.csv', index=False)





#submission.head()

#submission.head(20)

In [None]:
train_data.head()