In [13]:
import sys
!{sys.executable} -m pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-0.15.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.15.0


In [76]:
import boto3
from boto3 import s3
import pandas as pd
import numpy as np
from dotenv import find_dotenv, load_dotenv
import os
import s3fs
import io
from pathlib import Path
from sklearn.model_selection import train_test_split

from src.features.dates import convert_to_date
from src.data.sets import split_sets_random
from src.data.sets import save_sets
from src.models.null import NullModel

In [17]:
%load_ext autoreload
%autoreload 2
load_dotenv(find_dotenv())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [19]:
aws_access_key_id = os.environ.get('aws_access_key_id')
aws_secret_access_key = os.environ.get('aws_secret_access_key')

# Set up directories

In [74]:
project_dir = Path.cwd().parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'

In [10]:
# Where is boto3 looking for my credentials?
boto3.set_stream_logger('botocore', level='DEBUG')

In [21]:
def list_bucket_contents(bucket, 
                         aws_access_key_id,
                         aws_secret_access_key,
                         match=''):
    s3 = boto3.resource('s3',
                        aws_access_key_id=aws_access_key_id,
                        aws_secret_access_key=aws_secret_access_key)
    bucket_resource = s3.Bucket(bucket)
#     bucket_resource = s3_resource.Bucket(bucket)
    for key in bucket_resource.objects.all():
        if match in key.key:
            print(key.key)

In [23]:
list_bucket_contents(bucket='nyc-tlc',
                     aws_access_key_id=aws_access_key_id,
                     aws_secret_access_key=aws_secret_access_key,
                     match='2020')

2021-02-10 07:00:46,340 botocore.hooks [DEBUG] Event choose-service-name: calling handler <function handle_service_name_alias at 0x7f8601d82830>
2021-02-10 07:00:46,346 botocore.hooks [DEBUG] Event creating-client-class.s3: calling handler <function add_generate_presigned_post at 0x7f8601e39680>
2021-02-10 07:00:46,347 botocore.hooks [DEBUG] Event creating-client-class.s3: calling handler <function lazy_call.<locals>._handler at 0x7f85deb6acb0>
2021-02-10 07:00:46,349 botocore.hooks [DEBUG] Event creating-client-class.s3: calling handler <function add_generate_presigned_url at 0x7f8601e39440>
2021-02-10 07:00:46,358 botocore.endpoint [DEBUG] Setting s3 timeout as (60, 60)
2021-02-10 07:00:46,362 botocore.client [DEBUG] Registering retry handlers for service: s3
2021-02-10 07:00:46,369 botocore.hooks [DEBUG] Event creating-resource-class.s3.Bucket: calling handler <function lazy_call.<locals>._handler at 0x7f85deaa5d40>
2021-02-10 07:00:46,372 botocore.hooks [DEBUG] Event before-paramet

trip data/fhv_tripdata_2020-01.csv
trip data/fhv_tripdata_2020-02.csv
trip data/fhv_tripdata_2020-03.csv
trip data/fhv_tripdata_2020-04.csv
trip data/fhv_tripdata_2020-05.csv
trip data/fhv_tripdata_2020-06.csv
trip data/fhvhv_tripdata_2020-01.csv
trip data/fhvhv_tripdata_2020-02.csv
trip data/fhvhv_tripdata_2020-03.csv
trip data/fhvhv_tripdata_2020-04.csv
trip data/fhvhv_tripdata_2020-05.csv
trip data/fhvhv_tripdata_2020-06.csv
trip data/green_tripdata_2020-01.csv
trip data/green_tripdata_2020-02.csv
trip data/green_tripdata_2020-03.csv
trip data/green_tripdata_2020-04.csv
trip data/green_tripdata_2020-05.csv
trip data/green_tripdata_2020-06.csv
trip data/yellow_tripdata_2020-01.csv
trip data/yellow_tripdata_2020-02.csv
trip data/yellow_tripdata_2020-03.csv
trip data/yellow_tripdata_2020-04.csv
trip data/yellow_tripdata_2020-05.csv
trip data/yellow_tripdata_2020-06.csv


In [30]:
s3 = boto3.client('s3',
                  aws_access_key_id=aws_access_key_id,
                  aws_secret_access_key=aws_secret_access_key)
obj = s3.get_object(Bucket='nyc-tlc', Key='trip data/yellow_tripdata_2020-04.csv')
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1.0,2020-04-01 00:41:22,2020-04-01 01:01:53,1.0,1.2,1.0,N,41,24,2.0,5.5,0.5,0.5,0.0,0.0,0.3,6.8,0.0
1,1.0,2020-04-01 00:56:00,2020-04-01 01:09:25,1.0,3.4,1.0,N,95,197,1.0,12.5,0.5,0.5,2.75,0.0,0.3,16.55,0.0
2,1.0,2020-04-01 00:00:26,2020-04-01 00:09:25,1.0,2.8,1.0,N,237,137,1.0,10.0,3.0,0.5,1.0,0.0,0.3,14.8,2.5
3,1.0,2020-04-01 00:24:38,2020-04-01 00:34:38,0.0,2.6,1.0,N,68,142,1.0,10.0,3.0,0.5,1.0,0.0,0.3,14.8,2.5
4,2.0,2020-04-01 00:13:24,2020-04-01 00:18:26,1.0,1.44,1.0,Y,263,74,1.0,6.5,0.5,0.5,3.0,0.0,0.3,13.3,2.5


In [32]:
df.shape

(237993, 18)

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237993 entries, 0 to 237992
Data columns (total 18 columns):
VendorID                 218480 non-null float64
tpep_pickup_datetime     237993 non-null object
tpep_dropoff_datetime    237993 non-null object
passenger_count          218480 non-null float64
trip_distance            237993 non-null float64
RatecodeID               218480 non-null float64
store_and_fwd_flag       218480 non-null object
PULocationID             237993 non-null int64
DOLocationID             237993 non-null int64
payment_type             218480 non-null float64
fare_amount              237993 non-null float64
extra                    237993 non-null float64
mta_tax                  237993 non-null float64
tip_amount               237993 non-null float64
tolls_amount             237993 non-null float64
improvement_surcharge    237993 non-null float64
total_amount             237993 non-null float64
congestion_surcharge     237993 non-null float64
dtypes: float6

In [35]:
df.describe()

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
count,218480.0,218480.0,237993.0,218480.0,237993.0,237993.0,218480.0,237993.0,237993.0,237993.0,237993.0,237993.0,237993.0,237993.0,237993.0
mean,1.564949,1.296764,4.039981,1.034081,154.908422,150.361414,1.425673,11.666027,1.066739,0.487,1.530229,0.220504,0.296331,16.408621,1.927536
std,0.495765,0.983595,294.879052,0.865044,70.749496,74.474108,0.555915,11.728767,1.26017,0.094993,2.295523,1.342351,0.045429,13.155858,1.072839
min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,-118.0,-4.5,-0.5,-5.0,-19.87,-0.3,-138.17,-2.5
25%,1.0,1.0,0.95,1.0,97.0,75.0,1.0,5.5,0.0,0.5,0.0,0.0,0.3,9.8,2.5
50%,2.0,1.0,1.74,1.0,143.0,143.0,1.0,8.0,0.5,0.5,1.0,0.0,0.3,12.8,2.5
75%,2.0,1.0,3.4,1.0,234.0,233.0,2.0,13.0,2.5,0.5,2.46,0.0,0.3,18.36,2.5
max,2.0,7.0,126501.77,99.0,265.0,265.0,4.0,903.02,7.0,1.1,117.28,98.75,0.3,903.32,2.5


In [40]:
path = raw_data_dir / 'df.csv'
df.to_csv(path, index=False)

# Cleaning the Data

In [52]:
df_cleaned = df.copy(deep=True)

In [53]:
date_cols = df_cleaned.filter(like='date').columns
df_cleaned.loc[:, date_cols] = df_cleaned[date_cols].apply(convert_to_date)

## Add `trip_duration`

In [55]:
df_cleaned.loc[:, 'trip_duration'] = (df_cleaned.tpep_dropoff_datetime - df_cleaned.tpep_pickup_datetime).dt.seconds

## Binning `trip_duration`

In [56]:
df_cleaned.loc[:, 'trip_duration'] = pd.cut(df_cleaned['trip_duration'], 
                                            bins=[0, 300, 600, 1800, 100000], 
                                            labels=['x<5min', 'x<10min', 'x<30min', 'x>=30min'])

## Extract date features

In [57]:
df_cleaned.loc[:, 'tpep_pickup_month'] = df_cleaned.tpep_pickup_datetime.dt.month

In [58]:
df_cleaned.loc[:, 'tpep_pickup_hourofday'] = df_cleaned.tpep_pickup_datetime.dt.hour

In [59]:
df_cleaned.loc[:, 'tpep_pickup_dayofweek'] = df_cleaned.tpep_pickup_datetime.dt.day

## One-hot encoding

In [61]:
dummy_cols = ['VendorID', 'RatecodeID', 'store_and_fwd_flag']
df_cleaned = pd.get_dummies(df_cleaned, columns=dummy_cols)

## Drop columns

In [63]:
drop_cols = ['tpep_pickup_datetime',
             'tpep_dropoff_datetime',
             'PULocationID',
             'DOLocationID']
df_cleaned.drop(columns=drop_cols, inplace=True)

## Save data

In [64]:
path = interim_data_dir / 'df_cleaned.csv'
df_cleaned.to_csv(path, index=False)

# Split Data

In [71]:
X_train, X_test, y_train, y_test = split_sets_random(df_cleaned,
                                                     target_col='passenger_count',
                                                     to_numpy=True)

## Save the data sets

In [75]:
save_sets(X_train, X_test, y_train, y_test, processed_data_dir)

In [70]:
df_cleaned

Unnamed: 0,passenger_count,trip_distance,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,...,VendorID_2.0,RatecodeID_1.0,RatecodeID_2.0,RatecodeID_3.0,RatecodeID_4.0,RatecodeID_5.0,RatecodeID_6.0,RatecodeID_99.0,store_and_fwd_flag_N,store_and_fwd_flag_Y
0,1.0,1.20,2.0,5.50,0.5,0.5,0.00,0.00,0.3,6.80,...,0,1,0,0,0,0,0,0,1,0
1,1.0,3.40,1.0,12.50,0.5,0.5,2.75,0.00,0.3,16.55,...,0,1,0,0,0,0,0,0,1,0
2,1.0,2.80,1.0,10.00,3.0,0.5,1.00,0.00,0.3,14.80,...,0,1,0,0,0,0,0,0,1,0
3,0.0,2.60,1.0,10.00,3.0,0.5,1.00,0.00,0.3,14.80,...,0,1,0,0,0,0,0,0,1,0
4,1.0,1.44,1.0,6.50,0.5,0.5,3.00,0.00,0.3,13.30,...,1,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237988,,10.41,,28.82,0.0,0.5,0.00,0.00,0.3,29.62,...,0,0,0,0,0,0,0,0,0,0
237989,,0.93,,5.05,0.0,0.5,0.00,0.00,0.3,5.85,...,0,0,0,0,0,0,0,0,0,0
237990,,2.44,,8.09,0.0,0.0,0.00,0.00,0.3,10.89,...,0,0,0,0,0,0,0,0,0,0
237991,,6.68,,22.42,0.0,0.5,0.00,6.12,0.3,31.84,...,0,0,0,0,0,0,0,0,0,0


# Baseline model

In [77]:
base_model = NullModel(target_type='classification')

TypeError: __init__() missing 3 required positional arguments: 'y', 'pred_value', and 'preds'