In [1]:
import pandas as pd

In [2]:
# Read in the dataset 
df_climate = pd.read_parquet('data/climate_data.parquet', engine='pyarrow')

In [3]:
df_climate.head()

Unnamed: 0,cloudCover,day,dewPoint,humidity,mjd,month,moonPhase,ozone,precipAccumulation,precipIntensity,...,sunsetTime,temperatureHigh,temperatureLow,time,uvIndex,visibility,windBearing,windGust,windSpeed,year
0,0.82,1,22.76,0.8,39856.0,1,0.03,,,0.004,...,-63166859,32.93,26.09,-63226800,1,3.539,221,,3.0,1968
1,0.24,2,9.36,0.59,39857.0,1,0.07,,0.2,0.0011,...,-63080411,27.69,5.98,-63140400,2,8.075,317,,11.79,1968
2,0.37,3,2.56,0.59,39858.0,1,0.11,,,0.0,...,-62993962,21.68,18.0,-63054000,2,9.43,33,,6.18,1968
3,1.0,4,25.29,0.85,39859.0,1,0.14,,,0.001,...,-62907512,34.15,30.54,-62967600,1,3.794,273,,6.0,1968
4,0.75,5,23.89,0.77,39860.0,1,0.17,,,0.0,...,-62821060,33.98,12.92,-62881200,1,5.088,274,,9.81,1968


In [4]:
# Assuming your DataFrame is named df
for col in df_climate.select_dtypes(include='number').columns:
    col_range = df_climate[col].max() - df_climate[col].min()
    print(f"{col}: Range = {col_range}")

cloudCover: Range = 1.0
day: Range = 30
dewPoint: Range = 92.14
humidity: Range = 0.76
mjd: Range = 18892.0
month: Range = 11
moonPhase: Range = 1.0
ozone: Range = 224.60000000000002
precipAccumulation: Range = 20.705
precipIntensity: Range = 0.2236
precipProbability: Range = 1.0
pressure: Range = 72.41000000000008
sunriseTime: Range = 1632435875
sunsetTime: Range = 1632446077
temperatureHigh: Range = 98.08000000000001
temperatureLow: Range = 87.53
time: Range = 1632438000
uvIndex: Range = 9
visibility: Range = 9.42
windBearing: Range = 359
windGust: Range = 135.79000000000002
windSpeed: Range = 29.779999999999998
year: Range = 51


In [44]:
# Columns names
df_climate.columns

Index(['cloudCover', 'day', 'dewPoint', 'humidity', 'mjd', 'month',
       'moonPhase', 'ozone', 'precipAccumulation', 'precipIntensity',
       'precipProbability', 'precipType', 'pressure', 'sunriseTime',
       'sunsetTime', 'temperatureHigh', 'temperatureLow', 'time', 'uvIndex',
       'visibility', 'windBearing', 'windGust', 'windSpeed', 'year'],
      dtype='object')

In [5]:
df_climate['day']

0         1
1         2
2         3
3         4
4         5
         ..
19255    19
19256    20
19257    21
19258    22
19259    23
Name: day, Length: 19260, dtype: int64

In [6]:
df_climate.dtypes

cloudCover            float64
day                     int64
dewPoint              float64
humidity              float64
mjd                   float64
month                   int64
moonPhase             float64
ozone                 float64
precipAccumulation    float64
precipIntensity       float64
precipProbability     float64
precipType             object
pressure              float64
sunriseTime             int64
sunsetTime              int64
temperatureHigh       float64
temperatureLow        float64
time                    int64
uvIndex                 int64
visibility            float64
windBearing             int64
windGust              float64
windSpeed             float64
year                    int64
dtype: object

In [97]:
df_climate.isnull().sum()

cloudCover                0
day                       0
dewPoint                  0
humidity                  0
mjd                       1
month                     0
moonPhase                 0
ozone                 18904
precipAccumulation    18470
precipIntensity           1
precipProbability         1
precipType            12159
pressure                  0
sunriseTime               0
sunsetTime                0
temperatureHigh           0
temperatureLow            0
time                      0
uvIndex                   0
visibility                0
windBearing               0
windGust               9867
windSpeed                 0
year                      0
dtype: int64

In [96]:
df_climate['time']

0         -63226800
1         -63140400
2         -63054000
3         -62967600
4         -62881200
            ...    
19255    1568779200
19256    1568865600
19257    1568952000
19258    1569038400
19259    1569211200
Name: time, Length: 19260, dtype: int64

In [45]:
len(df_climate[df_climate['year'] == 2019])

266

In [46]:
# Check for nulls
df_climate.isnull().sum()

cloudCover                0
day                       0
dewPoint                  0
humidity                  0
mjd                       1
month                     0
moonPhase                 0
ozone                 18904
precipAccumulation    18470
precipIntensity           1
precipProbability         1
precipType            12159
pressure                  0
sunriseTime               0
sunsetTime                0
temperatureHigh           0
temperatureLow            0
time                      0
uvIndex                   0
visibility                0
windBearing               0
windGust               9867
windSpeed                 0
year                      0
dtype: int64

In [99]:
df_subset = df_climate

For analysis, do the following steps
1. Load the data 
2. Convert yr.-month-day to a single date column (can be done in DBT)
3. Check for 2019 only
3. Drop `ozone`, `precipAccumulation`, `precipType` and `windGust` columns 
4. Drop `moonPhase`, `precipProbability`, `pressure`, `uvIndex`, `windBearing` columns

In [100]:
df_subset = df_subset.drop(["ozone", "precipAccumulation", "precipType", "windGust",
                            "moonPhase", "precipProbability", "pressure", "uvIndex", "windBearing", ], axis=1)

In [101]:
df_subset.isnull().sum()

cloudCover         0
day                0
dewPoint           0
humidity           0
mjd                1
month              0
precipIntensity    1
sunriseTime        0
sunsetTime         0
temperatureHigh    0
temperatureLow     0
time               0
visibility         0
windSpeed          0
year               0
dtype: int64

In [102]:
df_subset['mjd'] = df_subset['mjd'].fillna(method='ffill')
df_subset['precipIntensity'] = df_subset['precipIntensity'].fillna(0)

  df_subset['mjd'] = df_subset['mjd'].fillna(method='ffill')


In [103]:
df_subset.dtypes

cloudCover         float64
day                  int64
dewPoint           float64
humidity           float64
mjd                float64
month                int64
precipIntensity    float64
sunriseTime          int64
sunsetTime           int64
temperatureHigh    float64
temperatureLow     float64
time                 int64
visibility         float64
windSpeed          float64
year                 int64
dtype: object

In [51]:
from datetime import datetime, timedelta

In [52]:
# Define MJD epoch start
base_date = datetime(1858, 11, 17)

# Vectorized conversion
df_subset['datetime'] = pd.to_datetime(df_subset['mjd'].apply(lambda x: base_date + timedelta(days=x)))
df_subset['date'] = df_subset['datetime'].dt.date

In [53]:
len(df_subset)

19260

In [54]:
df_subset.head()

Unnamed: 0,cloudCover,day,dewPoint,humidity,mjd,month,precipIntensity,sunriseTime,sunsetTime,temperatureHigh,temperatureLow,time,visibility,windSpeed,year,datetime,date
0,0.82,1,22.76,0.8,39856.0,1,0.004,-63200329,-63166859,32.93,26.09,-63226800,3.539,3.0,1968,1968-01-01,1968-01-01
1,0.24,2,9.36,0.59,39857.0,1,0.0011,-63113920,-63080411,27.69,5.98,-63140400,8.075,11.79,1968,1968-01-02,1968-01-02
2,0.37,3,2.56,0.59,39858.0,1,0.0,-63027514,-62993962,21.68,18.0,-63054000,9.43,6.18,1968,1968-01-03,1968-01-03
3,1.0,4,25.29,0.85,39859.0,1,0.001,-62941110,-62907512,34.15,30.54,-62967600,3.794,6.0,1968,1968-01-04,1968-01-04
4,0.75,5,23.89,0.77,39860.0,1,0.0,-62854708,-62821060,33.98,12.92,-62881200,5.088,9.81,1968,1968-01-05,1968-01-05


In [104]:
df_subset['mjd']

0        39856.0
1        39857.0
2        39858.0
3        39859.0
4        39860.0
          ...   
19255    58745.0
19256    58746.0
19257    58747.0
19258    58748.0
19259    58748.0
Name: mjd, Length: 19260, dtype: float64

In [57]:
df_2015 = df_subset[df_subset['year'] == 2015]

In [58]:
df_green_2015 = pd.read_csv("data/green_tripdata_2015-01_modified.csv")

In [59]:
df_green_2015.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,pickup_date,dropoff_date,year
0,2,2015-01-01 00:31:10,2015-01-01 00:50:41,N,1,255,234,1,5.88,20.0,...,0.0,,0.3,25.4,1,1.0,,2015-01-01,2015-01-01,2015
1,2,2015-01-01 00:01:05,2015-01-01 00:03:30,N,1,75,74,1,0.89,4.5,...,0.0,,0.3,5.8,2,1.0,,2015-01-01,2015-01-01,2015
2,2,2015-01-01 00:09:01,2015-01-01 00:33:26,N,1,43,186,1,5.71,22.0,...,0.0,,0.3,23.3,1,1.0,,2015-01-01,2015-01-01,2015
3,2,2015-01-01 00:17:34,2015-01-01 00:27:07,N,1,80,36,1,1.89,8.5,...,0.0,,0.3,9.8,2,1.0,,2015-01-01,2015-01-01,2015
4,2,2015-01-01 00:32:38,2015-01-01 00:40:32,N,1,37,17,1,1.07,6.5,...,0.0,,0.3,7.8,2,1.0,,2015-01-01,2015-01-01,2015


In [69]:
len(df_2015)

365

In [72]:
df_2015

Unnamed: 0,cloudCover,day,dewPoint,humidity,mjd,month,precipIntensity,sunriseTime,sunsetTime,temperatureHigh,temperatureLow,time,visibility,windSpeed,year,datetime,date
17533,0.42,1,11.79,0.50,57023.0,1,0.0000,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
17534,0.11,2,12.90,0.46,57024.0,1,0.0000,1420114875,1420148412,38.47,34.19,1420088400,10.00,4.75,2015,2015-01-02,2015-01-02
17535,0.36,3,19.84,0.50,57025.0,1,0.0000,1420201280,1420234862,42.03,30.01,1420174800,10.00,4.00,2015,2015-01-03,2015-01-03
17536,0.66,4,28.04,0.76,57026.0,1,0.0234,1420287683,1420321314,38.92,39.90,1420261200,6.47,2.03,2015,2015-01-04,2015-01-04
17537,0.95,5,44.38,0.91,57027.0,1,0.0158,1420374084,1420407767,53.94,38.70,1420347600,6.14,2.60,2015,2015-01-05,2015-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17893,0.94,27,37.93,0.60,57383.0,12,0.0001,1451132395,1451165740,52.02,47.02,1451106000,9.87,5.88,2015,2015-12-27,2015-12-27
17894,0.98,28,49.54,0.89,57384.0,12,0.0053,1451218814,1451252179,60.56,39.01,1451192400,5.73,4.44,2015,2015-12-28,2015-12-28
17895,0.57,29,27.52,0.61,57385.0,12,0.0005,1451305230,1451338619,41.18,34.40,1451278800,9.81,4.58,2015,2015-12-29,2015-12-29
17896,0.99,30,37.43,0.92,57386.0,12,0.0221,1451391644,1451425062,43.64,39.12,1451365200,4.68,3.38,2015,2015-12-30,2015-12-30


In [85]:
df_green_2015['pickup_date'].dtype

dtype('O')

In [84]:
df_2015['date'].dtype

dtype('O')

In [87]:
df_2015['date'].isnull().sum()

0

In [92]:
df_green_2015['pickup_date'] = df_green_2015['pickup_date'].str.strip()
df_2015['date'] = df_2015['date'].astype(str).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2015['date'] = df_2015['date'].astype(str).str.strip()


In [93]:
df_green_merged = df_green_2015.merge(df_2015, how='left', left_on='pickup_date', right_on='date')

In [94]:
df_green_merged.isnull().sum()

VendorID                       0
lpep_pickup_datetime           0
lpep_dropoff_datetime          0
store_and_fwd_flag             0
RatecodeID                     0
PULocationID                   0
DOLocationID                   0
passenger_count                0
trip_distance                  0
fare_amount                    0
extra                          0
mta_tax                        0
tip_amount                     0
tolls_amount                   0
ehail_fee                1508493
improvement_surcharge          0
total_amount                   0
payment_type                   0
trip_type                      5
congestion_surcharge     1508493
pickup_date                    0
dropoff_date                   0
year_x                         0
cloudCover                     0
day                            0
dewPoint                       0
humidity                       0
mjd                            0
month                          0
precipIntensity                0
sunriseTim

In [75]:
df_2015.isnull().sum()

cloudCover         0
day                0
dewPoint           0
humidity           0
mjd                0
month              0
precipIntensity    0
sunriseTime        0
sunsetTime         0
temperatureHigh    0
temperatureLow     0
time               0
visibility         0
windSpeed          0
year               0
datetime           0
date               0
dtype: int64

In [66]:
df_green_merged.isnull().sum()

VendorID                 0
lpep_pickup_datetime     0
lpep_dropoff_datetime    0
store_and_fwd_flag       0
RatecodeID               0
PULocationID             0
DOLocationID             0
passenger_count          0
trip_distance            0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
ehail_fee                0
improvement_surcharge    0
total_amount             0
payment_type             0
trip_type                0
congestion_surcharge     0
pickup_date              0
dropoff_date             0
year_x                   0
cloudCover               0
day                      0
dewPoint                 0
humidity                 0
mjd                      0
month                    0
precipIntensity          0
sunriseTime              0
sunsetTime               0
temperatureHigh          0
temperatureLow           0
time                     0
visibility               0
windSpeed                0
y

In [95]:
df_green_merged

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,sunriseTime,sunsetTime,temperatureHigh,temperatureLow,time,visibility,windSpeed,year_y,datetime,date
0,2,2015-01-01 00:31:10,2015-01-01 00:50:41,N,1,255,234,1,5.88,20.0,...,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
1,2,2015-01-01 00:01:05,2015-01-01 00:03:30,N,1,75,74,1,0.89,4.5,...,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
2,2,2015-01-01 00:09:01,2015-01-01 00:33:26,N,1,43,186,1,5.71,22.0,...,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
3,2,2015-01-01 00:17:34,2015-01-01 00:27:07,N,1,80,36,1,1.89,8.5,...,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
4,2,2015-01-01 00:32:38,2015-01-01 00:40:32,N,1,37,17,1,1.07,6.5,...,1420028468,1420061964,31.87,24.32,1420002000,10.00,2.79,2015,2015-01-01,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1508488,2,2015-01-31 23:50:32,2015-02-01 00:05:43,N,1,75,230,1,3.91,14.5,...,1422619749,1422655914,36.92,13.13,1422594000,8.04,5.17,2015,2015-01-31,2015-01-31
1508489,1,2015-01-31 23:20:01,2015-01-31 23:34:29,N,1,80,112,1,2.00,11.0,...,1422619749,1422655914,36.92,13.13,1422594000,8.04,5.17,2015,2015-01-31,2015-01-31
1508490,1,2015-01-31 23:52:30,2015-02-01 00:18:53,N,1,112,79,1,4.70,20.5,...,1422619749,1422655914,36.92,13.13,1422594000,8.04,5.17,2015,2015-01-31,2015-01-31
1508491,2,2015-01-31 23:47:51,2015-01-31 23:59:03,N,1,74,250,1,5.98,18.0,...,1422619749,1422655914,36.92,13.13,1422594000,8.04,5.17,2015,2015-01-31,2015-01-31


In [23]:
df_subset.to_csv('data/climate_2019.csv', index=False)