# Data Preprocessing- Grouping Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
from datetime import datetime
# a nice way of filtering out deprecated warnings
import warnings
warnings.filterwarnings("ignore")

## Open and read Taxi, FHV and weather data

In [2]:
df_taxi= pd.read_feather("../preprocessed_data/feather/yellow_tripdata_2019.feather")
print("df_taxi read")
df_fhv= pd.read_feather("../preprocessed_data/feather/fhv_tripdata_2019.feather")
print("df_fhv read")

df_taxi read
df_fhv read


In [3]:
dfweather= pd.read_feather('../preprocessed_data/feather/dfweather.feather')

Since feather file doesn't support datetime datatype, convert it again into datetime

In [4]:
df_taxi['pickup_datetime']= pd.to_datetime(df_taxi['pickup_datetime'])
print("converted taxi pickup to ", type(df_taxi['pickup_datetime'][0]))
df_fhv['pickup_datetime']= pd.to_datetime(df_fhv['pickup_datetime'])
print("converted fhv pickup to ", type(df_taxi['dropoff_datetime'][0]))

converted taxi pickup to  <class 'pandas._libs.tslibs.timestamps.Timestamp'>
converted fhv pickup to  <class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [5]:
train_taxi= df_taxi.loc[df_taxi['pickup_datetime']< pd.Timestamp(datetime(2019, 5, 1))]
print("train_taxi made")
train_fhv= df_fhv.loc[df_fhv['pickup_datetime']< pd.Timestamp(datetime(2019, 5, 1))]
print("train_fhv made")
test_taxi= df_taxi.loc[df_taxi['pickup_datetime']> pd.Timestamp(datetime(2019, 4, 30))]
print("test_taxi made")
test_fhv= df_fhv.loc[df_fhv['pickup_datetime']> pd.Timestamp(datetime(2019, 4, 30))]
print("test_fhv made")

train_taxi made
train_fhv made
test_taxi made
test_fhv made


In [6]:
train_df = train_taxi.append(train_fhv, ignore_index = True)
print("train_df made")
test_df = test_taxi.append(test_fhv, ignore_index = True)
print("test_df made")

train_df made
test_df made


In [7]:
dfweather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2774 entries, 0 to 2773
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   index          2774 non-null   int64   
 1   tempF          2774 non-null   float64 
 2   relhumidity    2774 non-null   float64 
 3   precipitation  2774 non-null   float64 
 4   time session   2774 non-null   category
 5   date           2774 non-null   object  
dtypes: category(1), float64(3), int64(1), object(1)
memory usage: 111.4+ KB


## Group and split data 

In [8]:
print("Null values in training df:")
print(train_df.isnull().sum())
print("Null values in testing df:")
print(test_df.isnull().sum())

Null values in training df:
index                           0
pickup_datetime                 0
dropoff_datetime                0
passenger_count                 0
trip_distance            45411866
PULocationID                    0
DOLocationID                    0
fare_amount              45411866
extra                    45411866
mta_tax                  45411866
tip_amount               45411866
tolls_amount             45411866
improvement_surcharge    45411866
total_amount             45411866
congestion_surcharge     45411866
total_trip_duration             0
avespeed_mileshr         45411866
time session                    0
date                            0
hour                            0
day                             0
dtype: int64
Null values in testing df:
index                           0
pickup_datetime                 0
dropoff_datetime                0
passenger_count                 0
trip_distance            22122416
PULocationID                    0
DOLocationID  

In [9]:
del train_df["index"]
del test_df["index"]

In [10]:
train_df.columns, test_df.columns

(Index(['pickup_datetime', 'dropoff_datetime', 'passenger_count',
        'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'total_trip_duration',
        'avespeed_mileshr', 'time session', 'date', 'hour', 'day'],
       dtype='object'),
 Index(['pickup_datetime', 'dropoff_datetime', 'passenger_count',
        'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
        'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
        'total_amount', 'congestion_surcharge', 'total_trip_duration',
        'avespeed_mileshr', 'time session', 'date', 'hour', 'day'],
       dtype='object'))

In [11]:
train_taxi.columns

Index(['index', 'pickup_datetime', 'dropoff_datetime', 'passenger_count',
       'trip_distance', 'PULocationID', 'DOLocationID', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'total_trip_duration',
       'avespeed_mileshr', 'time session', 'date', 'hour', 'day'],
      dtype='object')

In [12]:
#Since payment is only taken from the train_taxi data, 
# we find and save the median of the payments
X_train= train_taxi.groupby(['date', 'time session','PULocationID'], as_index= False).median().dropna(subset= ["DOLocationID"])
X_train= X_train.drop(columns= ["DOLocationID", "total_trip_duration", "hour"]) 
#this includes the part that is a combination of taxi and fhv trips
# with removing parts where DOLocationID is null since that means
# neither fhv data or taxi data are present
X_train_2= train_df.groupby(['date', 'time session','PULocationID'], as_index= False).median().dropna()
X_train_2= X_train_2[['date', 'time session','PULocationID', "DOLocationID","total_trip_duration", "hour"]] 
X_train= pd.merge(X_train, X_train_2, on=['date', 'time session', 'PULocationID'] )
X_train.shape

(23347, 19)

In [13]:
del train_taxi
del train_fhv

In [14]:
X_train.describe().round()

Unnamed: 0,PULocationID,index,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,avespeed_mileshr,day,DOLocationID,total_trip_duration,hour
count,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0,23347.0
mean,141.0,7825810.0,1.0,4.0,15.0,1.0,0.0,3.0,0.0,0.0,23.0,3.0,13.0,16.0,143.0,16.0,14.0
std,77.0,4417033.0,1.0,4.0,10.0,1.0,0.0,2.0,2.0,0.0,13.0,0.0,5.0,9.0,23.0,4.0,6.0
min,1.0,1021.0,1.0,0.0,3.0,-2.0,0.0,0.0,0.0,0.0,7.0,2.0,1.0,1.0,12.0,3.0,0.0
25%,75.0,4064190.0,1.0,2.0,9.0,0.0,0.0,2.0,0.0,0.0,15.0,2.0,10.0,8.0,138.0,13.0,9.0
50%,143.0,7860542.0,1.0,3.0,12.0,0.0,0.0,3.0,0.0,0.0,18.0,2.0,12.0,16.0,148.0,15.0,15.0
75%,217.0,11721806.0,1.0,5.0,18.0,1.0,0.0,4.0,0.0,0.0,26.0,2.0,15.0,23.0,161.0,17.0,19.0
max,263.0,15260883.0,6.0,37.0,102.0,7.0,0.0,78.0,30.0,0.0,179.0,3.0,47.0,31.0,262.0,76.0,23.0


In [15]:
#merge weather data
X_train['date']= pd.to_datetime(X_train['date'])
dfweather['date']= pd.to_datetime(dfweather['date'])

In [16]:
dfweather= dfweather.groupby(['date', 'time session'], as_index= False).mean()

In [17]:
X_train= pd.merge(X_train, dfweather, on= ["date", "time session"])
X_train.columns, X_train.shape

(Index(['date', 'time session', 'PULocationID', 'index_x', 'passenger_count',
        'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
        'tolls_amount', 'improvement_surcharge', 'total_amount',
        'congestion_surcharge', 'avespeed_mileshr', 'day', 'DOLocationID',
        'total_trip_duration', 'hour', 'index_y', 'tempF', 'relhumidity',
        'precipitation'],
       dtype='object'),
 (23347, 23))

In [18]:
del X_train["index_x"]
del X_train["index_y"]

In [19]:
# Find Trip Demand Values
y_train_val= train_df.groupby(['date', 'time session','PULocationID'], as_index= False)["passenger_count"].sum()
y_train_val['date']= pd.to_datetime(y_train_val['date'])
y_train_val= pd.merge(X_train, y_train_val, on=['date', 'time session', 'PULocationID'])

In [20]:
y_train_val.columns

Index(['date', 'time session', 'PULocationID', 'passenger_count_x',
       'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount',
       'tolls_amount', 'improvement_surcharge', 'total_amount',
       'congestion_surcharge', 'avespeed_mileshr', 'day', 'DOLocationID',
       'total_trip_duration', 'hour', 'tempF', 'relhumidity', 'precipitation',
       'passenger_count_y'],
      dtype='object')

In [21]:
y_train_val=y_train_val.rename(columns={"passenger_count_y": "trip demand"}, errors="raise")
y_train_val= y_train_val['trip demand']
y_train_val

0         585
1        1528
2        2229
3          36
4        2564
         ... 
23342    1109
23343     456
23344     644
23345     437
23346    1124
Name: trip demand, Length: 23347, dtype: int64

In [22]:
#Label trips demands according to low, medium and high class
# Save bins to get y_test according to the same bins as train data since test data is assumingly not seen
y_train,bins= pd.qcut(y_train_val, 3,labels=["low","med","high"], retbins=True)

In [23]:
#Do the same for X_test and y_test
X_test= test_taxi.groupby(['date', 'time session','PULocationID'], as_index= False).median().dropna(subset= ["DOLocationID"])
X_test= X_test.drop(columns= ["DOLocationID", "total_trip_duration", "hour"]) 

X_test_2= test_df.groupby(['date', 'time session','PULocationID'], as_index= False).median().dropna()
X_test_2= X_test_2[['date', 'time session','PULocationID', "DOLocationID","total_trip_duration", "hour"]] 
X_test= pd.merge(X_test, X_test_2, on=['date', 'time session', 'PULocationID'] )
X_test['date']= pd.to_datetime(X_test['date'])
X_test= pd.merge(X_test, dfweather, on= ["date", "time session"])
y_test_val= test_df.groupby(['date', 'time session','PULocationID'], as_index= False)["passenger_count"].sum()
y_test_val['date']= pd.to_datetime(y_test_val['date'])
y_test_val= pd.merge(X_test, y_test_val, on=['date', 'time session', 'PULocationID'])
y_test_val=y_test_val.rename(columns={"passenger_count_y": "trip demand"}, errors="raise")
y_test_val= y_test_val['trip demand']
y_test= pd.cut(y_test_val, bins, labels=["low","med","high"], include_lowest=True)

### Create Date Num Column

In [24]:
X_train['date_num']=pd.to_datetime(X_train["date"]).dt.strftime("%Y%m%d")
X_test['date_num']=pd.to_datetime(X_test["date"]).dt.strftime("%Y%m%d")

### Save Data

In [25]:
train_df= pd.concat([y_train, X_train],axis=1)
test_df= pd.concat([y_test, X_test],axis=1)
train_df.to_csv('../preprocessed_data/trainingdf.csv', index=False)
test_df.to_csv('../preprocessed_data/testingdf.csv', index=False)