In [21]:
import pandas as pd
import random

In [22]:
sample_frac = 0.01

selected_cols = 'fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count'.split(',')
dtypes = {
    'fare_amount': 'float32',
    'pickup_longitude': 'float32',
    'pickup_latitude': 'float32',
    'dropoff_longitude': 'float32',
    'passenger_count': 'float32'
}
def skip_row(row_idx):
    if row_idx == 0:
        return False
    return random.random() > sample_frac

random.seed(42)
df = pd.read_csv("train.csv", 
                 usecols=selected_cols, 
                 dtype=dtypes, 
                 parse_dates=['pickup_datetime'], 
                 skiprows=skip_row)

In [23]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.0,2014-12-06 20:36:22+00:00,-73.979813,40.751904,-73.979446,40.755481,1.0
1,8.0,2013-01-17 17:22:00+00:00,0.0,0.0,0.0,0.0,2.0
2,8.9,2011-06-15 18:07:00+00:00,-73.99633,40.753223,-73.978897,40.766963,3.0
3,6.9,2009-12-14 12:33:00+00:00,-73.98243,40.745747,-73.98243,40.745747,1.0
4,7.0,2013-11-06 11:26:54+00:00,-73.959061,40.781059,-73.962059,40.768604,1.0


In [24]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [25]:
test_df = pd.read_csv("test.csv",dtype=dtypes, parse_dates=['pickup_datetime'])

In [26]:
test_df.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1.0
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1.0
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.746139,1.0
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751635,1.0
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1.0


In [27]:
test_df.dtypes

key                               object
pickup_datetime      datetime64[ns, UTC]
pickup_longitude                 float32
pickup_latitude                  float32
dropoff_longitude                float32
dropoff_latitude                 float64
passenger_count                  float32
dtype: object

In [28]:
len(df)

552450

In [29]:
len(test_df)

9914

In [30]:
# EXPLORE Dataset
df["passenger_count"] = df["passenger_count"].astype(int)
df.dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552450 entries, 0 to 552449
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   fare_amount        552450 non-null  float32            
 1   pickup_datetime    552450 non-null  datetime64[ns, UTC]
 2   pickup_longitude   552450 non-null  float32            
 3   pickup_latitude    552450 non-null  float32            
 4   dropoff_longitude  552450 non-null  float32            
 5   dropoff_latitude   552450 non-null  float64            
 6   passenger_count    552450 non-null  int32              
dtypes: datetime64[ns, UTC](1), float32(4), float64(1), int32(1)
memory usage: 19.0 MB


In [31]:
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,552450.0,552450.0,552450.0,552450.0,552450.0,552450.0
mean,11.354059,-72.497063,39.9105,-72.504326,39.934257,1.684983
std,9.811924,11.618246,8.061114,12.074346,9.255058,1.337664
min,-52.0,-1183.362793,-3084.490234,-3356.729736,-2073.150613,0.0
25%,6.0,-73.99202,40.734875,-73.991425,40.733988,1.0
50%,8.5,-73.981819,40.752621,-73.980179,40.753102,1.0
75%,12.5,-73.967155,40.767036,-73.963737,40.76806,2.0
max,499.0,2420.209473,404.983337,2467.752686,3351.403027,208.0


# Some observations from the data 

#### by looking at the data , we can see the range of the fare amount - $6 to $12.50 (see the 25% , 50% , 75%) . also the max fare amount $499. and min is -$52 (which is not possible , it needs some data cleaning)
### IMPORTANT : 50% of the rides cost less than $8.5 and 75% of the rides cost $12.5. so when predicting taxi fares, the o/p or answer should be in the +/- $3 range , if not we could be way off on our predictions
### pickup longitude & latitude has negative values , which cannot actually be right, hence this requires some data cleaning
### passenger count ranges from 0 to 208 , which again is not right , hence data cleaning is required.
### There are no missing data




In [36]:
# let's check the date time once
df['pickup_datetime'].min(),df['pickup_datetime'].max()

(Timestamp('2009-01-01 00:11:46+0000', tz='UTC'),
 Timestamp('2015-06-30 23:59:54+0000', tz='UTC'))

In [37]:
test_df.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,-73.974716,40.751041,-73.973656,40.751743,1.671273
std,0.042774,0.033541,0.039072,0.035435,1.278747
min,-74.25219,40.573143,-74.263245,40.568973,1.0
25%,-73.9925,40.736125,-73.991249,40.735254,1.0
50%,-73.982327,40.753052,-73.980015,40.754065,1.0
75%,-73.968012,40.767113,-73.964062,40.768757,2.0
max,-72.986534,41.709557,-72.990967,41.696683,6.0


In [None]:
#some observations on the test dataset
#1. no missing value
#2. 1 to 6 passengers
#3. Latitudes 40 - 42
#4. longitudes 
#5. IMPORTANT : the test df is considered to be actual actual real data, and when the predictions are made , they are compared with the test data to get the accuracy level 
# be mindful of the test dataset

In [38]:
# let's check the date time of the test dataset once
test_df['pickup_datetime'].min(),test_df['pickup_datetime'].max()

(Timestamp('2009-01-01 11:04:24+0000', tz='UTC'),
 Timestamp('2015-06-30 20:03:50+0000', tz='UTC'))

#### interesting component here is that the start and end date is the same for testing and training dataset 

# EXPLORATORY DATA ANALYSIS & VISUALISATION 

### Let's get some answers for the questions from the dataset
What is the busiest day of the week? 

What is the busiest time of the day?

In which month are fares the highest?

Which pickup locations have the highest fares?

Which drop locations have the highest fares?

What is the average ride distance?

In [68]:
df['pickup_datetime']

0        2014-12-06 20:36:22+00:00
1        2013-01-17 17:22:00+00:00
2        2011-06-15 18:07:00+00:00
3        2009-12-14 12:33:00+00:00
4        2013-11-06 11:26:54+00:00
                    ...           
552445   2014-02-06 23:59:45+00:00
552446   2015-01-05 15:29:08+00:00
552447   2013-02-17 22:27:00+00:00
552448   2013-01-27 12:41:00+00:00
552449   2014-10-18 07:51:00+00:00
Name: pickup_datetime, Length: 552450, dtype: datetime64[ns, UTC]

In [74]:
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

#   PREPARE DATASET FOR TRAINING 

##### SPLITING THE DATAFRAME INTO TRAINING AND VALIDATION SET

In [88]:
from sklearn.model_selection import train_test_split

train_df , val_df = train_test_split(df , test_size=0.2 , random_state= 42)

len(train_df) , len(val_df)

(441960, 110490)

In [89]:
# removing all the empty rows or rows which has a missing value 
train_df = train_df.dropna()
val_df = val_df.dropna()

len(train_df) , len(val_df)

(441960, 110490)

In [101]:
#extract input and output
df.columns
input_cols = [ 'pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'passenger_count']
target_cols = ['fare_amount']


In [102]:
train_inputs = train_df[input_cols]
train_target = train_df[target_cols]

In [103]:
val_inputs = val_df[input_cols]
val_target = val_df[target_cols]

In [105]:
train_target

Unnamed: 0,fare_amount
353352,6.0
360070,3.7
372609,10.0
550895,8.9
444151,7.3
...,...
110268,9.3
259178,18.5
365838,10.1
131932,10.9


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()