In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pwd

## Extract Data

In [None]:
# unzip the train and test archives into the working directory

from zipfile import ZipFile
from pathlib import Path

# set the paths for the output
output_path = Path('/kaggle/working/data')
# parents=True keeps the cell runnable even if the parent directory is missing
output_path.mkdir(parents=True, exist_ok=True)

# extract the train files; the context manager closes the archive handle
# as soon as extraction finishes (the original left it open)
with ZipFile('/kaggle/input/nyc-taxi-trip-duration/train.zip') as train_zip:
    train_zip.extractall(output_path / 'train')

# extract the test files
with ZipFile('/kaggle/input/nyc-taxi-trip-duration/test.zip') as test_zip:
    test_zip.extractall(output_path / 'test')

In [None]:
# location of the extracted training CSV

training_data_path = Path('/kaggle/working/data/train/train.csv')

# report whether the extracted file is actually there before trying to read it
print("The data path exists" if training_data_path.exists()
      else 'The data path does not exist')

In [None]:
train_df = pd.read_csv(training_data_path)

train_df.head()

In [None]:
# shape of training data

train_df.shape

## Drop Columns

In [None]:
# remove unwanted columns from the data

train_df.columns[train_df.columns.isin(['id',
                      'dropoff_datetime',
                      'store_and_fwd_flag'])]

In [None]:
# remove the columns from training data

# errors='ignore' skips any listed column that is not present, which is the
# same effect the isin-based column selection had
train_df = train_df.drop(columns=['id','dropoff_datetime','store_and_fwd_flag'],
                         errors='ignore')

In [None]:
train_df.shape

In [None]:
train_df.head()

## Target Column

In [None]:
# distribution of the trip duration column

import seaborn as sns

sns.kdeplot(train_df['trip_duration']);

In [None]:
# check for missing values in data

train_df.isna().sum()

In [None]:
# skewness of the output column

train_df['trip_duration'].skew()

**The target column is extremely right skewed in nature**

In [None]:
# apply reciprocal transformation on data

# check for 0 in target

print('The minimum value in target data is',train_df['trip_duration'].min())

target_reciprocal_trans = train_df['trip_duration'].apply(lambda x: 1/x)

In [None]:
# distribution of target column after reciprocal transformation

sns.kdeplot(target_reciprocal_trans);

In [None]:
# skewness after reciprocal transformation

target_reciprocal_trans.skew()

In [None]:
# apply log transformation on the data

target_log_trans = train_df['trip_duration'].apply(np.log)

In [None]:
# distribution of target column after log transformation

sns.kdeplot(target_log_trans);

In [None]:
target_log_trans.skew()

**Log transformation worked far better than the reciprocal transformation at reducing the skewness of the target column**

In [None]:
# checking for outliers in the target data

# plot box plot
sns.boxplot(data=train_df,x='trip_duration');

In [None]:
# outliers in data after log transformation

sns.boxplot(target_log_trans);

In [None]:
DURATION_THRESH = 200 * 60

In [None]:
# trip durations at or below the 200-minute threshold (longer trips are excluded)

target_below_200 = train_df.loc[train_df['trip_duration'] <= DURATION_THRESH,'trip_duration']

In [None]:
# number of rows dropped in the process

train_df.shape[0] - target_below_200.shape[0]

In [None]:
# percentage of total data rows dropped

print('The number of rows dropped when target above 200 minutes are removed',
      train_df.shape[0] - target_below_200.shape[0])

print('\nPercentage of rows dropped when target above 200 minutes are removed',
      f'{(train_df.shape[0] - target_below_200.shape[0])/train_df.shape[0]:.2%}')

**Not a lot of rows are dropped as compared to the size of the data**

In [None]:
# boxplot after removal of data

sns.boxplot(target_below_200)

In [None]:
# distribution of target after removing trips longer than 200 minutes

sns.kdeplot(target_below_200);

### Target Column Transformations

In [None]:
# log transform this data

target_below_200_log_transformed = target_below_200.apply(np.log)

In [None]:
# boxplot after removal of data and log transformation

sns.boxplot(target_below_200_log_transformed);

In [None]:
# distribution of target after removing trips longer than 200 minutes and log transformation

sns.kdeplot(target_below_200_log_transformed);

In [None]:
print('The skewness of target before log transformation',target_below_200.skew())

print('\nSkewness of target after log transformation',target_below_200_log_transformed.skew())

#### Power Transformations

In [None]:
# yeo johnson transformation on the target column

from sklearn.preprocessing import PowerTransformer

pt_target = PowerTransformer()

pt_target

In [None]:
target_below_200

In [None]:
# fit on the training data

pt_target.fit(target_below_200.to_frame())

In [None]:
# transform the target column

target_power_trans = pt_target.transform(target_below_200.to_frame())

target_power_trans

In [None]:
# boxplot after power transformation

sns.boxplot(target_power_trans);

In [None]:
# distribution of target after power transformation

sns.kdeplot(target_power_trans);

In [None]:
print('The skewness of target before power transformation',target_below_200.skew())

print('\nSkewness of target after power transformation',pd.Series(target_power_trans.ravel()).skew())

**It is quite evident from the observations that, for the given dataset, the Yeo-Johnson technique works better than the others**

In [None]:
pt_target.lambdas_[0]

In [None]:
# apply both power-transform methods and compare the resulting skewness

transformation_methods = ['yeo-johnson','box-cox']

skewness = [target_below_200.skew()]
methods = ['original']
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
lambdas = [np.nan]

for method in transformation_methods:
    methods.append(method)
    pt_target = PowerTransformer(method=method)
    # fit AND transform with this method's transformer; fit() alone only returns
    # the fitted transformer, not the transformed values
    target_trans = pt_target.fit_transform(target_below_200.to_frame())
    # skewness of this method's own output (previously the stale
    # target_power_trans from an earlier cell was measured for every method)
    skew = pd.Series(target_trans.ravel()).skew()
    skewness.append(skew)
    # get the fitted lambda value
    lambdas.append(f'{pt_target.lambdas_[0]:.3f}')

results_df = pd.DataFrame(data={
    'methods':methods,
    'skewness':skewness,
    'lambdas':lambdas
})

results_df

## Outlier Removal from Lat/Long columns

In [None]:
import matplotlib.pyplot as plt

In [None]:
new_df = train_df.loc[train_df['trip_duration'] <= DURATION_THRESH,:]

new_df.head()

In [None]:
# distribution of lat long columns

def plot_density_plots(df):
    """Draw a KDE curve for each pickup/dropoff latitude and longitude column.

    The plots go on a 2x2 grid: latitudes on the top row, longitudes on the
    bottom row, pickup on the left and dropoff on the right.
    """
    fig, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
    grid_columns = ['pickup_latitude','dropoff_latitude',
                    'pickup_longitude','dropoff_longitude']
    # axes.ravel() walks the grid row by row, matching the order of grid_columns
    for column, axis in zip(grid_columns, axes.ravel()):
        sns.kdeplot(data=df,x=column,ax=axis)
    
    
def plot_boxplots(df):
    """Draw a vertical boxplot for each pickup/dropoff latitude and longitude column.

    The plots go on a 2x2 grid: latitudes on the top row, longitudes on the
    bottom row, pickup on the left and dropoff on the right.
    """
    fig, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
    grid_columns = ['pickup_latitude','dropoff_latitude',
                    'pickup_longitude','dropoff_longitude']
    # axes.ravel() walks the grid row by row, matching the order of grid_columns
    for column, axis in zip(grid_columns, axes.ravel()):
        sns.boxplot(data=df,y=column,ax=axis)


In [None]:
# plot density plots

plot_density_plots(new_df)

In [None]:
# plot boxplots

plot_boxplots(new_df)

In [None]:
# function to remove outliers from data

def remove_outliers(percentiles,column_names,df=None):
    """Drop rows whose value falls outside a percentile band, column by column.

    The cut-off values for every column are computed on the full input frame
    (not on the progressively filtered one); the per-column filters are then
    applied one after another.

    Parameters
    ----------
    percentiles : sequence of float
        Lower and upper quantile, in that order (e.g. ``(0.002, 0.998)``).
        Only the first two entries are used.
    column_names : iterable
        Columns to filter on, processed in the given order.
    df : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``new_df`` is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    tuple
        ``(df_after_removal, rows_removed)``. ``rows_removed`` maps each column
        name to the number of rows dropped by that column's filter, counted
        after the filters of the preceding columns. The dictionary is also
        printed.
    """
    source_df = new_df if df is None else df
    rows_removed = {}
    # cumulative keep-mask over the rows of source_df; it starts with every row kept
    keep_mask = pd.Series(True, index=source_df.index)
    for column_name in column_names:
        # calculate the values in data for the given percentiles
        quantile_values = source_df[column_name].quantile(list(percentiles))
        lower_bound, upper_bound = quantile_values.iloc[0], quantile_values.iloc[1]
        # rows of this column that lie inside the band (both ends inclusive)
        in_band = (source_df[column_name] >= lower_bound) & (source_df[column_name] <= upper_bound)
        updated_mask = keep_mask & in_band
        # rows newly removed by this column's filter
        rows_removed[column_name] = int(keep_mask.sum() - updated_mask.sum())
        keep_mask = updated_mask
    print(rows_removed)

    # copy so the caller gets an independent frame, never a view of the input
    df_after_removal = source_df.loc[keep_mask,:].copy()

    return df_after_removal, rows_removed

In [None]:
PERCENTILE_VALUES = (0.002,0.998)

In [None]:
new_df.columns[3:7]

In [None]:
# columns to remove outliers from
# NOTE(review): the slice is positional, so it selects different columns if the
# column order of new_df changes (e.g. if the "Drop Columns" cell is edited).
# Presumably it is meant to pick the four pickup/dropoff latitude/longitude
# columns -- confirm against the output displayed below.

outlier_cols = new_df.columns[3:7]

outlier_cols

In [None]:
# remove outliers from data

df_without_outliers,removal_dict = remove_outliers(percentiles=PERCENTILE_VALUES,column_names=outlier_cols)

In [None]:
removal_dict

In [None]:
sum(removal_dict.values())

In [None]:
# percentage of data points removed after outliers removal

per_of_data_removed = (new_df.shape[0] - df_without_outliers.shape[0])/new_df.shape[0]

print(f'Percentage of data points removed after outlier removal from Lat/Long columns is {per_of_data_removed:.2%}')

In [None]:
# plot density plots after outlier_removal

plot_density_plots(df_without_outliers)

In [None]:
# plot boxplots after outlier removal

plot_boxplots(df_without_outliers)

In [None]:
def plot_boxplots(df):
    """Draw a horizontal boxplot for each pickup/dropoff latitude and longitude column.

    This redefinition replaces the earlier ``plot_boxplots`` in this notebook,
    which drew the same columns vertically (``y=`` instead of ``x=``). The
    plots go on a 2x2 grid: latitudes on the top row, longitudes on the bottom
    row, pickup on the left and dropoff on the right.
    """
    fig, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))
    grid_columns = ['pickup_latitude','dropoff_latitude',
                    'pickup_longitude','dropoff_longitude']
    # axes.ravel() walks the grid row by row, matching the order of grid_columns
    for column, axis in zip(grid_columns, axes.ravel()):
        sns.boxplot(data=df,x=column,ax=axis)

    
# plot boxplots after outlier removal

plot_boxplots(df_without_outliers)

In [None]:
# skewness of lat/long columns

df_without_outliers[outlier_cols].skew().to_frame(name='skewness').iloc[[1,3,0,2]]

In [None]:
def apply_transformation(func):
    """Print each column's skewness before and after applying its transformation.

    ``func`` is a sequence of callables. The callable at position ``i`` is
    applied element-wise to the ``i``-th column named in the notebook-level
    ``outlier_cols``, with the values read from the notebook-level
    ``df_without_outliers``. Nothing is returned; the results are only printed.
    """
    column_labels = list(outlier_cols)
    divider = '*'*30
    for position, transform in enumerate(func):
        label = column_labels[position]
        column_values = df_without_outliers[f'{label}']
        print(f'The skewness of column {label} before transformation is {column_values.skew()}')
        print(f'The skewness of column {label} after transformation is {column_values.apply(transform).skew()}')
        print('\n',divider,'\n')

In [None]:
def log_func(x):
    """Return the negated natural log of the absolute value of ``x``."""
    return -np.log(abs(x))


def cube_func(x):
    """Return ``x`` raised to the third power."""
    return x**3

In [None]:
function_list = [log_func,cube_func,log_func,cube_func]

apply_transformation(func=function_list)

In [None]:
# target data distribution with and without outliers

sns.kdeplot(data=new_df,x='trip_duration',label='with outliers')
sns.kdeplot(data=df_without_outliers,x='trip_duration',label='without outliers')

plt.legend()
plt.show()

## Passenger count

In [None]:
df_without_outliers.head()

In [None]:
new_df['trip_duration'].max() / 60

In [None]:
df_without_outliers['passenger_count'].value_counts()

In [None]:
# remove passengers 0 and 8 from data

final_df = df_without_outliers.loc[~df_without_outliers['passenger_count'].isin([0,8]),:]

In [None]:
# compare the row counts before and after the passenger-count filter

rows_before = df_without_outliers.shape[0]
rows_after = final_df.shape[0]

print('The shape of data before removing rows of passengers 0 and 8',rows_before)
print('\nThe shape of data after removing rows of passengers 0 and 8',rows_after)
# report the measured difference; the previous message hard-coded "53 rows",
# which goes stale as soon as any upstream filter or threshold changes
print(f'\nThe difference is {rows_before - rows_after} rows')

In [None]:
# save the final_df

save_path = Path('/kaggle/working/data/output')

save_path.mkdir(exist_ok=True)

final_df.to_csv(path_or_buf=save_path / 'final_df.csv',index=False)

In [None]:
# load the final_df back

final_df_copy = pd.read_csv(save_path / 'final_df.csv')

# display the reloaded copy (not the in-memory final_df) so the cell actually
# confirms that the CSV round-trip worked
final_df_copy.head()