# <div style="font-family: Trebuchet MS; background-color: #B0E0E6; color: #000000; padding: 12px; line-height: 1.5;"> Importing Libraries 📚</div>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import regex as re
import os

## Supressing warnings:
import warnings
warnings.filterwarnings("ignore")

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285413 sha256=d4af522460334f5a38a9c579478187d9b8fdbf8db6746682d5c4a1835b72bae3
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [35]:
## importing essential spark libraries:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, count, when, regexp_replace, isnan, udf
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

# <div style="font-family: Trebuchet MS; background-color: #B0E0E6; color: #000000; padding: 12px; line-height: 1.5;"> Getting Started with the Analysis 🔬</div>

#### The first step towards your adventure in Spark is to create a Spark Session. It is the entry point to the Spark ecosystem. Once you reach the Spark environment via the entry point, you can freely create and manipulate Spark RDDs, Dataframes and Datasets. 

## 💥 What is a RDD?

You might be wondering what this new term is. Well RDD stands for **Resilient Distributed Dataset**. It is the fundamental data structure of Spark.

#### SparkSession will be created using SparkSession.builder() builder patterns::

In [5]:
##  Creating a Spark session:
spark = SparkSession.builder.appName('Sample').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/17 19:48:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
## Quick glance at the object
spark

##### Here, the spark object acts as the gateway to the Spark ecosystem. 

##### Next in order to read the CSV data, we use the **read.csv** functionality:

In [50]:
df=spark.read.csv("/kaggle/input/food-delivery-dataset/train.csv",
                  header=True,
                  inferSchema=True)
#  Parameters:
## - inferSchema parameter ensures that the data formatting stays the same as the original dataframe. If False, then the 
##     columns will be of class string.
## - header parameter tells that the columns names are provided along with the dataset.

## Displaying the first 5 rows:
df.show(5)

                                                                                

+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-------------------+--------------------+--------------------+-----------------+-------------+---------------+-------------------+--------+--------------+---------------+
|     ID|Delivery_person_ID|Delivery_person_Age|Delivery_person_Ratings|Restaurant_latitude|Restaurant_longitude|Delivery_location_latitude|Delivery_location_longitude|Order_Date|Time_Orderd|  Time_Order_picked|   Weatherconditions|Road_traffic_density|Vehicle_condition|Type_of_order|Type_of_vehicle|multiple_deliveries|Festival|          City|Time_taken(min)|
+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-------------------+--------------------+--------------------+-----------------+-----

In [51]:
## To convert a spark dataframe into a pandas dataframe
df.toPandas().head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,2023-08-17 11:45:00,conditions Sunny,High,2,Snack,motorcycle,0.0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,2023-08-17 19:50:00,conditions Stormy,Jam,2,Snack,scooter,1.0,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,2023-08-17 08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1.0,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,2023-08-17 18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1.0,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,2023-08-17 13:45:00,conditions Cloudy,High,1,Snack,scooter,1.0,No,Metropolitian,(min) 30


#### As you can see above, Time_taken(min) is the target variable.

#### Now we have read the csv file into Spark. Lets view the dataframe:

In [52]:
## Viewing the type
type(df)

pyspark.sql.dataframe.DataFrame

#### Printing the schema of the dataframe

In [53]:
## Printing the attributes of the table:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Delivery_person_ID: string (nullable = true)
 |-- Delivery_person_Age: double (nullable = true)
 |-- Delivery_person_Ratings: double (nullable = true)
 |-- Restaurant_latitude: double (nullable = true)
 |-- Restaurant_longitude: double (nullable = true)
 |-- Delivery_location_latitude: double (nullable = true)
 |-- Delivery_location_longitude: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Time_Orderd: string (nullable = true)
 |-- Time_Order_picked: timestamp (nullable = true)
 |-- Weatherconditions: string (nullable = true)
 |-- Road_traffic_density: string (nullable = true)
 |-- Vehicle_condition: integer (nullable = true)
 |-- Type_of_order: string (nullable = true)
 |-- Type_of_vehicle: string (nullable = true)
 |-- multiple_deliveries: double (nullable = true)
 |-- Festival: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Time_taken(min): string (nullable = true)



In [54]:
## Displaying the first 5 rows in the form of col-value pairs
df.head(5)

[Row(ID='0x4607 ', Delivery_person_ID='INDORES13DEL02 ', Delivery_person_Age=37.0, Delivery_person_Ratings=4.9, Restaurant_latitude=22.745049, Restaurant_longitude=75.892471, Delivery_location_latitude=22.765049, Delivery_location_longitude=75.912471, Order_Date='19-03-2022', Time_Orderd='11:30:00', Time_Order_picked=datetime.datetime(2023, 8, 17, 11, 45), Weatherconditions='conditions Sunny', Road_traffic_density='High ', Vehicle_condition=2, Type_of_order='Snack ', Type_of_vehicle='motorcycle ', multiple_deliveries=0.0, Festival='No ', City='Urban ', Time_taken(min)='(min) 24'),
 Row(ID='0xb379 ', Delivery_person_ID='BANGRES18DEL02 ', Delivery_person_Age=34.0, Delivery_person_Ratings=4.5, Restaurant_latitude=12.913041, Restaurant_longitude=77.683237, Delivery_location_latitude=13.043041, Delivery_location_longitude=77.813237, Order_Date='25-03-2022', Time_Orderd='19:45:00', Time_Order_picked=datetime.datetime(2023, 8, 17, 19, 50), Weatherconditions='conditions Stormy', Road_traffic_d

In [55]:
## Basic statistics of the data:
df.describe()    ### df.summary()
df.describe().show()



+-------+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-----------------+--------------------+------------------+-------------+---------------+-------------------+--------+--------------+---------------+
|summary|     ID|Delivery_person_ID|Delivery_person_Age|Delivery_person_Ratings|Restaurant_latitude|Restaurant_longitude|Delivery_location_latitude|Delivery_location_longitude|Order_Date|Time_Orderd|Weatherconditions|Road_traffic_density| Vehicle_condition|Type_of_order|Type_of_vehicle|multiple_deliveries|Festival|          City|Time_taken(min)|
+-------+-------+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-----------------+--------------------+------------------+-------------+---------------+-----------------

                                                                                

#### NOTE: describe() represents the statiscal summary of dataframe but it also uses the string variables

In [56]:
## Shape of the dataframe is:
df.count(),len(df.columns)

(45593, 20)

In [57]:
## Checking for null values:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-----------------+-----------------+--------------------+-----------------+-------------+---------------+-------------------+--------+----+---------------+
| ID|Delivery_person_ID|Delivery_person_Age|Delivery_person_Ratings|Restaurant_latitude|Restaurant_longitude|Delivery_location_latitude|Delivery_location_longitude|Order_Date|Time_Orderd|Time_Order_picked|Weatherconditions|Road_traffic_density|Vehicle_condition|Type_of_order|Type_of_vehicle|multiple_deliveries|Festival|City|Time_taken(min)|
+---+------------------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+----------+-----------+-----------------+-----------------+--------------------+-----------------+-------------+---------------+-------------------+--

#### Looks like there are no null values.

In [58]:
## Checking the dtypes:
df.dtypes

[('ID', 'string'),
 ('Delivery_person_ID', 'string'),
 ('Delivery_person_Age', 'double'),
 ('Delivery_person_Ratings', 'double'),
 ('Restaurant_latitude', 'double'),
 ('Restaurant_longitude', 'double'),
 ('Delivery_location_latitude', 'double'),
 ('Delivery_location_longitude', 'double'),
 ('Order_Date', 'string'),
 ('Time_Orderd', 'string'),
 ('Time_Order_picked', 'timestamp'),
 ('Weatherconditions', 'string'),
 ('Road_traffic_density', 'string'),
 ('Vehicle_condition', 'int'),
 ('Type_of_order', 'string'),
 ('Type_of_vehicle', 'string'),
 ('multiple_deliveries', 'double'),
 ('Festival', 'string'),
 ('City', 'string'),
 ('Time_taken(min)', 'string')]

In [59]:
## To view a few selected columns:
df.select(["ID","Delivery_person_ID"]).show()

+-------+------------------+
|     ID|Delivery_person_ID|
+-------+------------------+
|0x4607 |   INDORES13DEL02 |
|0xb379 |   BANGRES18DEL02 |
|0x5d6d |   BANGRES19DEL01 |
|0x7a6a |  COIMBRES13DEL02 |
|0x70a2 |   CHENRES12DEL01 |
|0x9bb4 |    HYDRES09DEL03 |
|0x95b4 | RANCHIRES15DEL01 |
|0x9eb2 |    MYSRES15DEL02 |
|0x1102 |    HYDRES05DEL02 |
|0xcdcd |    DEHRES17DEL01 |
|0xd987 |    KOCRES16DEL01 |
|0x2784 |   PUNERES13DEL03 |
|0xc8b6 |   LUDHRES15DEL02 |
|0xdb64 |    KNPRES14DEL02 |
|0x3af3 |    MUMRES15DEL03 |
|0x3aab |    MYSRES01DEL01 |
|0x689b |   PUNERES20DEL01 |
|0x6f67 |    HYDRES14DEL01 |
|0xc9cf |    KOLRES15DEL03 |
|0x36b8 |   PUNERES19DEL02 |
+-------+------------------+
only showing top 20 rows



In [60]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Delivery_person_ID: string (nullable = true)
 |-- Delivery_person_Age: double (nullable = true)
 |-- Delivery_person_Ratings: double (nullable = true)
 |-- Restaurant_latitude: double (nullable = true)
 |-- Restaurant_longitude: double (nullable = true)
 |-- Delivery_location_latitude: double (nullable = true)
 |-- Delivery_location_longitude: double (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Time_Orderd: string (nullable = true)
 |-- Time_Order_picked: timestamp (nullable = true)
 |-- Weatherconditions: string (nullable = true)
 |-- Road_traffic_density: string (nullable = true)
 |-- Vehicle_condition: integer (nullable = true)
 |-- Type_of_order: string (nullable = true)
 |-- Type_of_vehicle: string (nullable = true)
 |-- multiple_deliveries: double (nullable = true)
 |-- Festival: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Time_taken(min): string (nullable = true)



#### The various datatypes that a column can take up are integers, string, double, float, timestamp, etc...

#### To convert a column into:

1. double ---> use DoubleType()

2. int    ---> use IntegerType()

3. float  ---> use FloatType()

4. string ---> use StringType()

5. long   ---> use LongType()

#### all inside the cast() method.

#### In PySpark, the withColumn() function is widely used and defined as the **transformation function** of the DataFrame

#### which is further

- used to change the value, 

- convert the datatype of an existing column, 

- create the new column etc...

In [61]:
## Have to correct the datatypes of some columns. Delivery_person_Age, Vehicle_condition, multiple_deliveries
df=df.withColumn('Delivery_person_Age',col('Delivery_person_Age').cast(IntegerType()))\
.withColumn('Vehicle_condition',col('Vehicle_condition').cast(IntegerType()))\
.withColumn('multiple_deliveries',col('multiple_deliveries').cast(IntegerType()))

In [62]:
## Checking after conversion:
df.dtypes

[('ID', 'string'),
 ('Delivery_person_ID', 'string'),
 ('Delivery_person_Age', 'int'),
 ('Delivery_person_Ratings', 'double'),
 ('Restaurant_latitude', 'double'),
 ('Restaurant_longitude', 'double'),
 ('Delivery_location_latitude', 'double'),
 ('Delivery_location_longitude', 'double'),
 ('Order_Date', 'string'),
 ('Time_Orderd', 'string'),
 ('Time_Order_picked', 'timestamp'),
 ('Weatherconditions', 'string'),
 ('Road_traffic_density', 'string'),
 ('Vehicle_condition', 'int'),
 ('Type_of_order', 'string'),
 ('Type_of_vehicle', 'string'),
 ('multiple_deliveries', 'int'),
 ('Festival', 'string'),
 ('City', 'string'),
 ('Time_taken(min)', 'string')]

In [63]:
df.select(['Delivery_person_Age','Vehicle_condition','multiple_deliveries']).dtypes

[('Delivery_person_Age', 'int'),
 ('Vehicle_condition', 'int'),
 ('multiple_deliveries', 'int')]

In [64]:
## To display the PySpark dataframe as a pandas dataframe:
df.toPandas().head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,2023-08-17 11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,2023-08-17 19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,2023-08-17 08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,2023-08-17 18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,2023-08-17 13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [65]:
## Checking the numeric columns:
def num_cols(dataframe):
    num_cols = [col for col in dataframe.columns if dataframe.select(col).dtypes[0][1] in ['double', 'int']]
    return num_cols

num_cols = num_cols(df)  ### list of numeric columns
    
df.describe(num_cols).show()

+-------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+------------------+-------------------+
|summary|Delivery_person_Age|Delivery_person_Ratings|Restaurant_latitude|Restaurant_longitude|Delivery_location_latitude|Delivery_location_longitude| Vehicle_condition|multiple_deliveries|
+-------+-------------------+-----------------------+-------------------+--------------------+--------------------------+---------------------------+------------------+-------------------+
|  count|              45593|                  45593|              45593|               45593|                     45593|                      45593|             45593|              45593|
|   mean| 28.364814774197793|                    NaN| 17.017728506525582|   70.23133233807862|        17.465185865088966|          70.84570225567651|  1.02335884894611| 0.7284451560546575|
| stddev|  8.157529884739837|                    NaN|  

In [66]:
### There are 1320 unique IDs
df.select('Delivery_person_ID').distinct().count()  

1320

In [67]:
### Counts of unique delivery person ids::
df.select('Delivery_person_ID').distinct().show()  ### 20 
df.groupBy('Delivery_person_ID').count().orderBy('count').show()

+------------------+
|Delivery_person_ID|
+------------------+
|    SURRES11DEL01 |
|    GOARES02DEL01 |
|    KNPRES09DEL03 |
|    KOCRES02DEL01 |
|    KOLRES08DEL01 |
|    BHPRES13DEL03 |
|    ALHRES06DEL02 |
|    BHPRES05DEL02 |
|    GOARES03DEL03 |
|    VADRES16DEL02 |
|    VADRES04DEL02 |
|  COIMBRES07DEL01 |
|    KNPRES08DEL03 |
|   LUDHRES09DEL01 |
|    KOCRES09DEL01 |
|   MUMRES010DEL02 |
| RANCHIRES11DEL01 |
|    HYDRES09DEL02 |
|    DEHRES06DEL02 |
|    BHPRES09DEL03 |
+------------------+
only showing top 20 rows

+------------------+-----+
|Delivery_person_ID|count|
+------------------+-----+
|   BHPRES010DEL03 |    5|
|    KOLRES09DEL03 |    6|
|    KOCRES16DEL03 |    6|
|    BHPRES15DEL03 |    7|
|   AURGRES13DEL03 |    7|
|    GOARES01DEL03 |    7|
|   AURGRES11DEL03 |    7|
|    DEHRES18DEL03 |    7|
|   LUDHRES01DEL03 |    8|
|    GOARES11DEL01 |    8|
|    KOLRES08DEL03 |    8|
|    BHPRES06DEL03 |    8|
|   AURGRES06DEL03 |    8|
|   GOARES010DEL03 |    8|
|    BHPRES

<div style="color:white;
           display:fill;
           border-radius:5px;
           background-color:#5642C5;
           font-size:110%;
           font-family:Verdana;
           letter-spacing:0.5px">
        <p style="padding: 10px;
              color:white;">
            Feature Engineering Overview: As observed from the above dataset, we can extract the following
        </p>
    </div>

1. City from Delivery_person_ID ----> city

2. Bucket cities into Zones - North, South, East, West  ----> city_zone

3. Time taken to pick up delivery using Time_Orderd and Time_Order_picked ----> pickup_time

4. Time of the day - Morning, Lunch, Evening, Night, Midnight ----> day_zone

5. To clean up target variable - Time_taken(min)

6. Bucket Age - Delivery_person_Age ----> life_stage

7. Features using Latitude and Longitude ----> geosidic

<blockquote><p style="font-size:20px; color:#159364; font-family:verdana;">1. City from delivery id:</p></blockquote>

#### In order to apply a function into a particular column, we have create the function and register it as a UDF(User Defined Function) on Spark

In [68]:
# Create custom function
def city_extract(x):
    return re.findall("(\S+)RES\S+",x)[0]

# Convert the function as a UDF using the udf function:
city_extract_UDF = udf(lambda x:city_extract(x),StringType()) 

# Apply the function on the desired column:
df=df.withColumn("City",city_extract_UDF(col("Delivery_person_ID")))

## Having a glance at the new column:
df.select(['Delivery_person_ID','City']).show()

+------------------+------+
|Delivery_person_ID|  City|
+------------------+------+
|   INDORES13DEL02 |  INDO|
|   BANGRES18DEL02 |  BANG|
|   BANGRES19DEL01 |  BANG|
|  COIMBRES13DEL02 | COIMB|
|   CHENRES12DEL01 |  CHEN|
|    HYDRES09DEL03 |   HYD|
| RANCHIRES15DEL01 |RANCHI|
|    MYSRES15DEL02 |   MYS|
|    HYDRES05DEL02 |   HYD|
|    DEHRES17DEL01 |   DEH|
|    KOCRES16DEL01 |   KOC|
|   PUNERES13DEL03 |  PUNE|
|   LUDHRES15DEL02 |  LUDH|
|    KNPRES14DEL02 |   KNP|
|    MUMRES15DEL03 |   MUM|
|    MYSRES01DEL01 |   MYS|
|   PUNERES20DEL01 |  PUNE|
|    HYDRES14DEL01 |   HYD|
|    KOLRES15DEL03 |   KOL|
|   PUNERES19DEL02 |  PUNE|
+------------------+------+
only showing top 20 rows



In [79]:
df.select("City").distinct().show(22)

[Stage 93:>                                                         (0 + 2) / 2]

+------+
|  City|
+------+
|  LUDH|
|  CHEN|
|   KOC|
|   GOA|
|  AURG|
|   JAP|
|   DEH|
|   MUM|
|   AGR|
|   SUR|
|  INDO|
|  PUNE|
|   ALH|
|   MYS|
| COIMB|
|   HYD|
|   VAD|
|RANCHI|
|   BHP|
|   KOL|
|   KNP|
|  BANG|
+------+



                                                                                

<blockquote><p style="font-size:20px; color:#159364; font-family:verdana;">3. Getting Pickup time:</p></blockquote>

In [88]:
## equivalent value counts in python:
## Looks like there are ~1700 rows of null values in this column.
df.groupBy('Time_Orderd').count().sort(col("count").desc()).show(10)

+-----------+-----+
|Time_Orderd|count|
+-----------+-----+
|       NaN | 1731|
|   21:55:00|  461|
|   17:55:00|  456|
|   20:00:00|  449|
|   22:20:00|  448|
|   21:35:00|  446|
|   19:50:00|  444|
|   21:15:00|  442|
|   21:20:00|  438|
|   22:45:00|  438|
+-----------+-----+
only showing top 10 rows



In [87]:
df.groupBy('Time_Order_picked').count().sort(col("count").desc()).show(10)

+-------------------+-----+
|  Time_Order_picked|count|
+-------------------+-----+
|2023-08-17 21:30:00|  496|
|2023-08-17 22:50:00|  474|
|2023-08-17 22:40:00|  458|
|2023-08-17 18:40:00|  457|
|2023-08-17 17:55:00|  456|
|2023-08-17 21:45:00|  456|
|2023-08-17 22:25:00|  455|
|2023-08-17 18:05:00|  454|
|2023-08-17 23:50:00|  453|
|2023-08-17 20:50:00|  453|
+-------------------+-----+
only showing top 10 rows



<div class="alert alert-block alert-info" style="font-size:14px; font-family:verdana; line-height: 1.7em;">
    📌 &nbsp; A go-to approach will be to calculate average pickup time using other non null rows and then imputing the null rows with the average obtained.
</div>

<blockquote><p style="font-size:20px; color:#159364; font-family:verdana;">5. Cleaning the target variable:</p></blockquote>

#### Use withColumnRenamed method to rename a column.

In [70]:
## Before transformation:
df.select("Time_taken(min)").show(5)

+---------------+
|Time_taken(min)|
+---------------+
|       (min) 24|
|       (min) 33|
|       (min) 26|
|       (min) 21|
|       (min) 30|
+---------------+
only showing top 5 rows



In [71]:
## Renaming the column name::
df=df.withColumnRenamed('Time_taken(min)','time_taken')

## Removing the preffix (i.e. '(min)') in the column values with the help of a UDF:
def target_clean(x):
    return x[-2:]

target_clean_udf=udf(lambda x:target_clean(x),StringType())
df=df.withColumn("time_taken",target_clean_udf(col("time_taken")))
## Converting type:
df=df.withColumn("time_taken",col("time_taken").cast(IntegerType()))

In [74]:
## As you can see, the values have been cleaned and the type has been changed:
df.select("time_taken").show(5),df.select("time_taken").dtypes

+----------+
|time_taken|
+----------+
|        24|
|        33|
|        26|
|        21|
|        30|
+----------+
only showing top 5 rows



(None, [('time_taken', 'int')])

In [None]:
7. 

# from geopy.distance import geodesic 

# train['distance_diff_KM']=np.zeros(len(train))
# restaurant_cordinates_train=train[['Restaurant_latitude','Restaurant_longitude']].to_numpy()
# delivery_location_cordinates_train=train[['Delivery_location_latitude','Delivery_location_longitude']].to_numpy()

# for i in range(len(train)):
#     train['distance_diff_KM'].loc[i]=geodesic(restaurant_cordinates_train[i],delivery_location_cordinates_train[i])