In [158]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
import warnings
warnings.simplefilter("ignore")

In [159]:
path = "/Users/aryagupta/Desktop/food-delivery-estimation/food_time_prediction_using_mlops/data/raw/train.csv"

data = pd.read_csv(path)
data = data.drop([45593], axis = 0) # contains all null value

In [160]:
def hello(data, age_equal_15, six_star_ratings):
    # Lowercase column names in-place
    data.columns = data.columns.str.lower()
    
    # Rename columns in-place
    data.rename({
        "delivery_person_id": "person_id", 
        "delivery_person_age" : "age", 
        "delivery_person_ratings" : "ratings",
        "delivery_location_latitude": "delivery_latitude",
        "delivery_location_longitude" : "delivery_longitude",
        "time_order_picked" : "order_picked",
        "weatherconditions" : "weather",
        "road_traffic_density" : "traffic", 
        "type_of_order" : "order_type",
        "time_taken(min)" : "time",
        "city" : 'city_category',
        'festival' : 'is_festival',
        'type_of_vehicle' : 'vehical_type',
        'time_orderd' : 'order_time',                        
    }, axis=1, inplace=True)

    # ID Feature
    data['id'] = data['id'].replace("NaN", np.nan)

    # city colm
    data['city'] = data['person_id'].str.split('RES').str.get(0)

    # person_id
    data['person_id'] = data['person_id'].replace("NaN", np.nan)

    # age
    data['age'] = data['age'].replace("NaN", np.nan)
    data['age'] = data['age'].astype('float').round()
    
    # ratings
    data['ratings'] = data['ratings'].replace("NaN", np.nan)
    data['ratings'] = data['ratings'].astype('float')

    # restaurant_latitude
    data['restaurant_latitude'] = data['restaurant_latitude'].replace("NaN", np.nan)

    # restaurant_longitude
    data['restaurant_longitude'] = data['restaurant_longitude'].replace("NaN", np.nan)

    # delivery_latitude
    data['delivery_latitude'] = data['delivery_latitude'].replace("NaN", np.nan)

    # delivery_longitude
    data['delivery_longitude'] = data['delivery_longitude'].replace("NaN", np.nan)

    # order_date
    data['order_date'] = pd.to_datetime(data['order_date'], errors='coerce')

    # time_ordered
    data['order_time'] = data['order_time'].replace("NaN", np.nan)
    data['order_time'] = pd.to_datetime(data['order_time'], errors='coerce')

    # order_picked
    data['order_picked'] = pd.to_datetime(data['order_picked'], errors='coerce')

    # weather
    data['weather'] = data['weather'].replace("conditions NaN", np.nan)
    data['weather'] = data['weather'].str.lower()
    data['weather'] = data['weather'].str.replace("conditions ", "").str.strip()

    # traffic
    data['traffic'] = data['traffic'].replace("NaN ", np.nan)
    data['traffic'] = data['traffic'].str.lower()

    # vehical_condition
    data['vehicle_condition'] = data['vehicle_condition'].replace("NaN", np.nan)
    data['vehicle_condition'] = data['vehicle_condition'].astype('Int64')

    # order_type
    data['order_type'] = data['order_type'].replace("NaN ", np.nan)
    data['order_type'] = data['order_type'].str.lower()

    # type_of_vehicle
    data['vehical_type'] = data['vehical_type'].replace("NaN ", np.nan)

    # multiple_deliveries
    data['multiple_deliveries'] = data['multiple_deliveries'].replace("NaN ", np.nan)
    data['multiple_deliveries'] = data['multiple_deliveries'].astype('float')

    # festival
    data['is_festival'] = data['is_festival'].replace("NaN ", np.nan)
    data['is_festival'] = data['is_festival'].str.lower()

    # city
    data['city_category'] = data['city_category'].replace("NaN ", np.nan)
    data['city_category'] = data['city_category'].str.lower()

    # time
    data['time'] = data['time'].str.replace(r"\(min\)", "", regex=True)
    # data['time'] = pd.to_numeric(data['time'], errors='coerce')

    # Drop rows where age is 15
    data = data.drop(index = age_equal_15.index)
    data = data.drop(index = six_star_ratings.index)

    # Latitude/Longitude validation and cleaning
    loc_columns = ['restaurant_latitude', 'restaurant_longitude', 'delivery_latitude', 'delivery_longitude']
    lower_bound_lat_ind = 6.44
    lower_bound_long_ind = 68.70

    # Replace invalid lat/long values with NaN
    for col in loc_columns:
        if "latitude" in col:
            data[col] = np.where(data[col] < lower_bound_lat_ind, np.nan, data[col])
        elif "longitude" in col:
            data[col] = np.where(data[col] < lower_bound_long_ind, np.nan, data[col])

    # Datetime feature extraction
    date_col = pd.to_datetime(data['order_date'], dayfirst=True)
    data['day'] = date_col.dt.day
    data['month'] = date_col.dt.month
    data['year'] = date_col.dt.year
    data['day_of_week'] = date_col.dt.day_name()
    data['is_weekend'] = date_col.dt.day_name().isin(["Saturday", "Sunday"]).astype(int)

    # Hour and time of day from order time
    order_hour = pd.to_datetime(data['order_time'], errors='coerce').dt.hour
    data['order_time_hour'] = order_hour

    def time_of_day(hours):
        return np.select(
            condlist=[
                hours.between(6, 12, inclusive='left'),
                hours.between(12, 17, inclusive='left'),
                hours.between(17, 20, inclusive='left'),
                hours.between(20, 24, inclusive='left')
            ],
            choicelist=["morning", "afternoon", "evening", "night"],
            default="after_midnight"
        )

    data['order_time_of_day'] = time_of_day(order_hour)

    # Pickup time in minutes
    valid_times = data[['order_time', 'order_picked']].dropna()
    pickup_duration = (valid_times['order_picked'] - valid_times['order_time']).dt.total_seconds() / 60
    data['pickup_time'] = pickup_duration

    def calculate_haversine_distance(data, loc_columns):
        lat1 = data[loc_columns[0]]
        lon1 = data[loc_columns[1]]
        lat2 = data[loc_columns[2]]
        lon2 = data[loc_columns[3]]
        lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
        dlon = lon2 - lon1
        dlat = lat2 - lat1
        a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
        c = 2 * np.arcsin(np.sqrt(a))
        distance = 6371 * c  # Earth radius in km
        return data.assign(distance=distance)

    data = calculate_haversine_distance(data, loc_columns)

    data.to_csv("/Users/aryagupta/Desktop/food-delivery-estimation/food_time_prediction_using_mlops/data/interim/train_interim.csv")
    return data


hello(data, age_equal_15, six_star_ratings)

Unnamed: 0,id,person_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,order_time,...,city,day,month,year,day_of_week,is_weekend,order_time_hour,order_time_of_day,pickup_time,distance
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,2025-05-23 11:30:00,...,INDO,19,3,2022,Saturday,1,11.0,morning,15.0,3.025149
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,2025-05-23 19:45:00,...,BANG,25,3,2022,Friday,0,19.0,evening,5.0,20.183530
2,0x5d6d,BANGRES19DEL01,23.0,4.4,12.914264,77.678400,12.924264,77.688400,2022-03-19,2025-05-23 08:30:00,...,BANG,19,3,2022,Saturday,1,8.0,morning,15.0,1.552758
3,0x7a6a,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,2025-05-23 18:00:00,...,COIMB,5,4,2022,Tuesday,0,18.0,evening,10.0,7.790401
4,0x70a2,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,2025-05-23 13:30:00,...,CHEN,26,3,2022,Saturday,1,13.0,afternoon,15.0,6.210138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45588,0x7c09,JAPRES04DEL01,30.0,4.8,26.902328,75.794257,26.912328,75.804257,2022-03-24,2025-05-23 11:35:00,...,JAP,24,3,2022,Thursday,0,11.0,morning,10.0,1.489846
45589,0xd641,AGRRES16DEL01,21.0,4.6,,,,,2022-02-16,2025-05-23 19:55:00,...,AGR,16,2,2022,Wednesday,0,19.0,evening,15.0,
45590,0x4f8d,CHENRES08DEL03,30.0,4.9,13.022394,80.242439,13.052394,80.272439,2022-03-11,2025-05-23 23:50:00,...,CHEN,11,3,2022,Friday,0,23.0,night,-1425.0,4.657195
45591,0x5eee,COIMBRES11DEL01,20.0,4.7,11.001753,76.986241,11.041753,77.026241,2022-03-07,2025-05-23 13:35:00,...,COIMB,7,3,2022,Monday,0,13.0,afternoon,5.0,6.232393


### Dont delete

In [162]:
six_star_ratings = data[data['ratings'] == 6]
six_star_ratings.head()

Unnamed: 0,id,person_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,order_time,...,weather,traffic,vehicle_condition,order_type,vehical_type,multiple_deliveries,is_festival,city_category,time,city
3586,0x46d,BANGRES05DEL01,50.0,6.0,-12.970324,-77.645748,13.010324,77.685748,2022-03-13,NaT,...,,,3,meal,electric_scooter,0.0,no,urban,25,BANG
4714,0x493,HYDRES17DEL01,50.0,6.0,-17.451976,-78.385883,17.561976,78.495883,2022-04-04,NaT,...,,,3,snack,bicycle,0.0,no,metropolitian,27,HYD
5169,0x4f2,JAPRES08DEL01,50.0,6.0,-26.910262,-75.783013,27.020262,75.893013,2022-03-18,NaT,...,,,3,drinks,scooter,1.0,no,metropolitian,20,JAP
5362,0x430,BANGRES19DEL01,50.0,6.0,12.914264,77.6784,13.024264,77.7884,2022-04-06,NaT,...,,,3,meal,electric_scooter,1.0,no,metropolitian,18,BANG
5651,0xbef1,AGRRES13DEL02,50.0,6.0,-27.159795,-78.04299,27.209795,78.09299,2022-02-13,NaT,...,,,3,drinks,scooter,1.0,no,metropolitian,20,AGR


In [163]:
# don't delete
age_equal_15 = data[data['age'] == 15]
age_equal_15.head()

Unnamed: 0,id,person_id,age,ratings,restaurant_latitude,restaurant_longitude,delivery_latitude,delivery_longitude,order_date,order_time,...,weather,traffic,vehicle_condition,order_type,vehical_type,multiple_deliveries,is_festival,city_category,time,city
2387,0x564,JAPRES15DEL03,15.0,1.0,-26.891191,75.802083,26.981191,75.892083,2022-03-12,NaT,...,,,3,meal,motorcycle,0.0,no,urban,15,JAP
2905,0xcd0,INDORES010DEL03,15.0,1.0,22.75004,75.902847,22.81004,75.962847,2022-04-03,NaT,...,,,3,snack,scooter,1.0,no,metropolitian,29,INDO
2951,0x91a,SURRES17DEL03,15.0,1.0,21.149569,72.772697,21.209569,72.832697,2022-03-21,NaT,...,,,3,buffet,bicycle,1.0,no,metropolitian,20,SUR
5902,0x474,CHENRES15DEL03,15.0,1.0,13.026286,80.275235,13.056286,80.305235,2022-03-11,NaT,...,,,3,drinks,bicycle,1.0,no,metropolitian,25,CHEN
9156,0x73f,BANGRES05DEL01,15.0,1.0,12.970324,77.645748,13.080324,77.755748,2022-03-25,NaT,...,,,3,buffet,motorcycle,2.0,no,metropolitian,34,BANG
