In [2]:
# ------------------------------------------------------------------
# Below is an example of a minimal notebook that:
# 1. Reads the dataset
# 2. Drops the Route column
# 3. Performs label encoding
# 4. Trains a model (Random Forest as an example)
# 5. Saves the trained model and encoders
# ------------------------------------------------------------------

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# =========================
# Cell 1: Load the dataset
# =========================


df = pd.read_csv("data/train.csv")

# Convert 'Price' to numeric if needed
df["Price"] = pd.to_numeric(df["Price"], errors='coerce')

# =========================================
# Cell 2: Data Cleaning & Feature Engineering
# =========================================

# 1) Drop the Route column since we do NOT want to use it at all
df.drop(["Route"], axis=1, inplace=True)

# 2) Convert 'Date_of_Journey' to datetime, and extract day/month
df["Date_of_Journey"] = pd.to_datetime(df["Date_of_Journey"], format="%d/%m/%Y")
df["Date"] = df["Date_of_Journey"].dt.day
df["Month"] = df["Date_of_Journey"].dt.month
df.drop(["Date_of_Journey"], axis=1, inplace=True)

# 3) Convert 'Total_Stops' from text to numeric
# Example: "1 stop" -> 1, "non-stop" -> 0
def parse_stops(stops):
    stops = stops.lower().strip()
    if stops == "non-stop":
        return 0
    return int(stops.split()[0])  # "1 stop" -> "1"

# Fill missing values with "non-stop" before applying the function
df["Total_Stops"].fillna("non-stop", inplace=True)
df["Total_Stops"] = df["Total_Stops"].apply(parse_stops)

# 4) Parse Dep_Time, Arrival_Time, and Duration
def parse_time_str(t):
    # t might have 'HH:MM' or 'HH:MM DD Mon'
    # We'll just take the first HH:MM
    return t.split()[0]

df["Dep_Time"] = df["Dep_Time"].apply(parse_time_str)
df["Arrival_Time"] = df["Arrival_Time"].apply(parse_time_str)

# Extract hours and minutes
def split_hour_min(t):
    hh, mm = t.split(":")
    return int(hh), int(mm)

df["Dep_hour"], df["Dep_min"] = zip(*df["Dep_Time"].apply(split_hour_min))
df["Arrival_hour"], df["Arrival_min"] = zip(*df["Arrival_Time"].apply(split_hour_min))

# Parse durations, e.g. "10h 55m" -> (10, 55), "4h" -> (4,0)
def parse_duration(d):
    d = d.lower()
    hr = 0
    mn = 0
    if 'h' in d:
        parts = d.split('h')
        hr = int(parts[0].strip())
        if 'm' in parts[1]:
            mn = int(parts[1].split('m')[0].strip())
    elif 'm' in d:
        mn = int(d.split('m')[0].strip())
    return hr, mn

df["Duration_hour"], df["Duration_min"] = zip(*df["Duration"].apply(parse_duration))

# Drop original time columns now that we have numeric
df.drop(["Dep_Time", "Arrival_Time", "Duration"], axis=1, inplace=True)

# 5) Label encode categorical columns
cat_cols = ["Airline", "Source", "Destination", "Additional_Info"]
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# =================================
# Cell 3: Split data & Train Model
# =================================
X = df.drop(["Price"], axis=1)
y = df["Price"]

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = RandomForestRegressor(n_estimators=10, random_state=42)
model.fit(x_train, y_train)

# Evaluate quickly
score = model.score(x_test, y_test)
print("Model R2:", score)

# =================================
# Cell 4: Save model & encoders
# =================================
joblib.dump(model, "models/best_model.pkl")
joblib.dump(encoders, "models/encoders.pkl")

print("Model and encoders saved successfully!")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Total_Stops"].fillna("non-stop", inplace=True)


Model R2: 0.8522463892515735
Model and encoders saved successfully!
