In [1]:
 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('Data_Train.csv')

In [3]:
df.drop(columns=['Additional_Info', 'Route'], inplace=True)
df.dropna(inplace=True)

In [4]:
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
df['Journey_Day'] = df['Date_of_Journey'].dt.day
df['Journey_Month'] = df['Date_of_Journey'].dt.month
df.drop(columns=['Date_of_Journey'], inplace=True)

In [None]:
df['Arrival_Time'] = df['Arrival_Time'].str.split().str[0]
df['Dep_Time'] = df['Dep_Time'].str.strip() 

In [6]:
df['Dep_Hour'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.hour
df['Dep_Minute'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.minute
df.drop(columns=['Dep_Time'], inplace=True)

In [7]:
df['Arrival_Hour'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M').dt.hour
df['Arrival_Minute'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M').dt.minute
df.drop(columns=['Arrival_Time'], inplace=True)

In [8]:
df['Duration_Hours'] = df['Duration'].apply(lambda x: int(x.split('h')[0]) if 'h' in x else 0)
df['Duration_Minutes'] = df['Duration'].apply(lambda x: int(x.split('h')[-1].replace('m', '')) if 'm' in x else 0)
df.drop(columns=['Duration'], inplace=True)

In [9]:
df['Total_Stops'] = df['Total_Stops'].astype(str).replace({'non-stop': 0,'1 stop': 1,'2 stops': 2,'3 stops': 3,'4 stops': 4}).astype(int)

  df['Total_Stops'] = df['Total_Stops'].astype(str).replace({'non-stop': 0,'1 stop': 1,'2 stops': 2,'3 stops': 3,'4 stops': 4}).astype(int)


In [10]:
Q1 = df['Price'].quantile(0.25)  
Q3 = df['Price'].quantile(0.75)  
IQR = Q3 - Q1 

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [11]:
df = df[(df['Price'] >= lower_bound) & (df['Price'] <= upper_bound)]

In [12]:
X = df.drop(columns=['Price'])  
y = df['Price']

In [13]:
categorical_cols = ['Airline', 'Source', 'Destination']
numerical_cols = ['Total_Stops', 'Journey_Day', 'Journey_Month', 'Dep_Hour', 'Dep_Minute', 
                  'Arrival_Hour', 'Arrival_Minute', 'Duration_Hours', 'Duration_Minutes']

In [15]:
preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols)])

In [16]:
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=1.0))  # Regularization strength (α)
])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
ridge_pipeline.fit(X_train, y_train)

In [19]:
y_pred = ridge_pipeline.predict(X_test)



In [20]:
r2_ridge = r2_score(y_test, y_pred)

In [21]:
print(f'R2 Score: {r2_ridge:.4f}')

R2 Score: 0.6387
