In [6]:
 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [7]:
df = pd.read_csv('Data_Train.csv')

In [8]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [9]:
df.drop(columns=['Additional_Info', 'Route'], inplace=True)
df.dropna(inplace=True)

In [10]:
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897
1,Air India,1/05/2019,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10682 entries, 0 to 10682
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Airline          10682 non-null  object
 1   Date_of_Journey  10682 non-null  object
 2   Source           10682 non-null  object
 3   Destination      10682 non-null  object
 4   Dep_Time         10682 non-null  object
 5   Arrival_Time     10682 non-null  object
 6   Duration         10682 non-null  object
 7   Total_Stops      10682 non-null  object
 8   Price            10682 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 834.5+ KB


In [12]:
df['Date_of_Journey'] = pd.to_datetime(df['Date_of_Journey'], format='%d/%m/%Y')
df['Journey_Day'] = df['Date_of_Journey'].dt.day
df['Journey_Month'] = df['Date_of_Journey'].dt.month
df.drop(columns=['Date_of_Journey'], inplace=True)

In [13]:
df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price,Journey_Day,Journey_Month
0,IndiGo,Banglore,New Delhi,22:20,01:10 22 Mar,2h 50m,non-stop,3897,24,3
1,Air India,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,04:25 10 Jun,19h,2 stops,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302,1,3


In [14]:
df['Arrival_Time'] = df['Arrival_Time'].str.split().str[0]
df['Dep_Time'] = df['Dep_Time'].str.strip() 

In [15]:
df.head()

Unnamed: 0,Airline,Source,Destination,Dep_Time,Arrival_Time,Duration,Total_Stops,Price,Journey_Day,Journey_Month
0,IndiGo,Banglore,New Delhi,22:20,01:10,2h 50m,non-stop,3897,24,3
1,Air India,Kolkata,Banglore,05:50,13:15,7h 25m,2 stops,7662,1,5
2,Jet Airways,Delhi,Cochin,09:25,04:25,19h,2 stops,13882,9,6
3,IndiGo,Kolkata,Banglore,18:05,23:30,5h 25m,1 stop,6218,12,5
4,IndiGo,Banglore,New Delhi,16:50,21:35,4h 45m,1 stop,13302,1,3


In [16]:
df['Dep_Hour'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.hour
df['Dep_Minute'] = pd.to_datetime(df['Dep_Time'], format='%H:%M').dt.minute
df.drop(columns=['Dep_Time'], inplace=True)

In [17]:
df.head

<bound method NDFrame.head of            Airline    Source Destination Arrival_Time Duration Total_Stops  \
0           IndiGo  Banglore   New Delhi        01:10   2h 50m    non-stop   
1        Air India   Kolkata    Banglore        13:15   7h 25m     2 stops   
2      Jet Airways     Delhi      Cochin        04:25      19h     2 stops   
3           IndiGo   Kolkata    Banglore        23:30   5h 25m      1 stop   
4           IndiGo  Banglore   New Delhi        21:35   4h 45m      1 stop   
...            ...       ...         ...          ...      ...         ...   
10678     Air Asia   Kolkata    Banglore        22:25   2h 30m    non-stop   
10679    Air India   Kolkata    Banglore        23:20   2h 35m    non-stop   
10680  Jet Airways  Banglore       Delhi        11:20       3h    non-stop   
10681      Vistara  Banglore   New Delhi        14:10   2h 40m    non-stop   
10682    Air India     Delhi      Cochin        19:15   8h 20m     2 stops   

       Price  Journey_Day  Journe

In [18]:
df['Arrival_Hour'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M').dt.hour
df['Arrival_Minute'] = pd.to_datetime(df['Arrival_Time'], format='%H:%M').dt.minute
df.drop(columns=['Arrival_Time'], inplace=True)

In [20]:
df.head()

Unnamed: 0,Airline,Source,Destination,Duration,Total_Stops,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute
0,IndiGo,Banglore,New Delhi,2h 50m,non-stop,3897,24,3,22,20,1,10
1,Air India,Kolkata,Banglore,7h 25m,2 stops,7662,1,5,5,50,13,15
2,Jet Airways,Delhi,Cochin,19h,2 stops,13882,9,6,9,25,4,25
3,IndiGo,Kolkata,Banglore,5h 25m,1 stop,6218,12,5,18,5,23,30
4,IndiGo,Banglore,New Delhi,4h 45m,1 stop,13302,1,3,16,50,21,35


In [21]:
df['Duration_Hours'] = df['Duration'].apply(lambda x: int(x.split('h')[0]) if 'h' in x else 0)
df['Duration_Minutes'] = df['Duration'].apply(lambda x: int(x.split('h')[-1].replace('m', '')) if 'm' in x else 0)
df.drop(columns=['Duration'], inplace=True)

In [22]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute,Duration_Hours,Duration_Minutes
0,IndiGo,Banglore,New Delhi,non-stop,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2 stops,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2 stops,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1 stop,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1 stop,13302,1,3,16,50,21,35,4,45


In [23]:
df['Total_Stops'] = df['Total_Stops'].astype(str).replace({'non-stop': 0,'1 stop': 1,'2 stops': 2,'3 stops': 3,'4 stops': 4}).astype(int)

  df['Total_Stops'] = df['Total_Stops'].astype(str).replace({'non-stop': 0,'1 stop': 1,'2 stops': 2,'3 stops': 3,'4 stops': 4}).astype(int)


In [24]:
df.head()

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Journey_Day,Journey_Month,Dep_Hour,Dep_Minute,Arrival_Hour,Arrival_Minute,Duration_Hours,Duration_Minutes
0,IndiGo,Banglore,New Delhi,0,3897,24,3,22,20,1,10,2,50
1,Air India,Kolkata,Banglore,2,7662,1,5,5,50,13,15,7,25
2,Jet Airways,Delhi,Cochin,2,13882,9,6,9,25,4,25,19,0
3,IndiGo,Kolkata,Banglore,1,6218,12,5,18,5,23,30,5,25
4,IndiGo,Banglore,New Delhi,1,13302,1,3,16,50,21,35,4,45


In [25]:
Q1 = df['Price'].quantile(0.25)  
Q3 = df['Price'].quantile(0.75)  
IQR = Q3 - Q1 

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [26]:
df = df[(df['Price'] >= lower_bound) & (df['Price'] <= upper_bound)]

In [27]:
X = df.drop(columns=['Price'])  
y = df['Price']

In [28]:
categorical_cols = ['Airline', 'Source', 'Destination']
numerical_cols = ['Total_Stops', 'Journey_Day', 'Journey_Month', 'Dep_Hour', 'Dep_Minute', 
                  'Arrival_Hour', 'Arrival_Minute', 'Duration_Hours', 'Duration_Minutes']

In [29]:
preprocessor = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols),
    ('scaler', StandardScaler(), numerical_cols)])

In [30]:
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
model_pipeline.fit(X_train, y_train)

In [33]:
y_pred = model_pipeline.predict(X_test)



In [34]:
r2 = r2_score(y_test, y_pred)

In [35]:
print(f'R2 Score: {r2:.4f}')

R2 Score: 0.7074
