In [20]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle

In [21]:
df = pd.read_csv("New_clean_data.csv", parse_dates=["Date_of_Journey"])
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,2019-03-24,Banglore,New Delhi,170,0,No info,3897
1,Air India,2019-05-01,Kolkata,Banglore,445,2,No info,7662
2,Jet Airways,2019-06-09,Delhi,Cochin,1140,2,No info,13882
3,IndiGo,2019-05-12,Kolkata,Banglore,325,1,No info,6218
4,IndiGo,2019-03-01,Banglore,New Delhi,285,1,No info,13302


In [29]:
def convert_duration(duration_str):
    try:
        parts = duration_str.strip().lower().replace(' ', '').replace('h', ':').replace('m', '').split(':')
        if len(parts) == 2:
            hours, minutes = int(parts[0]), int(parts[1])
        elif 'h' in duration_str:
            hours, minutes = int(parts[0]), 0
        else:
            hours, minutes = 0, int(parts[0])
        return hours * 60 + minutes
    except:
        return 0  # fallback for invalid format

df['Duration'] = df['Duration'].apply(convert_duration)
df['Duration'].head()

0    0
1    0
2    0
3    0
4    0
Name: Duration, dtype: int64

In [30]:
X = df[['Airline', 'Source', 'Destination', 'Duration', 'Total_Stops', 'Date_of_Journey']]
y = df['Price']

In [31]:
categorical_features = ['Airline', 'Source', 'Destination', 'Total_Stops']
numeric_features = ['Duration']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='drop'
)


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [33]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42
                                          )
}

results = {}

for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    score = r2_score(y_test, y_pred)
    results[name] = (score, pipe)

# Convert results dict to DataFrame
results_df = pd.DataFrame([
    {"Model": name, "R2 Score": score}
    for name, (score, _) in results.items()
])

# Sort by R2 Score descending
results_df = results_df.sort_values(by="R2 Score", ascending=False).reset_index(drop=True)

# Display
results_df

Unnamed: 0,Model,R2 Score
0,Random Forest,0.626392
1,Decision Tree,0.626141
2,Linear Regression,0.585683


In [34]:
best_model_name = max(results, key=lambda name: results[name][0])
best_score, best_pipeline = results[best_model_name]

print(f"Best Model: {best_model_name} with R² Score: {best_score:.4f}")


Best Model: Random Forest with R² Score: 0.6264


In [40]:

airline = input("Enter Airline (e.g., IndiGo, Air India, etc.): ")
date_of_Journey= input("Enter Date of Journey (DD/MM/YYYY): ")
source = input("Enter Source city (e.g., Delhi, Kolkata): ")
destination= input("Enter Destination city (e.g., Cochin, Delhi): ")
duration= input("Enter Duration (e.g., '2h 50m', '50m', '2h'): ")
total_Stops= input("Enter Total Stops (e.g., 'non-stop', '1 stop', '2 stops'): ")

from datetime import datetime
def convert_duration(duration_str):
    try:
        parts = duration_str.strip().lower().replace(' ', '').replace('h', ':').replace('m', '').split(':')
        if len(parts) == 2:
            hours, minutes = int(parts[0]), int(parts[1])
        elif 'h' in duration_str:
            hours, minutes = int(parts[0]), 0
        else:
            hours, minutes = 0, int(parts[0])
        return hours * 60 + minutes
    except:
        return 0

duration_mins = convert_duration(duration)
date_parsed = pd.to_datetime(datetime.strptime(date_of_Journey, "%d/%m/%Y"))

# Construct DataFrame for prediction
columns = ['Airline', 'Date_of_Journey', 'Source', 'Destination', 'Duration', 'Total_Stops']
user_input = pd.DataFrame([[airline, date_parsed, source, destination, duration_mins, total_Stops]], columns=columns)

# Predict using best pipeline
predicted_price = best_pipeline.predict(user_input)
print(f"Predicted Price: ₹ {predicted_price[0]:,.2f}")

Enter Airline (e.g., IndiGo, Air India, etc.):  IndiGo
Enter Date of Journey (DD/MM/YYYY):  8/8/2025
Enter Source city (e.g., Delhi, Kolkata):  Kolkata
Enter Destination city (e.g., Cochin, Delhi):  Delhi
Enter Duration (e.g., '2h 50m', '50m', '2h'):  2h 50m
Enter Total Stops (e.g., 'non-stop', '1 stop', '2 stops'):  2 stops


Predicted Price: ₹ 6,647.86


In [41]:

with open("PlaneTicketPricePredcitionfinal123.pkl", "wb") as f:
    pickle.dump(best_pipeline, f)