In [5]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import BinaryEncoder
from sklearn.linear_model import LogisticRegression
from category_encoders import OneHotEncoder

In [27]:
df=df=pd.read_csv(r'D:\training\price_prediction.csv')
df.head()

Unnamed: 0,Airline,Source,Destination,Additional_Info,Journey_Month_num,Journey_Day_num,Dep_Hour,Arrival_Hour,Duration_min,Stops,Price
0,IndiGo,Banglore,New Delhi,No info,3,6,22,1,170,0,3897
1,Air India,Kolkata,Banglore,No info,5,2,5,13,445,2,7662
2,Jet Airways,Delhi,Cochin,No info,6,6,9,4,1140,2,13882
3,IndiGo,Kolkata,Banglore,No info,5,6,18,23,325,1,6218
4,IndiGo,Banglore,New Delhi,No info,3,4,16,21,285,1,13302


In [28]:
obj_features=df.select_dtypes(include='O')
for feature in obj_features:
    print(f"Feature {feature} has {obj_features[feature].nunique()} unique values ")

Feature Airline has 12 unique values 
Feature Source has 5 unique values 
Feature Destination has 6 unique values 
Feature Additional_Info has 9 unique values 


In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

Encoder = ColumnTransformer(transformers=[
    ('OHE', OneHotEncoder(drop='first', sparse_output=False), ['Source','Destination']),
    ('BE', BinaryEncoder(), ['Airline','Additional_Info'])
], remainder='passthrough')

In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor

In [31]:
steps=[]
steps.append(('Encoder',Encoder))
steps.append(("Scaler" , RobustScaler()))
steps.append(("Model" , LGBMRegressor()))
pipeline = Pipeline(steps=steps)
pipeline

In [32]:
X=df.drop('Price',axis=1)
y=df['Price']

In [33]:
# Specify the scoring metrics
scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

# Perform cross-validation and evaluate the model's performance
cv_results = cross_validate(pipeline, X, y, cv=5, scoring=scoring_metrics, return_train_score=True)

# Print the mean scores for each metric
print("Train MSE:", -cv_results['train_neg_mean_squared_error'].mean())
print("Test MSE:", -cv_results['test_neg_mean_squared_error'].mean())
print("Train MAE:", -cv_results['train_neg_mean_absolute_error'].mean())
print("Test MAE:", -cv_results['test_neg_mean_absolute_error'].mean())
print("Train R-squared:", cv_results['train_r2'].mean())
print("Test R-squared:", cv_results['test_r2'].mean())

Train MSE: 2908503.226289744
Test MSE: 4200338.979598369
Train MAE: 985.9603032666018
Test MAE: 1112.367762383659
Train R-squared: 0.8639989286155784
Test R-squared: 0.8040593384521468


In [34]:
models = list()
models.append(("LR" , LinearRegression()))
models.append(("CART" , DecisionTreeRegressor()))
models.append(("RF" , RandomForestRegressor()))
models.append(("GB" , GradientBoostingRegressor()))
models.append(("lgb" , LGBMRegressor()))

In [35]:
for model in models:
    steps = []
    steps.append(("Encoder" , Encoder))
    steps.append(("Scaler" , RobustScaler()))
    steps.append(model)
    pipeline = Pipeline(steps=steps)
    scores = cross_validate(pipeline ,X ,y , cv = 5 , scoring="r2" , return_train_score=True)
    print(model[0])
    print("Train_accuracy" , scores["train_score"].mean() )
    print("-" * 10)
    print("Test_accuracy" , scores["test_score"].mean())
    print("-" * 20)
    print("\n")

LR
Train_accuracy 0.5795064544446465
----------
Test_accuracy 0.5753954122548469
--------------------


CART
Train_accuracy 0.975916688226626
----------
Test_accuracy 0.7191796690753838
--------------------


RF
Train_accuracy 0.95756729781546
----------
Test_accuracy 0.8178158694222415
--------------------


GB
Train_accuracy 0.7743570242317617
----------
Test_accuracy 0.7537150820153424
--------------------


lgb
Train_accuracy 0.8639989286155784
----------
Test_accuracy 0.8040593384521468
--------------------




In [229]:
grid_search.best_params_

{'Model__colsample_bytree': 0.5,
 'Model__learning_rate': 0.1,
 'Model__max_depth': 10,
 'Model__min_child_samples': 10,
 'Model__n_estimators': 350,
 'Model__reg_alpha': 0.1,
 'Model__reg_lambda': 0.1,
 'Model__subsample': 0.3}

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import BinaryEncoder

# Define the column transformers for preprocessing
column_transformer = ColumnTransformer(transformers=[
    ('OHE', OneHotEncoder(drop='first', sparse_output=False), ['Source','Destination']),
    ('BE', BinaryEncoder(), ['Airline','Additional_Info'])
], remainder='passthrough')


# Create the pipeline
pipeline = Pipeline([
    ('Encoder', column_transformer),
    ('Scaler', RobustScaler()),
    ('Model', LGBMRegressor())
])


param_grid = {
    'Model__learning_rate': [0.1, 0.01],
    'Model__n_estimators': [200,300,400, 600],
    'Model__max_depth': [5,7,9],
    'Model__colsample_bytree': [0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    'Model__reg_alpha': [0.0, 0.1, 0.5],  # L1 regularization term on weights
    'Model__reg_lambda': [0.0, 0.1, 0.5],  # L2 regularization term on weights}
}


# Perform grid search and cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='r2', return_train_score=True, n_jobs=-1)
grid_search.fit(X, y)

# Print the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'Model__colsample_bytree': 0.9, 'Model__learning_rate': 0.1, 'Model__max_depth': 7, 'Model__n_estimators': 600, 'Model__reg_alpha': 0.1, 'Model__reg_lambda': 0.1}
Best Score: 0.8359679524233334


In [37]:
print('mean_train_score',grid_search.cv_results_["mean_train_score"].mean())
print('mean_test_score',grid_search.cv_results_["mean_test_score"].mean())    

mean_train_score 0.8356142217801791
mean_test_score 0.7855086681405168


In [38]:
# Assuming you have already performed the grid search and stored it in a variable named 'grid_search'
# Access the best estimator
best_estimator = grid_search.best_estimator_

# Assuming the last step of the pipeline is the LGBMRegressor model, access it
model = best_estimator.steps[-1][1]

# Fit the model to your data
#model.fit(X, y)

# Access the feature importances
importances = model.feature_importances_

# Print the importances for each feature
for feature_name, importance in zip(X.columns, importances):
    print(f"Feature: {feature_name}, Importance: {importance}")

Feature: Airline, Importance: 49
Feature: Source, Importance: 383
Feature: Destination, Importance: 290
Feature: Additional_Info, Importance: 108
Feature: Journey_Month_num, Importance: 32
Feature: Journey_Day_num, Importance: 95
Feature: Dep_Hour, Importance: 10
Feature: Arrival_Hour, Importance: 4
Feature: Duration_min, Importance: 487
Feature: Stops, Importance: 332


In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
y_pred = grid_search.best_estimator_.predict(X_test)
from sklearn.metrics import make_scorer, r2_score, mean_squared_error

r2 = r2_score(y_test, y_pred)
print("R2 Score on Validation Set:", r2)

R2 Score on Validation Set: 0.9166912936014738


In [42]:
final_model = grid_search.best_estimator_
final_model

In [44]:
final_model = grid_search.best_estimator_
import joblib
joblib.dump(final_model , "Model.pkl")
joblib.dump(X.columns , "Inputs.pkl")

['Inputs.pkl']

In [45]:
import joblib

# Load the model
loaded_model = joblib.load("Model.pkl")
loaded_model

In [54]:
import streamlit as st
import pandas as pd
import joblib

Inputs = joblib.load("Inputs.pkl")
Model = joblib.load("Model.pkl")

def prediction(Airline, Source, Destination, Dep_Hour, Arrival_Hour, Duration_min, Stops, Additional_Info, Journey_Month_num, Journey_Day_num):
    test_df = pd.DataFrame(columns=Inputs)
    test_df.at[0, "Airline"] = Airline
    test_df.at[0, "Source"] = Source
    test_df.at[0, "Destination"] = Destination
    test_df.at[0, "Dep_Hour"] = Dep_Hour
    test_df.at[0, "Arrival_Hour"] = Arrival_Hour
    test_df.at[0, "Duration_min"] = Duration_min
    test_df.at[0, "Stops"] = Stops
    test_df.at[0, "Additional_Info"] = Additional_Info
    test_df.at[0, "Journey_Month_num"] = Journey_Month_num
    test_df.at[0, "Journey_Day_num"] = Journey_Day_num
    st.dataframe(test_df)
    result = Model.predict(test_df)[0]
    return result

def main():
    st.title("Flight Ticket Price Prediction")
    Airline = st.selectbox("Airline", ['Jet Airways', 'IndiGo', 'Air India', 'Multiple carriers', 'SpiceJet', 'Vistara',
                                       'Air Asia', 'GoAir', 'Multiple carriers Premium economy',
                                       'Jet Airways Business', 'Vistara Premium economy', 'Trujet'])
    Source = st.selectbox("Source", ['Delhi', 'Kolkata', 'Banglore', 'Mumbai', 'Chennai'])
    Destination = st.selectbox("Destination", ['Cochin', 'Banglore', 'Delhi', 'New Delhi', 'Hyderabad', 'Kolkata'])
    Dep_Hour = st.slider("Dep_Hour", min_value=0, max_value=23, value=0, step=1)
    Arrival_Hour = st.slider("Arrival_Hour", min_value=0, max_value=23, value=0, step=1)
    Duration_min = st.slider("Duration_min", min_value=0, max_value=50, value=0, step=1)
    Stops = st.selectbox("Stops", [0, 1, 2, 3])
    Additional_Info = st.selectbox("Additional_Info", ['no info', 'in-flight meal not included',
                                                       'no check-in baggage included', '1 long layover',
                                                       'change airports', 'business class', '1 short layover',
                                                       'red-eye flight', '2 long layover'])
    Journey_Month_num = st.selectbox("Journey_Month_num", [3, 4, 5, 6])
    Journey_Day = st.selectbox("Journey_Day", ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
    Journey_Day_num = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}[Journey_Day]
    
    if st.button("Predict Flight Cost"):
        result = prediction(Airline, Source, Destination, Dep_Hour, Arrival_Hour, Duration_min, Stops, Additional_Info, Journey_Month_num, Journey_Day_num)
        st.text(f"The flight ticket will cost {result} dollars")

if __name__ == '__main__':
    main()
