In [34]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [35]:
df=pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price,duration_hour,duration_min
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,1,8.691819,2,17
1,SpiceJet,SG-8157,Delhi,Morning,zero,Morning,Mumbai,Economy,1,8.691819,2,33
2,AirAsia,I5-764,Delhi,Morning,zero,Morning,Mumbai,Economy,1,8.692322,2,17
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,1,8.692154,2,25
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,1,8.692154,2,33


In [36]:
data=df.sample(n=2000,random_state=42)

In [37]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {} 

for col in df.select_dtypes(include='object'):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le
 
data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price,duration_hour,duration_min
27131,1,219,2,2,0,2,4,1,40,8.904766,19,75
266857,5,524,4,2,0,3,5,0,42,11.079555,9,83
141228,5,532,4,2,0,3,0,1,41,8.731659,10,5
288329,5,557,1,3,0,2,2,0,14,11.00478,14,5
97334,1,214,0,0,0,3,5,1,20,8.791638,8,25


In [38]:
from sklearn.model_selection import train_test_split

X=data.drop('price',axis=1)
y=data['price']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [39]:
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [40]:
models={
  'Linear Regression': Pipeline([
    ('scaler',StandardScaler()),
    ('model',LinearRegression())
  ]),

  'Ridge Regression': Pipeline([
    ('scaler',StandardScaler()),
    ('model',RidgeCV())
  ]),

  'Lasso Regression': Pipeline([
    ('scaler',StandardScaler()),
    ('model',LassoCV())
  ]),

  'KNN': Pipeline([
    ('scaler',StandardScaler()),
    ('model',KNeighborsRegressor())
  ]),

  'SVM': Pipeline([
    ('scaler',StandardScaler()),
    ('model',SVR())
  ]),

  'Xgboost': Pipeline([
    ('model',XGBRegressor()),
  ]),

  'Random Forest': Pipeline([
    ('model',RandomForestRegressor())
  ]),

  'Gradient Boosting': Pipeline([
    ('model',GradientBoostingRegressor())
  ])
}

In [41]:
result={}
for name,pipeline in models.items():
  model=pipeline.fit(X_train,y_train)
  y_pred=model.predict(X_test)
  result[name]={
    'R^2': r2_score(y_test,y_pred),
    'MAE': mean_absolute_error(y_test,y_pred),
    'MSE': mean_squared_error(y_test,y_pred),
    'RMSE': root_mean_squared_error(y_test,y_pred)
  }

result_df=pd.DataFrame(result).T
result_df.sort_values(by='R^2',ascending=False)

Unnamed: 0,R^2,MAE,MSE,RMSE
Random Forest,0.949048,0.177545,0.063077,0.251152
Gradient Boosting,0.947424,0.187104,0.065088,0.255123
Xgboost,0.946518,0.184138,0.06621,0.257313
SVM,0.928238,0.223369,0.08884,0.298061
KNN,0.905753,0.254742,0.116676,0.341579
Ridge Regression,0.888548,0.287309,0.137975,0.37145
Linear Regression,0.888532,0.287341,0.137996,0.371478
Lasso Regression,0.888469,0.287185,0.138073,0.371582


In [49]:
param_grid={
  'n_estimators':[50,100,150],
  'max_depth':[5,10,15],
  'min_samples_split':[2,5,10],
  'min_samples_leaf': [1,2,4]
}

best_model=RandomForestRegressor()
grid=GridSearchCV(estimator=best_model,param_grid=param_grid,cv=5,scoring='r2')
grid.fit(X_train,y_train)

y_pred_grid=grid.best_estimator_.predict(X_test)

print(f'Best parameters: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred_grid)}')
print(f'MAE: {mean_absolute_error(y_test,y_pred_grid)}')
print(f'MSE: {mean_squared_error(y_test,y_pred_grid)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred_grid)}')

Best parameters: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
R^2: 0.9495182839851449
MAE: 0.17850441892973476
MSE: 0.062495424329965646
RMSE: 0.2499908484924311


In [50]:
import pickle

with open("model.pkl","wb") as f:
    pickle.dump(grid,f)
with open("label_encoders.pkl","wb") as f:
    pickle.dump(label_encoders,f)