In [69]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [70]:
df=pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price,duration_hour,duration_min
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,1,5953,2,17
1,SpiceJet,SG-8157,Delhi,Morning,zero,Morning,Mumbai,Economy,1,5953,2,33
2,AirAsia,I5-764,Delhi,Morning,zero,Morning,Mumbai,Economy,1,5956,2,17
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,1,5955,2,25
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,1,5955,2,33


In [71]:
data=df.sample(n=2000,random_state=42)

In [72]:
from sklearn.preprocessing import LabelEncoder

categorical_columns=[]
for i in data.columns:
  if data[i].dtype=='object':
    categorical_columns.append(i)

label_encoders = {} 

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le
 
data.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,days_left,price,duration_hour,duration_min
27131,1,219,2,2,0,2,4,1,40,7366,19,75
266857,5,524,4,2,0,3,5,0,42,64831,9,83
141228,5,532,4,2,0,3,0,1,41,6195,10,5
288329,5,557,1,3,0,2,2,0,14,60160,14,5
97334,1,214,0,0,0,3,5,1,20,6578,8,25


In [73]:
from sklearn.model_selection import train_test_split

X=data.drop('price',axis=1)
y=data['price']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

LINEAR REGRESSION

In [74]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,root_mean_squared_error
model=LinearRegression()
model.fit(X_train,y_train)

y_pred=model.predict(X_test)

print(f'R^2: {r2_score(y_test,y_pred)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred)}')

R^2: 0.9034554998383858
RMSE: 7043.775130184119


RIDGE REGRESSION

In [75]:
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

param_grid={
  'alpha': np.logspace(-3,3,50)
}

grid=GridSearchCV(Ridge(),param_grid,cv=5,scoring='r2')
grid.fit(X_train_scaled,y_train)

y_pred1=grid.best_estimator_.predict(X_test_scaled)

print(f'Best alpha: {grid.best_params_["alpha"]}')
print(f'R^2: {r2_score(y_test,y_pred1)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred1)}')


Best alpha: 4.714866363457395
R^2: 0.9034654736988674
RMSE: 7043.411280121006


LASSO REGRESSION

In [76]:
from sklearn.linear_model import Lasso

param_grid={
  'alpha':np.logspace(-3,1,50)
}

grid=GridSearchCV(Lasso(),param_grid,cv=5,scoring='r2')
grid.fit(X_train_scaled,y_train)

y_pred2=grid.best_estimator_.predict(X_test_scaled)

print(f'Best alpha: {grid.best_params_["alpha"]}')
print(f'R^2: {r2_score(y_test,y_pred2)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred2)}')


Best alpha: 10.0
R^2: 0.9035671413264245
RMSE: 7039.701335179362


KNN

In [77]:
from sklearn.neighbors import KNeighborsRegressor

param_grid={
  'n_neighbors':list(range(1,21)), #no.of nearest neighbors used to make predictions
  'weights':['uniform','distance'],  #how neighbors influence the prediction (uniform--> all neighbors contribute equally, distance → closer neighbors have more influence)
  'p':[1,2]  #p=1 --> Manhattan distance, p=2 --> Euclidean Distance
  }
grid=GridSearchCV(KNeighborsRegressor(),param_grid,cv=5,scoring='r2')
grid.fit(X_train_scaled,y_train)
y_pred3=grid.best_estimator_.predict(X_test_scaled)

print(f'Best params: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred3)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred3)}')


Best params: {'n_neighbors': 5, 'p': 2, 'weights': 'distance'}
R^2: 0.9348686534869874
RMSE: 5785.444636919566


SVM

In [78]:
from sklearn.svm import SVR

param_grid={
  'C':[0.1,1,10,100],  #C --> regularization parameter
  'epsilon':[0.01,0.1,0.5,1],
  'kernel':['linear','rbf']
}
grid=GridSearchCV(SVR(),param_grid,cv=5,scoring='r2')
grid.fit(X_train_scaled,y_train)

y_pred4=grid.best_estimator_.predict(X_test_scaled)

print(f'Best params: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred4)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred4)}')

Best params: {'C': 100, 'epsilon': 0.01, 'kernel': 'linear'}
R^2: 0.9011692941412056
RMSE: 7126.686621675992


Xgboost

In [79]:
from xgboost import XGBRegressor
model6=XGBRegressor(objective='reg:squarederror')
param_grid={
  'max_depth':[2,3,5],
  'learning_rate':[0.01,0.1,0.2],
  'n_estimators':[50,80,100]
  }
grid=GridSearchCV(model6,param_grid,cv=5,scoring='r2')
grid.fit(X_train,y_train)

y_pred6=grid.best_estimator_.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred6)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred6)}')


Best params: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 80}
R^2: 0.9472814202308655
RMSE: 5205.0341796875


RANDOM FOREST REGRESSOR

In [80]:
from sklearn.ensemble import RandomForestRegressor

model8=RandomForestRegressor(random_state=42)
param_grid={
  'n_estimators':[50,100,150],
  'max_depth':[5,10,15],
  'min_samples_split':[2,5,10]
  }

grid=GridSearchCV(model8,param_grid,cv=5,scoring='r2')
grid.fit(X_train,y_train)

y_pred8=grid.best_estimator_.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred8)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred8)}')

Best params: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 150}
R^2: 0.949883244610807
RMSE: 5074.96658291267


GRADIENT BOOST

In [81]:
from sklearn.ensemble import GradientBoostingRegressor
model7=GradientBoostingRegressor()
param_grid={
  'max_depth':[2,3,5],
  'learning_rate':[0.01,0.1,0.2],
  'n_estimators':[50,80,100]
  }
grid=GridSearchCV(model7,param_grid,cv=5,scoring='r2')
grid.fit(X_train,y_train)

y_pred7=grid.best_estimator_.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'R^2: {r2_score(y_test,y_pred7)}')
print(f'RMSE: {root_mean_squared_error(y_test,y_pred7)}')

Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 80}
R^2: 0.9508730181525509
RMSE: 5024.603025275199


In [84]:
with open("model.pkl","wb") as f:
    pickle.dump(model,f)
with open("label_encoders.pkl","wb") as f:
    pickle.dump(label_encoders,f)