In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [None]:
# Read the pre-cleaned airline fares table. index_col=0 turns the first CSV
# column into the DataFrame index instead of keeping it as a feature column.
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv', index_col=0)

# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,Total_Stops,Price,Airline_Air India,Airline_GoAir,Airline_IndiGo,Airline_Jet Airways,Airline_Jet Airways Business,Airline_Multiple carriers,Airline_SpiceJet,Airline_Trujet,...,arrival_timezone_Morning,arrival_timezone_Afternoon,arrival_timezone_Evening,Additional_Info_1 short layover,Additional_Info_2 long layover,Additional_Info_business class,Additional_Info_change airports,Additional_Info_in-flight meal not included,Additional_Info_no check-in baggage included,Additional_Info_red-eye flight
0,0,3897,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,7662,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,2,13882,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,6218,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,13302,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# 'Price' is the regression target; every remaining column is a predictor.
target = data['Price']
features = data.drop(columns=['Price'])

# Hold out 20% of the rows for testing. The fixed seed keeps the split
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces of the split.
for message, part in (
    ("The size of training input is", X_train),
    ("The size of training output is", y_train),
    ("The size of testing input is", X_test),
    ("The size of testing output is", y_test),
):
    print(message, part.shape)

The size of training input is (8546, 41)
The size of training output is (8546,)
The size of testing input is (2137, 41)
The size of testing output is (2137,)


## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares with scikit-learn's default
# hyperparameters. fit() returns the estimator itself, so it can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Predict fares for the held-out test rows.
y_pred = lr.predict(X_test)

# Test-set metrics. R-squared is multiplied by 100 so it prints as a
# percentage; MAPE is left as the fraction returned by scikit-learn.
lr_r2 = r2_score(y_test.values, y_pred) * 100
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
lr_mape = mean_absolute_percentage_error(y_test, y_pred)
lr_mae = mean_absolute_error(y_test, y_pred)

print(f"""
Mean Absolute Error: {lr_mae}
Mean Absolute Percentage Error: {lr_mape}
Root Mean Square Error: {lr_rmse}
R-Squared: {lr_r2}%
""")


Mean Absolute Error: 1784.5199931470474
Mean Absolute Percentage Error: 0.21791151953740756
Root Mean Square Error: 2553.704184268547
R-Squared: 69.18628085306136%



## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

ridge = Ridge()

# Candidate regularisation strengths. The original grid started at alpha=1.0
# and the search selected exactly that value, i.e. the lower edge of the grid,
# which suggests the optimum may lie at a weaker penalty. Three smaller values
# (1e-3, 1e-2, 1e-1) are prepended; every original candidate is kept, so the
# search can only match or improve on the previous cross-validated score.
param_grid = {
    'alpha': np.concatenate([np.logspace(-3, -1, 3), np.logspace(0, 5, 10)])
}

# 5-fold cross-validation, selecting the alpha with the lowest mean absolute
# error (scikit-learn maximises scores, hence the negated metric).
grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Note: despite its name, best_ridge is the fitted GridSearchCV object; its
# predict() delegates to the estimator refit with the best parameters.
best_ridge = grid_search.fit(X_train, y_train)
print(f"Best Parameters: {best_ridge.best_params_}")

# Evaluate the tuned model on the held-out test split.
y_pred = best_ridge.predict(X_test)

ridge_mae = mean_absolute_error(y_test, y_pred)
ridge_mape = mean_absolute_percentage_error(y_test, y_pred)
ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R-squared is scaled by 100 so it prints as a percentage.
ridge_r2 = 100*r2_score(y_test.values, y_pred)

print(f"""
Mean Absolute Error: {ridge_mae}
Mean Absolute Percentage Error: {ridge_mape}
Root Mean Square Error: {ridge_rmse}
R-Squared: {ridge_r2}%
""")

Best Parameters: {'alpha': 1.0}

Mean Absolute Error: 1781.1129235121325
Mean Absolute Percentage Error: 0.2182311361706202
Root Mean Square Error: 2531.998241044975
R-Squared: 69.70787481047658%



## Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

lasso = Lasso()

# Search over the L1 penalty strength and over fitting with or without an
# intercept term.
param_grid = dict(
    alpha=np.logspace(0, 5, 10),
    fit_intercept=[True, False],
)

# 5-fold cross-validation, ranked by (negated) mean absolute error.
grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
)
best_lasso = grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Predict on both splits so training and test error can be compared.
y_pred = best_lasso.predict(X_test)
y_pred_train = best_lasso.predict(X_train)

# Training-set metrics. R-squared here is left as a fraction, unlike the
# test-set value below, which is scaled to a percentage.
lasso_r2_train = r2_score(y_train, y_pred_train)
lasso_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
lasso_mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
lasso_mae_train = mean_absolute_error(y_train, y_pred_train)

print(lasso_mae_train, lasso_mape_train, lasso_rmse_train, lasso_r2_train)

# Test-set metrics.
lasso_r2 = r2_score(y_test.values, y_pred) * 100
lasso_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
lasso_mape = mean_absolute_percentage_error(y_test, y_pred)
lasso_mae = mean_absolute_error(y_test, y_pred)

print(f"""
Mean Absolute Error: {lasso_mae}
Mean Absolute Percentage Error: {lasso_mape}
Root Mean Square Error: {lasso_rmse}
R-Squared: {lasso_r2}%
""")

Best Parameters: {'alpha': 3.5938136638046276, 'fit_intercept': False}
1733.4478579022434 0.21015354803089212 2545.8479696576323 0.6955196592907527

Mean Absolute Error: 1765.1262532067321
Mean Absolute Percentage Error: 0.21243654419317606
Root Mean Square Error: 2541.25047616862
R-Squared: 69.48608797785315%



## Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so the search is reproducible. scikit-learn randomly permutes
# the features at every split, so an unseeded tree can pick different splits
# (and therefore different "best" hyperparameters) on each run. 42 matches
# the seed already used for the train/test split.
dt_regressor = DecisionTreeRegressor(random_state=42)

# Depth and leaf-size limits to search; both constrain how far the tree can
# grow.
param_grid = {
    'max_depth': list(range(3, 30)),
    'min_samples_leaf': [2, 3, 4, 6, 8, 10, 12, 14, 16],
}

# No `scoring` is given, so GridSearchCV ranks candidates with the
# estimator's own score() method (R-squared for regressors). This differs from
# the Ridge and Lasso searches, which rank by mean absolute error.
grid_search = GridSearchCV(estimator=dt_regressor, param_grid=param_grid, cv=5)

# Note: dt_best_model is the fitted GridSearchCV object, not a bare tree.
dt_best_model = grid_search.fit(X_train, y_train)
print(f"Best Parameters: {dt_best_model.best_params_}")

Best Parameters: {'max_depth': 18, 'min_samples_leaf': 3}


In [None]:
# Score the tuned decision tree on the held-out test split.
y_pred = dt_best_model.predict(X_test)

# R-squared is scaled by 100 so it prints as a percentage; the other metrics
# are reported exactly as scikit-learn returns them.
dt_r2 = r2_score(y_test.values, y_pred) * 100
dt_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
dt_mape = mean_absolute_percentage_error(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)

print(f"""
Mean Absolute Error: {dt_mae}
Mean Absolute Percentage Error: {dt_mape}
Root Mean Square Error: {dt_rmse}
R-Squared: {dt_r2}%
""")


Mean Absolute Error: 825.5520985706361
Mean Absolute Percentage Error: 0.08970409828139436
Root Mean Square Error: 1948.9686365130492
R-Squared: 82.05214119074773%



## Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Narrow grid: only max_depth and min_samples_split vary; the remaining
# entries are single-valued and therefore fixed for every candidate.
param_grid = {
    'n_estimators': [500],
    'max_depth': [17, 19, 20],
    'min_samples_split': [10, 11, 12],
    'min_samples_leaf': [1],
    'max_features': ['sqrt'],
    'bootstrap': [True],
}

# Seed the forest so the search is reproducible. Bootstrap sampling and the
# per-split feature subsampling ('sqrt') are both random, so an unseeded
# forest gives different scores and possibly different best parameters on
# every run. 42 matches the seed already used for the train/test split.
rf = RandomForestRegressor(random_state=42)

# No `scoring` is given, so candidates are ranked with the estimator's own
# score() method (R-squared for regressors) under 5-fold cross-validation.
grid_search = GridSearchCV(rf, param_grid, cv=5)

# Note: best_rf is the fitted GridSearchCV object, not a bare forest.
best_rf = grid_search.fit(X_train, y_train)
print(f"Best Parameters: {best_rf.best_params_}")

Best Parameters: {'bootstrap': True, 'max_depth': 19, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 400}


In [None]:
# Predict on both splits so training and test error can be compared.
y_pred = best_rf.predict(X_test)
y_pred_train = best_rf.predict(X_train)

# Training-set metrics. R-squared here is left as a fraction, unlike the
# test-set value below, which is scaled to a percentage.
rf_r2_train = r2_score(y_train, y_pred_train)
rf_rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rf_mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
rf_mae_train = mean_absolute_error(y_train, y_pred_train)

print(rf_mae_train, rf_mape_train, rf_rmse_train, rf_r2_train)

# Test-set metrics.
rf_r2 = r2_score(y_test.values, y_pred) * 100
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rf_mape = mean_absolute_percentage_error(y_test, y_pred)
rf_mae = mean_absolute_error(y_test, y_pred)

print(f"""
Mean Absolute Error: {rf_mae}
Mean Absolute Percentage Error: {rf_mape}
Root Mean Square Error: {rf_rmse}
R-Squared: {rf_r2}%
""")

852.1572159566828 0.10012601498450763 1472.4550618914136 0.8981459212685691

Mean Absolute Error: 1042.4646555870218
Mean Absolute Percentage Error: 0.122726691855556
Root Mean Square Error: 1731.226998288715
R-Squared: 85.83844393003376%



In [None]:
# Gather the test-set metrics of all five models, one list per metric, in the
# same model order as the 'Model' column. The R^2 values are the percentages
# computed in the evaluation cells; MAPE values are fractions.
results = {
    'Model': [
        'Linear Regression',
        'Ridge',
        'Lasso',
        'Decision Tree',
        'Random Forest',
    ],
    'MAE': [lr_mae, ridge_mae, lasso_mae, dt_mae, rf_mae],
    'MAPE': [lr_mape, ridge_mape, lasso_mape, dt_mape, rf_mape],
    'RMSE': [lr_rmse, ridge_rmse, lasso_rmse, dt_rmse, rf_rmse],
    'R^2 Score': [lr_r2, ridge_r2, lasso_r2, dt_r2, rf_r2],
}

# One row per model, one column per metric.
df_results = pd.DataFrame(data=results)

# Leave the frame as the cell's last expression so it renders as a table.
df_results

Unnamed: 0,Model,MAE,MAPE,RMSE,R^2 Score
0,Linear Regression,1784.519993,0.217912,2553.704184,69.186281
1,Ridge,1781.112924,0.218231,2531.998241,69.707875
2,Lasso,1765.126253,0.212437,2541.250476,69.486088
3,Decision Tree,825.552099,0.089704,1948.968637,82.052141
4,Random Forest,1042.464656,0.122727,1731.226998,85.838444


In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so importing it raises ImportError on current releases. joblib is a
# standalone package (installed as a scikit-learn dependency); import it
# directly.
import joblib

# Save to file in the current working directory.
joblib_file = "airfare_model.pkl"

# best_rf is the fitted GridSearchCV wrapper from the Random Forest search, so
# the whole search object is persisted and its predict() is available after
# loading. Only load this file from a trusted source: joblib files are
# pickle-based and can execute arbitrary code when loaded.
joblib.dump(best_rf, joblib_file)
print("Model saved!")