In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Read the flight dataset used for modelling and preview its first rows.
csv_path = "../mydata/datafinal.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Airline,stops,class,depature time,arrival time,Price,Days Left,Day of Week,Duration_log,Source_CMN,Source_IST,Source_LAX,Source_NRT,Source_PAR,Destination_CMN,Destination_IST,Destination_LAX,Destination_NRT,Destination_PAR
0,12,2,1,2,4,6.987213,36,2,7.573017,True,False,False,False,False,False,False,False,True,False
1,36,1,1,1,5,5.858476,10,4,7.150701,False,True,False,False,False,True,False,False,False,False
2,78,0,1,4,0,6.403111,34,0,5.686975,False,True,False,False,False,True,False,False,False,False
3,83,1,0,4,4,8.327777,12,6,6.872128,False,False,False,False,True,False,False,False,True,False
4,18,3,1,4,5,6.653082,30,3,7.420579,False,False,True,False,False,False,True,False,False,False


In [3]:
# Separate the predictors (every column except 'Price') from the regression target.
X = data.drop(columns=['Price'])
Y = data['Price']

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Candidate regressors, keyed by the label used in the result tables.
# The tree-based estimators are stochastic (bootstrap sampling, feature
# permutation at each split), so each one gets the same seed as the train/test
# split; otherwise the scores change on every re-run of the notebook.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Randomforest': RandomForestRegressor(random_state=42),
    'xgboost': XGBRegressor(random_state=42),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
}


In [9]:
# Fit every candidate on the training split and score it on the held-out split.
# Note: fit() is called on the estimators stored in `models`, so they are left
# trained after this cell.
results = []
for model_name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    results.append(
        {
            'Model': model_name,
            'MSE': mean_squared_error(y_test, predictions),
            'MAE': mean_absolute_error(y_test, predictions),
            'R2 Score': r2_score(y_test, predictions),
        }
    )

# One row per model, columns in the order of the dict keys above.
results_df = pd.DataFrame(results)

# Plain-text table of the baseline scores.
print(results_df)

              Model       MSE       MAE  R2 Score
0  LinearRegression  0.195606  0.351579  0.784013
1             Ridge  0.195605  0.351577  0.784013
2      Randomforest  0.025871  0.099597  0.971433
3           xgboost  0.029922  0.124729  0.966960
4      DecisionTree  0.044453  0.110299  0.950915


 Random Forest and XGBoost appear to be the best-performing models based on the provided evaluation metrics, with Random Forest slightly edging out XGBoost in terms of MSE and MAE.

Perform a grid search (GridSearchCV) to choose better hyperparameters

In [10]:
# Search space per model label. An empty dict means "no hyperparameters to
# vary": the search then evaluates the estimator's current settings only.
# Labels missing from this mapping are left out of the search entirely.
param_grid = {
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 0.5, 1.0]},
    'Randomforest': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'xgboost': {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7]}
}

# Run a 5-fold grid search for each model that has a search space, keep the
# refitted best estimator, and score it on the held-out test split.
best_models = {}
results = []
for model_name, estimator in models.items():
    if model_name not in param_grid:
        continue  # no search space defined for this model

    grid_search = GridSearchCV(
        estimator,
        param_grid[model_name],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    tuned = grid_search.best_estimator_
    best_models[model_name] = tuned

    predictions = tuned.predict(X_test)
    results.append(
        {
            'Model': model_name,
            'MSE': mean_squared_error(y_test, predictions),
            'MAE': mean_absolute_error(y_test, predictions),
            'R2 Score': r2_score(y_test, predictions),
        }
    )

# One row per tuned model.
grid_results_df = pd.DataFrame(results)

# Plain-text table of the tuned scores.
print(grid_results_df)

              Model       MSE       MAE  R2 Score
0  LinearRegression  0.195606  0.351579  0.784013
1             Ridge  0.195605  0.351577  0.784013
2      Randomforest  0.025984  0.099695  0.971308
3           xgboost  0.025826  0.110790  0.971483


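Linear Regression and Ridge Regression are unchanged. Linear Regression was given an empty parameter grid, so nothing was actually tuned, and the best Ridge alpha found in the grid gives the same scores as the default, so tuning did not affect their performance.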
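XGBoost improves noticeably after tuning (MSE 0.029922 → 0.025826, MAE 0.124729 → 0.110790, R2 0.966960 → 0.971483). Random Forest is essentially unchanged: its MSE and MAE are marginally higher than in the untuned run (0.025871 → 0.025984 and 0.099597 → 0.099695), a difference small enough that it is likely run-to-run noise. Both models keep R2 scores above 0.97, indicating good performance.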


********************************************************************************************************************

We choose Random Forest. Why?

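After tuning, the two models are practically tied on MSE and R2, and Random Forest has the lower MAE (0.0997 vs 0.1108). Random Forest also reached this level with its default settings, whereas XGBoost needed the grid search to catch up. Random Forest is an ensemble method based on independently trained decision trees, and it typically requires fewer hyperparameters to tune than XGBoost, which simplifies the model training process and reduces tuning overhead. In that sense it is the more lightweight model to work with. Note, however, that the individual trees in a Random Forest are usually grown deep while boosted trees are usually shallow, so a Random Forest is not necessarily smaller or faster at prediction time; model size and latency should be measured if they matter for deployment.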