In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import csv

In [None]:
# Load the data
# Reads the CSV at the absolute path /content/Data_subset1.csv into the DataFrame `data`.
data = pd.read_csv('/content/Data_subset1.csv')

In [None]:
# Display the loaded DataFrame to inspect its columns and values.
data

Unnamed: 0,ID,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,b183,b184,b185,b186,b187,b188,b189,b190,b191,Turbidity
0,1,0.081264,0.081630,0.080447,0.084221,0.085154,0.088562,0.084579,0.090803,0.094506,...,0.021204,0.020077,0.022498,0.019879,0.012558,0.016607,0.019179,0.014887,0.016105,17.32
1,2,0.081264,0.081630,0.080447,0.084221,0.085154,0.088562,0.084579,0.090803,0.094506,...,0.021204,0.020077,0.022498,0.019879,0.012558,0.016607,0.019179,0.014887,0.016105,20.48
2,3,0.081264,0.081630,0.080447,0.084221,0.085154,0.088562,0.084579,0.090803,0.094506,...,0.021204,0.020077,0.022498,0.019879,0.012558,0.016607,0.019179,0.014887,0.016105,17.19
3,4,0.081264,0.081630,0.080447,0.084221,0.085154,0.088562,0.084579,0.090803,0.094506,...,0.021204,0.020077,0.022498,0.019879,0.012558,0.016607,0.019179,0.014887,0.016105,17.54
4,5,0.081264,0.081630,0.080447,0.084221,0.085154,0.088562,0.084579,0.090803,0.094506,...,0.021204,0.020077,0.022498,0.019879,0.012558,0.016607,0.019179,0.014887,0.016105,19.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,812,0.078206,0.079817,0.077016,0.080315,0.082159,0.085170,0.083653,0.088803,0.091550,...,0.020793,0.023350,0.024111,0.017627,0.016759,0.018555,0.013365,0.019103,0.016531,31.06
812,813,0.078206,0.079817,0.077016,0.080315,0.082159,0.085170,0.083653,0.088803,0.091550,...,0.020793,0.023350,0.024111,0.017627,0.016759,0.018555,0.013365,0.019103,0.016531,52.08
813,814,0.076448,0.077887,0.076394,0.080611,0.081132,0.084687,0.084073,0.087379,0.089745,...,0.026607,0.020016,0.020808,0.018099,0.016866,0.012725,0.012649,0.026592,0.014826,641.01
814,815,0.078206,0.079817,0.077016,0.080315,0.082159,0.085170,0.083653,0.088803,0.091550,...,0.020793,0.023350,0.024111,0.017627,0.016759,0.018555,0.013365,0.019103,0.016531,304.22


In [None]:
# Split the data into features (X) and target (y).
# Columns are selected by name instead of by position: the positional slices
# 1:192 / 192 only line up with the spectral bands when the CSV carries exactly
# one leading bookkeeping column. With an extra index column such as
# `Unnamed: 0` they silently shift, pulling `ID` into X and a band into y.
non_feature_columns = ('Unnamed: 0', 'ID', 'Turbidity')
feature_columns = [col for col in data.columns if col not in non_feature_columns]
X = data[feature_columns].values   # band columns only, as a NumPy array
y = data['Turbidity'].values       # regression target, as a NumPy array

In [None]:
# Split the data into training and testing sets
# test_size=0.3 holds out 30% of the rows for testing; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Regressors to benchmark. Each one is left at its library defaults apart from
# the fixed seeds (and the explicit 100 trees for the random forest).
regressors = [
    RandomForestRegressor(n_estimators=100, random_state=0),
    DecisionTreeRegressor(random_state=0),
    LinearRegression(),
    Ridge(),
    Lasso(),
    SVR(),
    KNeighborsRegressor(),
    XGBRegressor(random_state=0),
    LGBMRegressor(random_state=0),
]

In [None]:
# Fit every benchmark regressor on the training split and report its
# hold-out error metrics.
for regressor in regressors:
    # fit() returns the estimator itself, so training and prediction chain
    y_pred = regressor.fit(X_train, y_train).predict(X_test)

    # Hold-out metrics; MAPE is expressed in percent
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2 = r2_score(y_test, y_pred)

    # Model name, one labelled line per metric, then a separator
    print(regressor.__class__.__name__)
    for label, value in (('MSE:', mse), ('RMSE:', rmse), ('MAPE:', mape), ('R-squared:', r2)):
        print(label, value)
    print('-' * 50)



RandomForestRegressor
MSE: 1423.7220577077308
RMSE: 37.73224162049918
MAPE: 28.173983034533144
R-squared: -1.3523145061080313
--------------------------------------------------
DecisionTreeRegressor
MSE: 3538.463753601695
RMSE: 59.48498763218914
MAPE: 25.542906682390747
R-squared: -4.846351520560238
--------------------------------------------------
LinearRegression
MSE: 315.01844966342054
RMSE: 17.748759102073038
MAPE: 50.479693400641125
R-squared: 0.479517462819946
--------------------------------------------------
Ridge
MSE: 584.6792748586037
RMSE: 24.18014215960286
MAPE: 37.027788137972465
R-squared: 0.03397609651071576
--------------------------------------------------
Lasso
MSE: 611.5492691586634
RMSE: 24.729522218568302
MAPE: 35.9729377145991
R-squared: -0.01041928040213258
--------------------------------------------------
SVR
MSE: 622.7662712418982
RMSE: 24.95528543699507
MAPE: 9.969652291886355
R-squared: -0.02895233365686667
--------------------------------------------------

In [None]:
# Re-fit every regressor and write one row of hold-out metrics per model
# to regressor_scores.csv (header row first).
with open('regressor_scores.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Regressor', 'MSE', 'RMSE', 'MAPE', 'R-squared'])
    for regressor in regressors:
        # fit() returns the estimator, so fit and predict can be chained
        y_pred = regressor.fit(X_train, y_train).predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        r2 = r2_score(y_test, y_pred)
        row = [regressor.__class__.__name__, mse, rmse, mape, r2]
        writer.writerow(row)





In [None]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

ModuleNotFoundError: ignored

In [None]:
# Load the dataset at /content/Data_subset1.csv into the DataFrame `df`.
df = pd.read_csv('/content/Data_subset1.csv')

In [None]:
# Features and target are picked by column name instead of by position: the
# positional slices 1:192 / 192 only match the band columns when the frame has
# exactly one leading bookkeeping column, and shift silently when an extra
# index column such as `Unnamed: 0` is present.
non_feature_columns = ('Unnamed: 0', 'ID', 'Turbidity')
feature_columns = [col for col in df.columns if col not in non_feature_columns]
X = df[feature_columns].values   # band columns only, as a NumPy array
y = df['Turbidity'].values       # regression target, as a NumPy array

In [None]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Instantiate the ten regressors that will be compared. Every estimator that
# accepts a seed is pinned to 42.

# Tree ensembles and a single tree
rf = RandomForestRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
dt = DecisionTreeRegressor(random_state=42)

# Gradient-boosting libraries
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
cat = CatBoostRegressor(random_state=42)

# Linear, kernel and neural models
lr = LinearRegression()
svm = SVR(kernel='linear')
mlp = MLPRegressor(random_state=42)


In [None]:
# Train every model on the training split, in the same order as before.
for model in (rf, lr, dt, svm, xgb, lgbm, cat, mlp, gb):
    model.fit(X_train, y_train)

# Kept as the final expression so the cell still displays the fitted estimator
et.fit(X_train, y_train)

Learning rate set to 0.038041
0:	learn: 46.2867409	total: 77.7ms	remaining: 1m 17s
1:	learn: 45.8676621	total: 121ms	remaining: 1m
2:	learn: 45.4465150	total: 165ms	remaining: 54.7s
3:	learn: 45.1233560	total: 215ms	remaining: 53.6s
4:	learn: 44.6415861	total: 261ms	remaining: 51.9s
5:	learn: 44.2493014	total: 304ms	remaining: 50.4s
6:	learn: 43.8604404	total: 349ms	remaining: 49.5s
7:	learn: 43.4709588	total: 397ms	remaining: 49.2s
8:	learn: 43.2879785	total: 449ms	remaining: 49.5s
9:	learn: 42.9100197	total: 496ms	remaining: 49.1s
10:	learn: 42.5442919	total: 545ms	remaining: 49s
11:	learn: 42.0976163	total: 602ms	remaining: 49.6s
12:	learn: 41.7342188	total: 649ms	remaining: 49.3s
13:	learn: 41.4176533	total: 705ms	remaining: 49.6s
14:	learn: 41.0768463	total: 753ms	remaining: 49.4s
15:	learn: 40.7289139	total: 800ms	remaining: 49.2s
16:	learn: 40.3383005	total: 865ms	remaining: 50s
17:	learn: 39.9650022	total: 914ms	remaining: 49.9s
18:	learn: 39.6596539	total: 968ms	remaining: 50s

ExtraTreesRegressor(random_state=42)

In [None]:
# Collect a per-model "importance" object from every fitted estimator.
rf_feature_importances = rf.feature_importances_
lr_feature_importances = lr.coef_
dt_feature_importances = dt.feature_importances_

# SVR exposes `coef_` only for a linear kernel and never has a
# `feature_importances_` attribute, so other kernels get None instead of
# raising AttributeError.
svm_feature_importances = svm.coef_ if svm.kernel == 'linear' else None

xgb_feature_importances = xgb.feature_importances_
lgbm_feature_importances = lgbm.feature_importances_
cat_feature_importances = cat.feature_importances_

# NOTE: `coefs_` is the list of per-layer weight matrices of the network,
# not one score per input feature.
mlp_feature_importances = mlp.coefs_

gb_feature_importances = gb.feature_importances_
et_feature_importances = et.feature_importances_

In [None]:
# Store the feature importances in a dataframe for comparison
def _as_plain_lists(values):
    """Return `values` as (nested) plain Python lists, or None if it is None.

    NumPy arrays written to CSV through their string form are line-wrapped
    and, once large enough, abbreviated with '...', which loses data.
    Plain lists are always written out in full.
    """
    if values is None:
        return None
    if hasattr(values, 'tolist'):
        # ndarray (any shape) -> nested list of Python scalars
        return values.tolist()
    # list/tuple of arrays (e.g. per-layer MLP weights): convert each element
    return [_as_plain_lists(item) for item in values]


# Label -> importance object, in the order the rows are written
importances_by_algorithm = {
    'Random Forest': rf_feature_importances,
    'Linear Regression': lr_feature_importances,
    'Decision Tree': dt_feature_importances,
    'Support Vector Machine': svm_feature_importances,
    'XGBoost': xgb_feature_importances,
    'LightGBM': lgbm_feature_importances,
    'CatBoost': cat_feature_importances,
    'MLP': mlp_feature_importances,
    'Gradient Boosting': gb_feature_importances,
    'Extra Trees': et_feature_importances,
}

feature_importances_df = pd.DataFrame({
    'Algorithm': list(importances_by_algorithm),
    'Feature Importances': [_as_plain_lists(values) for values in importances_by_algorithm.values()],
})

# Save feature importances to a CSV file (one row per algorithm)
feature_importances_df.to_csv('feature_importances.csv', index=False)




In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import csv

In [None]:
# Features: every column except the first and the last; target: the last column.
# Both are read from `df` so X and y always come from the same frame
# (`data` is a separate global that is reloaded from another file later on).
# NOTE(review): assumes the first column is the only non-feature column - confirm against the CSV.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

In [None]:
# Random forest (100 trees, fixed seed) fitted on the full X / y.
# fit() returns the estimator, so construction and training are chained.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)

# One importance value per input column
importances = rf.feature_importances_

# In-sample performance: predictions are made on the rows used for fitting
y_pred = rf.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('RF_feature_importances.csv', index=True)



In [None]:
svr = SVR(kernel='linear')

# Fit the model on the full X / y
svr.fit(X, y)

# Absolute linear coefficients serve as feature importances.
# They are kept under a dedicated name so this cell does not overwrite the
# global `importances` that the following cells pair with random-forest metrics.
svr_importances = np.abs(svr.coef_[0])

# Print feature importances
for i, importance in enumerate(svr_importances):
    print('Feature %d: %.3f' % (i+1, importance))

# Calculate performance metrics (in-sample: predictions on the fitting data)
y_pred = svr.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# Save feature importances and performance metrics in a CSV file.
# The original cell stopped at a dangling `metrics =` (a SyntaxError); it is
# completed with the same layout and file name as the later SVR cell.
metrics = pd.DataFrame({'Feature Importances': svr_importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('svr_metrics.csv', index=True)



In [None]:
# Recompute the metrics of the fitted random forest `rf` on X / y
# (in-sample: these are the same rows the model was fitted on).
y_pred = rf.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # root mean squared error
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100  # mean absolute percentage error, in percent
r2 = r2_score(y, y_pred)

In [None]:
# Save feature importances and performance metrics in a CSV file.
# The importances are read straight from `rf` instead of the shared global
# `importances`, which other cells reassign, so the file always pairs the
# random-forest metrics with the random-forest importances.
metrics = pd.DataFrame({'Feature Importances': rf.feature_importances_, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('metrics.csv', index=True)

In [None]:
# Decision tree with a fixed seed, fitted on the full X / y
# (fit() returns the estimator, so the two steps are chained).
dt = DecisionTreeRegressor(random_state=42).fit(X, y)

# One importance value per input column
importances = dt.feature_importances_

# Print feature importances, numbered from 1
for feature_number, importance in enumerate(importances, start=1):
    print('Feature %d: %.3f' % (feature_number, importance))

# In-sample performance metrics (predictions on the fitting data)
y_pred = dt.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

# One row per feature; the scalar metrics are repeated on every row
summary_columns = {
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
}
metrics = pd.DataFrame(summary_columns)
metrics.to_csv('decision_tree_metrics.csv', index=True)

Feature 1: 0.000
Feature 2: 0.000
Feature 3: 0.000
Feature 4: 0.000
Feature 5: 0.000
Feature 6: 0.000
Feature 7: 0.000
Feature 8: 0.000
Feature 9: 0.000
Feature 10: 0.000
Feature 11: 0.001
Feature 12: 0.012
Feature 13: 0.000
Feature 14: 0.000
Feature 15: 0.000
Feature 16: 0.000
Feature 17: 0.000
Feature 18: 0.000
Feature 19: 0.000
Feature 20: 0.000
Feature 21: 0.000
Feature 22: 0.000
Feature 23: 0.000
Feature 24: 0.001
Feature 25: 0.000
Feature 26: 0.000
Feature 27: 0.000
Feature 28: 0.000
Feature 29: 0.000
Feature 30: 0.000
Feature 31: 0.000
Feature 32: 0.000
Feature 33: 0.000
Feature 34: 0.000
Feature 35: 0.000
Feature 36: 0.000
Feature 37: 0.000
Feature 38: 0.000
Feature 39: 0.000
Feature 40: 0.000
Feature 41: 0.000
Feature 42: 0.000
Feature 43: 0.000
Feature 44: 0.000
Feature 45: 0.165
Feature 46: 0.000
Feature 47: 0.000
Feature 48: 0.001
Feature 49: 0.000
Feature 50: 0.000
Feature 51: 0.000
Feature 52: 0.001
Feature 53: 0.000
Feature 54: 0.000
Feature 55: 0.000
Feature 56: 0.000
F

In [None]:
# Linear-kernel support vector regressor fitted on the full X / y
svr = SVR(kernel='linear').fit(X, y)

# Feature importances: absolute value of each linear coefficient
importances = np.abs(svr.coef_[0])

# Print feature importances, numbered from 1
for feature_number, importance in enumerate(importances, start=1):
    print('Feature %d: %.3f' % (feature_number, importance))

# In-sample performance metrics (predictions on the fitting data)
y_pred = svr.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('svr_metrics.csv', index=True)




Feature 1: 0.145
Feature 2: 0.126
Feature 3: 0.167
Feature 4: 0.153
Feature 5: 0.166
Feature 6: 0.207
Feature 7: 0.167
Feature 8: 0.133
Feature 9: 0.143
Feature 10: 0.163
Feature 11: 0.184
Feature 12: 0.142
Feature 13: 0.165
Feature 14: 0.197
Feature 15: 0.225
Feature 16: 0.210
Feature 17: 0.222
Feature 18: 0.245
Feature 19: 0.177
Feature 20: 0.171
Feature 21: 0.169
Feature 22: 0.173
Feature 23: 0.158
Feature 24: 0.165
Feature 25: 0.126
Feature 26: 0.083
Feature 27: 0.096
Feature 28: 0.115
Feature 29: 0.022
Feature 30: 0.136
Feature 31: 0.284
Feature 32: 0.275
Feature 33: 0.384
Feature 34: 0.304
Feature 35: 0.339
Feature 36: 0.328
Feature 37: 0.358
Feature 38: 0.417
Feature 39: 0.449
Feature 40: 0.431
Feature 41: 0.446
Feature 42: 0.523
Feature 43: 0.498
Feature 44: 0.484
Feature 45: 0.517
Feature 46: 0.547
Feature 47: 0.526
Feature 48: 0.512
Feature 49: 0.607
Feature 50: 0.606
Feature 51: 0.453
Feature 52: 0.719
Feature 53: 0.555
Feature 54: 0.569
Feature 55: 0.462
Feature 56: 0.541
F

In [None]:
# XGBoost regressor (100 trees, fixed seed) fitted on the full X / y
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X, y)

# One importance value per input column
importances = xgb.feature_importances_

# Print feature importances, numbered from 1
for feature_number, importance in enumerate(importances, start=1):
    print('XGB Feature %d: %.3f' % (feature_number, importance))

# In-sample performance metrics (predictions on the fitting data)
y_pred = xgb.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

# One row per feature; the scalar metrics are repeated on every row
summary_columns = {
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
}
metrics = pd.DataFrame(summary_columns)
metrics.to_csv('xgb_metrics.csv', index=True)


XGB Feature 1: 0.018
XGB Feature 2: 0.000
XGB Feature 3: 0.005
XGB Feature 4: 0.000
XGB Feature 5: 0.000
XGB Feature 6: 0.000
XGB Feature 7: 0.000
XGB Feature 8: 0.000
XGB Feature 9: 0.000
XGB Feature 10: 0.000
XGB Feature 11: 0.000
XGB Feature 12: 0.000
XGB Feature 13: 0.000
XGB Feature 14: 0.000
XGB Feature 15: 0.000
XGB Feature 16: 0.000
XGB Feature 17: 0.000
XGB Feature 18: 0.000
XGB Feature 19: 0.002
XGB Feature 20: 0.000
XGB Feature 21: 0.000
XGB Feature 22: 0.000
XGB Feature 23: 0.000
XGB Feature 24: 0.000
XGB Feature 25: 0.000
XGB Feature 26: 0.000
XGB Feature 27: 0.000
XGB Feature 28: 0.000
XGB Feature 29: 0.001
XGB Feature 30: 0.000
XGB Feature 31: 0.000
XGB Feature 32: 0.000
XGB Feature 33: 0.000
XGB Feature 34: 0.000
XGB Feature 35: 0.002
XGB Feature 36: 0.000
XGB Feature 37: 0.002
XGB Feature 38: 0.000
XGB Feature 39: 0.002
XGB Feature 40: 0.000
XGB Feature 41: 0.000
XGB Feature 42: 0.000
XGB Feature 43: 0.000
XGB Feature 44: 0.005
XGB Feature 45: 0.001
XGB Feature 46: 0.0

In [None]:
# LightGBM regressor (100 trees, fixed seed) fitted on the full X / y
lgbm = LGBMRegressor(n_estimators=100, random_state=42)
lgbm.fit(X, y)

# One importance value per input column
importances = lgbm.feature_importances_

# Print feature importances, numbered from 1
for feature_number, importance in enumerate(importances, start=1):
    print('LGBM Feature %d: %.3f' % (feature_number, importance))

# In-sample performance metrics (predictions on the fitting data)
y_pred = lgbm.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('lgbm_feature_importance.csv', index=True)

In [None]:
# CatBoost regressor: 100 iterations, learning rate 0.1, fixed seed.
# verbose=False is passed to fit() to turn off the training log.
cb = CatBoostRegressor(iterations=100, learning_rate=0.1, random_seed=42)
cb.fit(X, y, verbose=False)

# One importance value per input column
importances = cb.feature_importances_

# In-sample performance metrics (predictions on the fitting data)
y_pred = cb.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

# One row per feature; the scalar metrics are repeated on every row
summary_columns = {
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
}
metrics = pd.DataFrame(summary_columns)
metrics.to_csv('catboost_feature_importance.csv', index=True)

In [None]:
# Gradient boosting regressor (100 stages, fixed seed) fitted on the full X / y.
# fit() returns the estimator, so construction and training are chained.
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X, y)

# One importance value per input column
importances = gbr.feature_importances_

# In-sample performance metrics (predictions on the fitting data)
y_pred = gbr.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('gbr_feature_importance.csv', index=True)

In [None]:
# Extra-trees regressor (100 trees, fixed seed) fitted on the full X / y.
# fit() returns the estimator, so construction and training are chained.
etr = ExtraTreesRegressor(n_estimators=100, random_state=42).fit(X, y)

# One importance value per input column
importances = etr.feature_importances_

# In-sample performance metrics (predictions on the fitting data)
y_pred = etr.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

# One row per feature; the scalar metrics are repeated on every row
summary_columns = {
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
}
metrics = pd.DataFrame(summary_columns)
metrics.to_csv('etr_feature_importance.csv', index=True)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Create a k-nearest-neighbours regressor model (5 neighbours)
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the full X / y
knn.fit(X, y)

# This cell derives no importances from the KNN model; a zero placeholder
# (one entry per column of X) keeps the CSV layout identical to the other cells.
importances = np.zeros(X.shape[1])


# Calculate performance metrics from the KNN predictions.
# (The original cell scored `lgbm.predict(X)`, i.e. the LightGBM model.)
y_pred = knn.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('knn_feature_importance.csv', index=True)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Use PCA to reduce the number of features to 10 components.
# random_state pins the result in case PCA's automatic solver choice picks a
# randomized SVD for this data size.
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X)

# A fresh tree is fitted on the components so the global `dt` (fitted on the
# original features) is not silently replaced by a model of different inputs.
dt_pca = DecisionTreeRegressor(random_state=42)
dt_pca.fit(X_pca, y)

# One importance value per principal component (not per original feature)
importances_pca = dt_pca.feature_importances_

# Calculate performance metrics (in-sample: predictions on the fitting data)
y_pred = dt_pca.predict(X_pca)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# One row per component; the scalar metrics are repeated on every row
metrics = pd.DataFrame({'Feature Importances': importances_pca, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('pca_dt_feature_importance.csv', index=True)

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around a decision tree, keeping the top n features
n = 10 # choose the number of top features to select
rfe = RFE(
    estimator=DecisionTreeRegressor(random_state=42),
    n_features_to_select=n,
).fit(X, y)

# Ranking of every input feature as produced by RFE.
# NOTE: this is a rank, yet it is stored under the 'Feature Importances' column below.
rfe_ranking = rfe.ranking_

# In-sample performance metrics (predictions on the fitting data)
y_pred = rfe.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': rfe_ranking,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('recursive_feature_elimination_feature_importance.csv', index=True)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the already-fitted random forest `rf`, evaluated on X / y
permutation_importances = permutation_importance(rf, X, y, random_state=42)

# Mean importance per feature across the permutation repeats
importances = permutation_importances.importances_mean

# In-sample performance metrics of `rf` (predictions on X itself)
y_pred = rf.predict(X)
r2 = r2_score(y, y_pred)
mae = mean_absolute_error(y, y_pred)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mape = np.mean(np.abs((y - y_pred) / y)) * 100

# One row per feature; the scalar metrics are repeated on every row
summary_columns = {
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
}
metrics = pd.DataFrame(summary_columns)
metrics.to_csv('permutation_importance_feature_importance.csv', index=True)

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic-net linear model with a fixed seed; all other settings are left at their defaults.
en = ElasticNet(random_state=42)

In [None]:
# Fit the elastic net on the full X / y
en.fit(X, y)

# Feature importances: absolute value of each linear coefficient (not normalised)
importances = np.abs(en.coef_)

# In-sample performance metrics (predictions on the fitting data)
y_pred = en.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar metrics are repeated on every row
metrics = pd.DataFrame({
    'Feature Importances': importances,
    'MSE': mse,
    'RMSE': rmse,
    'MAE': mae,
    'MAPE': mape,
    'R2': r2,
})
metrics.to_csv('Elastic_Net_feature_importance.csv', index=True)

In [None]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (575 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.9/575.9 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [None]:
# Rebind `data` to the contents of /content/data_subset.csv
# (a different file name from the Data_subset1.csv loaded earlier in the notebook).
data = pd.read_csv('/content/data_subset.csv')

In [None]:
# Split the dataset into input features and target variable
# Selection is positional, so it relies on the column order of the CSV.
X = data.iloc[:, 1:-1] # all columns except the last and first one
y = data.iloc[:,-1] # the last column as the target variable


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [None]:
# Univariate feature selection scored with f_regression
selector = SelectKBest(score_func=f_regression, k=10) # select top 10 features
# Fit the selector on X / y and keep only the selected columns
X_selected = selector.fit_transform(X, y)

In [None]:
# Random forest with 100 trees and a fixed seed (rebinds the global `rf`)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the selected features only (all rows; no hold-out split in this cell)
rf.fit(X_selected, y)

RandomForestRegressor(random_state=42)

In [None]:
# Feature importances of the fitted forest, one per column of the data it was fitted on
importances = rf.feature_importances_

In [None]:
# In-sample performance: predictions are made on the same X_selected / y used for fitting
y_pred = rf.predict(X_selected)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)  # root mean squared error
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100  # mean absolute percentage error, in percent
r2 = r2_score(y, y_pred)

In [None]:
# One row per importance value; the scalar metrics are repeated on every row.
# NOTE(review): 'metrics.csv' is also the output name used by other cells in this
# notebook, so whichever cell runs last overwrites the file.
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('metrics.csv', index=False)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Create a random forest regressor that splits on the Friedman MSE criterion
# (a regression criterion, not Gini impurity).
# max_features=1.0 considers every feature at each split, which is what the
# deprecated 'auto' alias meant for regressors; 'auto' emits a warning on
# scikit-learn 1.1+ and is rejected by later releases.
rf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_features=1.0, random_state=42)

# Fit the model on the full X / y
rf.fit(X, y)

# Calculate feature importances (one value per input column)
importances = rf.feature_importances_

# Print feature importances
for i, importance in enumerate(importances):
    print('Feature %d: %.3f' % (i+1, importance))

# Calculate performance metrics (in-sample: predictions on the fitting data)
y_pred = rf.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# Save feature importances and performance metrics in a CSV file
# (one row per feature; the scalar metrics are repeated on every row)
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('metrics.csv', index=False)


  warn(


Feature 1: 0.000
Feature 2: 0.006
Feature 3: 0.000
Feature 4: 0.041
Feature 5: 0.001
Feature 6: 0.001
Feature 7: 0.002
Feature 8: 0.003
Feature 9: 0.000
Feature 10: 0.001
Feature 11: 0.005
Feature 12: 0.008
Feature 13: 0.007
Feature 14: 0.008
Feature 15: 0.000
Feature 16: 0.002
Feature 17: 0.004
Feature 18: 0.002
Feature 19: 0.000
Feature 20: 0.007
Feature 21: 0.003
Feature 22: 0.000
Feature 23: 0.013
Feature 24: 0.008
Feature 25: 0.003
Feature 26: 0.001
Feature 27: 0.000
Feature 28: 0.003
Feature 29: 0.001
Feature 30: 0.003
Feature 31: 0.002
Feature 32: 0.000
Feature 33: 0.015
Feature 34: 0.003
Feature 35: 0.003
Feature 36: 0.001
Feature 37: 0.003
Feature 38: 0.001
Feature 39: 0.004
Feature 40: 0.001
Feature 41: 0.000
Feature 42: 0.007
Feature 43: 0.001
Feature 44: 0.001
Feature 45: 0.007
Feature 46: 0.005
Feature 47: 0.004
Feature 48: 0.000
Feature 49: 0.001
Feature 50: 0.003
Feature 51: 0.002
Feature 52: 0.096
Feature 53: 0.001
Feature 54: 0.000
Feature 55: 0.006
Feature 56: 0.000
F