In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import csv

In [None]:
# Load the data
# The CSV is read from a hard-coded absolute path into the DataFrame `data`.
# NOTE(review): '/content/...' looks like a Colab upload location — confirm the
# file exists there before running in another environment.
data = pd.read_csv('/content/Data_subset1.csv')

In [None]:
# Show the loaded DataFrame as this cell's output for a quick visual check.
data

In [None]:
# Split the data into features (X) and target (y), both as NumPy arrays.
# Column positions are hard-coded: X takes positions 1 through 191 (position 0
# is skipped) and y takes position 192.
# NOTE(review): presumably position 0 is an identifier column and position 192
# is the target/last column — confirm against the CSV layout.
X = data.iloc[:, 1:192].values
y = data.iloc[:, 192].values

In [None]:
# Split the data into training and testing sets
# 30% of the rows are held out for testing; random_state=0 is passed so the
# split can be repeated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Candidate regressors to benchmark, kept in evaluation order.
# Models that are given a seed use random_state=0; the rest use their defaults.
regressors = []
# Tree-based models
regressors += [RandomForestRegressor(n_estimators=100, random_state=0),
               DecisionTreeRegressor(random_state=0)]
# Linear models (plain, L2-regularised, L1-regularised)
regressors += [LinearRegression(), Ridge(), Lasso()]
# Kernel and neighbour based models
regressors += [SVR(), KNeighborsRegressor()]
# Gradient-boosting libraries
regressors += [XGBRegressor(random_state=0), LGBMRegressor(random_state=0)]

In [None]:
# Fit every candidate on the training split and report its test-split scores.
for regressor in regressors:
    # Train, then predict the held-out targets
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Error metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    # Mean absolute percentage error, expressed in percent
    mape = 100 * np.mean(np.abs((y_test - y_pred) / y_test))
    r2 = r2_score(y_test, y_pred)

    # One labelled line per metric, followed by a separator rule
    print(type(regressor).__name__)
    for label, value in (('MSE:', mse), ('RMSE:', rmse), ('MAPE:', mape), ('R-squared:', r2)):
        print(label, value)
    print('-' * 50)



In [None]:
# Re-fit each candidate and write one row of test-split scores per model to
# 'regressor_scores.csv' (header row first).
csv_header = ['Regressor', 'MSE', 'RMSE', 'MAPE', 'R-squared']
with open('regressor_scores.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(csv_header)
    for regressor in regressors:
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mape = 100 * np.mean(np.abs((y_test - y_pred) / y_test))
        r2 = r2_score(y_test, y_pred)

        score_row = [type(regressor).__name__, mse, rmse, mape, r2]
        writer.writerow(score_row)





In [None]:
pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [None]:
# Read the same CSV path that was loaded into `data` earlier into a second
# DataFrame named `df`.
df = pd.read_csv('/content/Data_subset1.csv')

In [None]:
# Features are the columns at positions 1..191 and the target is the column
# at position 192, both converted to NumPy arrays (hard-coded positions).
X = df.iloc[:, 1:192].values
y = df.iloc[:, 192].values

In [None]:
# Hold out 20% of the rows for testing with random_state=42. This rebinds the
# X_train/X_test/y_train/y_test names created by the earlier 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize the 10 machine learning algorithms
# Every model that is given a seed uses random_state=42; LinearRegression and
# SVR are created without one. The SVR uses a linear kernel, which the later
# importance cell relies on when it reads `svm.coef_`.
rf = RandomForestRegressor(random_state=42)
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
svm = SVR(kernel='linear')
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)
cat = CatBoostRegressor(random_state=42)
mlp = MLPRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)


In [None]:
# Fit each algorithm on the training data, in the same order they were created.
for estimator in (rf, lr, dt, svm, xgb, lgbm, cat, mlp, gb, et):
    estimator.fit(X_train, y_train)

# Keep the last fitted model as the cell's displayed value.
et

In [None]:
# Collect one "importance" object per fitted model.
# Tree-based models expose `feature_importances_`; linear models expose `coef_`.
rf_feature_importances = rf.feature_importances_
lr_feature_importances = lr.coef_
dt_feature_importances = dt.feature_importances_

# SVR exposes `coef_` only for a linear kernel and never defines
# `feature_importances_`, so a non-linear kernel yields None instead of
# raising AttributeError.
svm_feature_importances = svm.coef_ if svm.kernel == 'linear' else None

xgb_feature_importances = xgb.feature_importances_
lgbm_feature_importances = lgbm.feature_importances_
cat_feature_importances = cat.feature_importances_

# NOTE(review): `coefs_` holds the network's per-layer weight matrices rather
# than a single per-feature score — confirm this is what should be exported.
mlp_feature_importances = mlp.coefs_

gb_feature_importances = gb.feature_importances_
et_feature_importances = et.feature_importances_

In [None]:
# Pair each algorithm label with the importance object collected for it and
# store the pairs in a two-column DataFrame for comparison.
algorithm_labels = [
    'Random Forest', 'Linear Regression', 'Decision Tree', 'Support Vector Machine', 'XGBoost',
    'LightGBM', 'CatBoost', 'MLP', 'Gradient Boosting', 'Extra Trees',
]
importance_values = [
    rf_feature_importances, lr_feature_importances, dt_feature_importances,
    svm_feature_importances, xgb_feature_importances, lgbm_feature_importances,
    cat_feature_importances, mlp_feature_importances, gb_feature_importances,
    et_feature_importances,
]
feature_importances_df = pd.DataFrame({
    'Algorithm': algorithm_labels,
    'Feature Importances': importance_values,
})

# Save feature importances to a CSV file. Each importance object is written
# as the text form of the whole array/list inside a single CSV field.
feature_importances_df.to_csv('feature_importances.csv', index=False)




In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import csv

In [None]:
# Features: every column except the first and the last; target: the last
# column. Both are taken from the same DataFrame (`df`) so that X and y always
# describe the same rows — the original read y from the separate `data`
# global, which only matches while both frames hold the same file.
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

In [None]:
# Random forest (100 trees, seed 42) fitted on the full X, y.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Impurity-based importance of every input column
importances = rf.feature_importances_

# Scores below are measured on the same rows the model was fitted on.
y_pred = rf.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('RF_feature_importances.csv', index=True)



In [None]:
# Linear-kernel SVR fitted on X, y as currently defined in the notebook.
svr = SVR(kernel='linear')

# Fit the model on the training data
svr.fit(X, y)

# Use the magnitude of the first row of the linear coefficients as the
# per-feature importance score.
importances = np.abs(svr.coef_[0])

# Print feature importances
for i, importance in enumerate(importances):
    print('Feature %d: %.3f' % (i+1, importance))

# Calculate performance metrics (measured on the same rows used for fitting)
y_pred = svr.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# Save feature importances and performance metrics in a CSV file.
# The cell previously ended with a bare `metrics =`, a SyntaxError that kept
# the whole cell from running; the statement is completed the same way the
# other model cells build and save their table.
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('svr_metrics.csv', index=True)



In [None]:
# Recompute the random forest's scores on X, y (the rows it was fitted on).
y_pred = rf.predict(X)
abs_pct_error = np.abs((y - y_pred) / y)

mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

In [None]:
# Save feature importances and performance metrics in a CSV file.
# The scores in scope were computed from `rf`, so the importances are read
# from `rf` too. The shared `importances` global may have been reassigned by
# the intervening SVR cell, which would pair SVR coefficients with
# random-forest scores.
importances = rf.feature_importances_
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('metrics.csv', index=True)

In [None]:
# Decision tree (seed 42) fitted on the full X, y.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X, y)

# Impurity-based importance of every input column, printed 1-based
importances = dt.feature_importances_
for position, score in enumerate(importances, start=1):
    print('Feature %d: %.3f' % (position, score))

# Scores are measured on the same rows the tree was fitted on.
y_pred = dt.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('decision_tree_metrics.csv', index=True)

In [None]:
# Linear-kernel SVR fitted on the full X, y.
svr = SVR(kernel='linear')
svr.fit(X, y)

# Magnitude of the first row of linear coefficients, used as importance
linear_weights = svr.coef_[0]
importances = np.abs(linear_weights)
for position, score in enumerate(importances, start=1):
    print('Feature %d: %.3f' % (position, score))

# Scores are measured on the same rows the model was fitted on.
y_pred = svr.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('svr_metrics.csv', index=True)




In [None]:
# XGBoost regressor (100 estimators, seed 42) fitted on the full X, y.
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X, y)

# Importance reported by the fitted booster for every input column
importances = xgb.feature_importances_
for position, score in enumerate(importances, start=1):
    print('XGB Feature %d: %.3f' % (position, score))

# Scores are measured on the same rows the model was fitted on.
y_pred = xgb.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# Write a fresh table for this model: one row per feature, with the scalar
# scores repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('xgb_metrics.csv', index=True)


In [None]:
# LightGBM regressor (100 estimators, seed 42) fitted on the full X, y.
lgbm = LGBMRegressor(n_estimators=100, random_state=42)
lgbm.fit(X, y)

# Importance reported by the fitted model for every input column
importances = lgbm.feature_importances_
for position, score in enumerate(importances, start=1):
    print('LGBM Feature %d: %.3f' % (position, score))

# Scores are measured on the same rows the model was fitted on.
y_pred = lgbm.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# Write a fresh table for this model: one row per feature, with the scalar
# scores repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('lgbm_feature_importance.csv', index=True)

In [None]:
# CatBoost regressor (100 iterations, learning rate 0.1, seed 42) fitted on
# the full X, y with training logs silenced.
cb = CatBoostRegressor(iterations=100, learning_rate=0.1, random_seed=42)
cb.fit(X, y, verbose=False)

# Importance reported by the fitted model for every input column
importances = cb.feature_importances_

# Scores are measured on the same rows the model was fitted on.
y_pred = cb.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# Write a fresh table for this model: one row per feature, with the scalar
# scores repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('catboost_feature_importance.csv', index=True)

In [None]:
# Gradient boosting regressor (100 estimators, seed 42) fitted on the full X, y.
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X, y)

# Impurity-based importance of every input column
importances = gbr.feature_importances_

# Scores are measured on the same rows the model was fitted on.
y_pred = gbr.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('gbr_feature_importance.csv', index=True)

In [None]:
# Extra-trees regressor (100 estimators, seed 42) fitted on the full X, y.
etr = ExtraTreesRegressor(n_estimators=100, random_state=42)
etr.fit(X, y)

# Impurity-based importance of every input column
importances = etr.feature_importances_

# Scores are measured on the same rows the model was fitted on.
y_pred = etr.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('etr_feature_importance.csv', index=True)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Create a k-nearest-neighbours regressor model (k = 5)
knn = KNeighborsRegressor(n_neighbors=5)

# Fit the model on the training data
knn.fit(X, y)

# No importance attribute is read from the KNN model here; an all-zero
# placeholder (one entry per input column) keeps the CSV layout the same as
# the other model cells.
importances = np.zeros(X.shape[1])

# Calculate performance metrics from the KNN model's own predictions.
# (The cell previously predicted with `lgbm`, so the saved "knn" scores were
# actually LightGBM's.) Scores are measured on the rows used for fitting.
y_pred = knn.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('knn_feature_importance.csv', index=True)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project X onto 10 principal components and refit the existing `dt` model on
# that reduced matrix (this replaces whatever `dt` had learned before).
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
dt.fit(X_pca, y)

# These importances refer to the 10 components, not to the original columns.
importances_pca = dt.feature_importances_

# Scores are measured on the same rows the tree was fitted on.
y_pred = dt.predict(X_pca)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per component; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances_pca}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('pca_dt_feature_importance.csv', index=True)

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around a seeded decision tree, keeping the
# top `n` input columns.
n = 10 # choose the number of top features to select
base_tree = DecisionTreeRegressor(random_state=42)
rfe = RFE(estimator=base_tree, n_features_to_select=n)
rfe.fit(X, y)

# Per-column ranking produced by RFE; it is stored under the
# 'Feature Importances' column of the table below.
rfe_ranking = rfe.ranking_

# Scores are measured on the same rows the selector was fitted on.
y_pred = rfe.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': rfe_ranking}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('recursive_feature_elimination_feature_importance.csv', index=True)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the already-fitted `rf` model on X, y (seed 42);
# the mean over repeats is kept as the per-feature score.
permutation_importances = permutation_importance(rf, X, y, random_state=42)
importances = permutation_importances.importances_mean

# Scores are measured with the same `rf` model on the same X, y.
y_pred = rf.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('permutation_importance_feature_importance.csv', index=True)

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic-net regressor with its default settings and random_state=42.
en = ElasticNet(random_state=42)

In [None]:
# Fit the elastic net on the full X, y and use the magnitude of each
# coefficient as that feature's importance (left un-normalised).
en.fit(X, y)
importances = np.abs(en.coef_)

# Scores are measured on the same rows the model was fitted on.
y_pred = en.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = 100 * np.mean(np.abs((y - y_pred) / y))
r2 = r2_score(y, y_pred)

# One row per feature; the scalar scores are repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('Elastic_Net_feature_importance.csv', index=True)

In [None]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (575 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.9/575.9 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [None]:
# Rebind `data` to a DataFrame read from 'data_subset.csv'.
# NOTE(review): this is a different file name from the 'Data_subset1.csv'
# loaded at the top of the notebook — confirm the switch is intentional.
data = pd.read_csv('/content/data_subset.csv')

In [None]:
# Split the dataset into input features and target variable.
# Both are kept as pandas objects (no `.values`), and both come from `data`.
X = data.iloc[:, 1:-1] # all columns except the last and first one
y = data.iloc[:,-1] # the last column as the target variable


In [None]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [None]:
# Univariate feature selection: score every column of X against y with
# f_regression and keep the 10 best. `X_selected` holds only those columns.
selector = SelectKBest(score_func=f_regression, k=10) # select top 10 features
X_selected = selector.fit_transform(X, y)

In [None]:
# Fresh random forest (100 trees, seed 42); rebinds the `rf` name used by
# earlier cells.
rf = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Fit the forest on the SelectKBest output only (the 10 retained columns).
rf.fit(X_selected, y)

RandomForestRegressor(random_state=42)

In [None]:
# Importances of the forest fitted on X_selected: one value per retained
# column, in the column order of X_selected.
importances = rf.feature_importances_

In [None]:
# Score the forest on X_selected, the same rows and columns it was fitted on.
y_pred = rf.predict(X_selected)
abs_pct_error = np.abs((y - y_pred) / y)

mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(abs_pct_error) * 100
r2 = r2_score(y, y_pred)

In [None]:
# One row per selected feature; the scalar scores are repeated on every row.
# NOTE(review): 'metrics.csv' is also the output path of other cells in this
# notebook, so whichever cell runs last determines the file's content.
metrics = pd.DataFrame({'Feature Importances': importances}).assign(
    MSE=mse, RMSE=rmse, MAE=mae, MAPE=mape, R2=r2)
metrics.to_csv('metrics.csv', index=False)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Create a random forest regressor that splits on the 'friedman_mse'
# criterion (a regression criterion, not Gini impurity).
# max_features=1.0 considers every feature at each split, which is what the
# former 'auto' alias meant for regressors; current scikit-learn releases
# reject 'auto', so the explicit value keeps the cell runnable.
rf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_features=1.0, random_state=42)

# Fit the model on the training data
rf.fit(X, y)

# Calculate feature importances
importances = rf.feature_importances_

# Print feature importances (1-based feature numbering)
for i, importance in enumerate(importances):
    print('Feature %d: %.3f' % (i+1, importance))

# Calculate performance metrics (measured on the same rows used for fitting)
y_pred = rf.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)
mape = np.mean(np.abs((y - y_pred) / y)) * 100
r2 = r2_score(y, y_pred)

# Save feature importances and performance metrics in a CSV file:
# one row per feature, with the scalar scores repeated on every row.
metrics = pd.DataFrame({'Feature Importances': importances, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape, 'R2': r2})
metrics.to_csv('metrics.csv', index=False)
