In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split
import itertools
import random
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


In [None]:
#df_All = pd.read_csv(r"C:\Users\saust\OneDrive - Sasol\1 Project rC4\Jupyter Notebooks\Report 10-20-23 No Fluff\df_All_Avg.csv")

df_All_1o2 = pd.read_csv('3 Feature Selection/contData_all_Avg - 1o2.csv')
df_All_1o2 = pd.read_csv('3 Feature Selection/contData_all_Avg - 2o2.csv')

#df_All = pd.read_csv(r"C:\Users\saust\OneDrive\Desktop\CodeSpace DLs\102423\Project rC4\3 Final Machine Butanol\df_CDCA6.csv")


In [None]:
# Concatenate (union) the dataframes
df_All = pd.concat([df_All_1o2, df_All_1o2], ignore_index=True)

print(df_All.head())

In [None]:
# Show every column when a DataFrame is displayed (None = no column limit)
pd.options.display.max_columns = None

In [None]:
# df_All = df_All[df_All['Date'] > '2022-06-15 00:00:00']

In [None]:
# Columns to exclude before running feature selection.
# Every active entry ends with a comma so that un-commenting one of the
# candidate lines below cannot silently merge two adjacent string literals
# into a single (non-existent) column name.
exclude_columns = [
    'Octanol', 'Hexanol',
    'Ethanol', 'Decanol',

    'TI52014', 'TI55013', 'TI55014', 'TI55015', 'TI55016', 'TI55017', 'TI55021', 'TI55023',
    'TC52015', 'FC52018', 'II52554', 'TI40050', 'VI52558B',

    # Further candidates, currently kept in the dataset:
    # 'FC55102', 'FC55152', 'LC55557', 'LC55568', 'TC55555',

    # '425 SAO Al', 'FFC55553', 'LC52572', 'LC90366',

    # 'FC42428', 'LC55553',

    # 'FC55009',
]

# Create a new DataFrame without the excluded columns.
# Only columns that are still present are dropped, so re-running this cell
# after df_All has already been trimmed does not raise a KeyError.
# Trade-off: a misspelled name in exclude_columns is skipped silently.
columns_to_drop = [col for col in exclude_columns if col in df_All.columns]
df_All = df_All.drop(columns=columns_to_drop)

In [None]:
# Display the column labels currently in df_All
df_All.columns

In [None]:
# # Splitting into train and test
# X = df_All.drop('Butanol', axis=1)  # Assuming 'target' is your target column
# y = df_All['Butanol']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
def adjusted_r2(r_squared, num_data_points, num_predictors):
    """Return the adjusted R-squared of a regression fit.

    Computes ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r_squared : float
        Ordinary R-squared of the fit.
    num_data_points : int
        Number of observations ``n`` the R-squared was computed on.
    num_predictors : int
        Number of predictors ``p`` used by the model.

    Returns
    -------
    float
        The adjusted R-squared, or ``nan`` when ``n - p - 1 <= 0``. In that
        case the statistic is undefined: the formula would divide by zero or
        by a negative number of residual degrees of freedom.
    """
    residual_dof = num_data_points - num_predictors - 1
    if residual_dof <= 0:
        # Not enough observations for this many predictors.
        return float('nan')
    return 1 - (1 - r_squared) * (num_data_points - 1) / residual_dof

def iterate_feature_rotations(df_all, target_column, test_size=0.2, random_state=42):
    """Run floating forward feature selection once per feature "rotation".

    For every non-target column, that column is placed first and the other
    feature columns are shuffled behind it. The reordered frame is split into
    train and test sets, a SequentialFeatureSelector wrapping a
    LinearRegression picks a feature subset on the training set (5-fold CV,
    scored by R2), and a linear model refitted on that subset is evaluated on
    the test set.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Frame holding the feature columns and the target column.
    target_column : str
        Name of the column to predict.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Seeds the global ``random`` module (used for the shuffles) and is
        passed to ``train_test_split``.

    Returns
    -------
    list of tuple
        One ``(ordered_features, metrics)`` pair per feature column, where
        ``ordered_features`` is the column order used for that run and
        ``metrics`` is a dict with the keys ``'R2'``, ``'MSE'``, ``'RMSE'``,
        ``'MAE'`` and ``'Adjusted R2'``, all computed on the test set.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df_all``.
    """
    if target_column not in df_all.columns:
        raise KeyError(f"target column {target_column!r} not found in DataFrame")

    results = []
    columns = [col for col in df_all.columns if col != target_column]
    random.seed(random_state)  # for reproducibility (seeds the global RNG)

    for feature in columns:
        # Put the current feature first, the rest in random order
        remaining_features = [f for f in columns if f != feature]
        random.shuffle(remaining_features)
        ordered_features = [feature] + remaining_features

        reordered_df = df_all[ordered_features + [target_column]]

        # Split into train and test for this column ordering
        X = reordered_df.drop(target_column, axis=1)
        y = reordered_df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Select features with sequential floating forward selection
        sfs = SFS(LinearRegression(),
                  k_features='best',
                  forward=True,
                  floating=True,
                  scoring='r2',
                  cv=5)
        sfs.fit(X_train, y_train)

        # Map the selected positional indices back to column names
        selected_features = X_train.columns[list(sfs.k_feature_idx_)]

        # Fit the final model on a fresh estimator using only the selected features
        final_model = LinearRegression().fit(X_train[selected_features], y_train)
        y_pred = final_model.predict(X_test[selected_features])

        # Calculate performance metrics on the held-out test set
        r2 = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        # RMSE is derived from MSE: the `squared=False` keyword of
        # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
        # 1.6, so passing it fails on current versions.
        rmse = mse ** 0.5
        mae = mean_absolute_error(y_test, y_pred)
        adj_r2 = adjusted_r2(r2, len(y_test), len(selected_features))

        # Store the column order together with its performance metrics
        results.append((ordered_features, {'R2': r2, 'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'Adjusted R2': adj_r2}))

    return results

# Run the rotation search on df_All with 'Butanol' as the target column,
# using the function's default test_size and random_state.
results = iterate_feature_rotations(df_All, 'Butanol')

In [None]:
# X_train, sfs and final_model only exist as locals inside
# iterate_feature_rotations(), so they are not available here on a fresh
# kernel. Rebuild the model for the rotation with the highest test-set R2,
# using the same split and selector settings that function uses by default.
# NOTE: the rotation is picked by its test-set score, so the test R2 printed
# below is optimistically biased.
best_order, best_metrics = max(results, key=lambda item: item[1]['R2'])

X = df_All[best_order]
y = df_All['Butanol']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

sfs = SFS(LinearRegression(),
          k_features='best',
          forward=True,
          floating=True,
          scoring='r2',
          cv=5)
sfs.fit(X_train, y_train)

# Get the selected feature names
selected_features = X_train.columns[list(sfs.k_feature_idx_)]
print("Selected Features:")
print(selected_features)

# Fit the final model on the selected features only
final_model = LinearRegression().fit(X_train[selected_features], y_train)

# Display the coefficients of the final model
print("\nModel Coefficients:")
for feature, coefficient in zip(selected_features, final_model.coef_):
    print(f"{feature}: {coefficient}")

# Display the R-squared value for the training set
r_squared_train = final_model.score(X_train[selected_features], y_train)
print(f"\nR-squared on Training Set: {r_squared_train}")

# Display the R-squared value for the held-out test set
r_squared_test = final_model.score(X_test[selected_features], y_test)
print(f"R-squared on Test Set: {r_squared_test}")
