In [2]:
# %pip install statsmodels
# %pip install mlxtend
# %pip install xgboost

In [3]:
# Import necessary libraries
import itertools
import random

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols


In [4]:

# Read both parts ("1o2" and "2o2") of the contData_all_Avg CSV export
df_All_1, df_All_2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'C:\Users\saust\OneDrive\Desktop\GitRepo\Project-OptiC4\3 Feature Selection\contData_all_Avg - 1o2.csv',
        r'C:\Users\saust\OneDrive\Desktop\GitRepo\Project-OptiC4\3 Feature Selection\contData_all_Avg - 2o2.csv',
    )
)
# Stack the two parts row-wise (union) and renumber the index from 0
df_All = pd.concat([df_All_1, df_All_2], ignore_index=True)

print(df_All.head())

    425 %Al    Butanol   Decanol    Ethanol   Hexanol   Octanol   AYC55580  \
0  6.319560  30.366200  1.030260  88.289900  1.365840  1.492120  18.297300   
1  6.319915  30.678500  1.063260  89.685450  1.397135  1.548230  18.040650   
2  6.328340  32.399333  1.647950  88.677200  1.994413  2.454430  16.637633   
3  6.336760  32.845950  1.928805  87.080875  2.288335  2.897523  15.779400   
4  6.337460  33.387025  2.352595  84.489175  2.734682  3.571185  15.199680   

    DI55102   DI55152   DI55580  ...    TI40050   TI52014     TI55013  \
0  0.963347  0.924879  1.017340  ...  81.156900  131.0990  236.672000   
1  0.965043  0.924515  1.014885  ...  82.965650  132.3490  230.501000   
2  0.968107  0.929955  1.015593  ...  82.461167  133.0350  223.885667   
3  0.969573  0.932388  1.014703  ...  83.676300  134.0690  219.719750   
4  0.971022  0.934531  1.014094  ...  85.328040  134.8424  217.253200   

      TI55014     TI55015   TI55016    TI55017  TI55021     TI55023  VI52558B  
0  223.70400

In [5]:
# Lift the column cap so DataFrame displays show every column
pd.options.display.max_columns = None

In [6]:
# df_All = df_All[df_All['Date'] > '2022-06-15 00:00:00']

In [6]:
# Columns to leave out before running the XGBoost feature selection.
# The commented-out groups are further candidates that are currently kept in.
exclude_columns = [
    'Octanol', 'Hexanol',
    'Ethanol', 'Decanol',

    'TI52014', 'TI55013', 'TI55014', 'TI55015', 'TI55016', 'TI55017', 'TI55021', 'TI55023',
    'TC52015', 'FC52018', 'II52554', 'TI40050', 'VI52558B'

    # 'FC55102', 'FC55152', 'LC55557', 'LC55568', 'TC55555',

    # '425 SAO Al', 'FFC55553', 'LC52572', 'LC90366',

    # 'FC42428', 'LC55553',

    # 'FC55009'
]

# Drop the excluded columns. Because the result is assigned back to df_All,
# re-running this cell would otherwise raise KeyError (the columns are already
# gone); errors='ignore' keeps the cell idempotent. Note that it also skips
# labels that are simply misspelled, so check df_All.columns afterwards.
df_All = df_All.drop(columns=exclude_columns, errors='ignore')

In [7]:
# Show the column labels currently present in df_All
df_All.columns

Index(['425 %Al', 'Butanol', 'AYC55580', 'DI55102', 'DI55152', 'DI55580',
       'FC42428', 'FC55003', 'FC55009', 'FC55102', 'FC55152', 'FC55552',
       'FC55555', 'FC55569', 'FC55576', 'FFC55553', 'FFC55555', 'FYC55553',
       'LC52572', 'LC55553', 'LC55555', 'LC55557', 'LC55568', 'LC90366',
       'LC90368', 'PI55004', 'PI55020', 'PI55560', 'TC55552', 'TC55553',
       'TC55555', 'TC55566'],
      dtype='object')

In [9]:
# # Splitting into train and test
# X = df_All.drop('Butanol', axis=1)  # Assuming 'target' is your target column
# y = df_All['Butanol']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
def iterate_feature_rotations(df_all, target_column, test_size=0.2, random_state=42, num_random_iterations=30):
    """Fit XGBoost on many column orderings and collect split-count importances.

    Every predictor column takes a turn in the first position. For each such
    leading feature the remaining predictors are shuffled
    ``num_random_iterations`` times, and a fresh ``XGBRegressor`` is fitted on
    the training split of the reordered data.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Frame holding the predictor columns and the target column.
    target_column : str
        Column to predict; every other column of ``df_all`` is a predictor.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``. The held-out rows are not used here.
    random_state : int or None, default 42
        Seeds the column shuffling and is forwarded to ``train_test_split``.
    num_random_iterations : int, default 30
        Number of shuffled orderings fitted per leading feature.

    Returns
    -------
    list of tuple(list of str, dict)
        One ``(ordered_features, importances)`` pair per fitted model.
        ``importances`` maps every predictor, in ``ordered_features`` order, to
        its "weight" importance. Predictors the booster never split on are
        reported as ``0.0`` instead of being left out, so averages taken over
        the runs are not biased upwards.
    """
    columns = [col for col in df_all.columns if col != target_column]
    # Private generator: yields the same shuffle sequence as
    # random.seed(random_state) would, without resetting the global `random`
    # state that other code in the session may rely on.
    rng = random.Random(random_state)
    results = []

    for feature in columns:
        # Canonical order of the other predictors; copied before each shuffle
        # because shuffle() works in place.
        other_features = [f for f in columns if f != feature]

        for _ in range(num_random_iterations):
            remaining_features = list(other_features)
            rng.shuffle(remaining_features)
            ordered_features = [feature] + remaining_features

            # Predictors in this rotation's column order, plus the target
            X = df_all[ordered_features]
            y = df_all[target_column]
            X_train, _, y_train, _ = train_test_split(
                X, y, test_size=test_size, random_state=random_state
            )

            # Create and fit the XGBoost model
            model = xgb.XGBRegressor(objective='reg:squarederror')
            model.fit(X_train, y_train)

            # get_score() omits features with no splits; fill those in with 0
            raw_scores = model.get_booster().get_score(importance_type="weight")
            feature_importances = {name: raw_scores.get(name, 0.0) for name in ordered_features}

            results.append((ordered_features, feature_importances))

    return results


results = iterate_feature_rotations(df_All, 'Butanol')

# Long format: one record per (fitted model, feature) pair, keeping the
# column order that model was trained with alongside each importance value
flattened_results = [
    {
        'Feature Rotation': rotation,
        'Feature': name,
        'Importance': score,
    }
    for rotation, scores in results
    for name, score in scores.items()
]

results_df = pd.DataFrame(flattened_results)


In [12]:
# Plain-text dump of the long-format results table
print(results_df)

                                        Feature Rotation   Feature  Importance
0      [425 %Al, LC55557, FFC55555, FC55555, TC55552,...   425 %Al       477.0
1      [425 %Al, LC55557, FFC55555, FC55555, TC55552,...   LC55557       258.0
2      [425 %Al, LC55557, FFC55555, FC55555, TC55552,...  FFC55555       214.0
3      [425 %Al, LC55557, FFC55555, FC55555, TC55552,...   FC55555       166.0
4      [425 %Al, LC55557, FFC55555, FC55555, TC55552,...   TC55552       234.0
...                                                  ...       ...         ...
28825  [TC55566, TC55553, FC55102, FC55152, PI55560, ...   LC55568       145.0
28826  [TC55566, TC55553, FC55102, FC55152, PI55560, ...  FFC55553       150.0
28827  [TC55566, TC55553, FC55102, FC55152, PI55560, ...   PI55020       125.0
28828  [TC55566, TC55553, FC55102, FC55152, PI55560, ...   FC55569       192.0
28829  [TC55566, TC55553, FC55102, FC55152, PI55560, ...   425 %Al       187.0

[28830 rows x 3 columns]


In [13]:
# Mean importance of each feature over all rows of results_df
average_importances = results_df.groupby('Feature')['Importance'].mean()

# Series -> two-column frame ('Feature', 'Average Importance'),
# ranked from the highest average importance to the lowest
average_importances_df = (
    average_importances
    .reset_index(name='Average Importance')
    .sort_values(by='Average Importance', ascending=False)
)

# Display or save the DataFrame
print(average_importances_df)
# Or save it to a CSV file
# average_importances_df.to_csv('average_feature_importances.csv', index=False)


     Feature  Average Importance
0    425 %Al          222.892473
12   FC55569          219.253763
27   TC55552          211.535484
13   FC55576          203.887097
23   LC90368          202.908602
3    DI55152          199.497849
22   LC90366          186.881720
6    FC55003          184.702151
30   TC55566          182.258065
5    FC42428          180.329032
14  FFC55553          176.413978
2    DI55102          176.056989
20   LC55557          175.236559
15  FFC55555          173.005376
29   TC55555          165.967742
1   AYC55580          165.908602
21   LC55568          165.296774
17   LC52572          164.482796
28   TC55553          162.459140
19   LC55555          160.037634
25   PI55020          159.554839
7    FC55009          158.030108
4    DI55580          148.455914
18   LC55553          145.992473
24   PI55004          136.580645
9    FC55152          134.036559
11   FC55555          129.002151
8    FC55102          128.508602
26   PI55560          123.712903
16  FYC555