In [18]:
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
import statsmodels.api as sm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols

In [19]:
# Load the joined dataset from a CSV located next to the notebook (relative path).
# Earlier, now-disabled variants of this cell read df_All_Avg.csv and df_CDCA6.csv
# from absolute, user-specific folders; only the file names are kept here because
# those paths do not exist on other machines.
df_All = pd.read_csv('df_Join_all.csv')


In [20]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [21]:
# Optional date filter, currently disabled: would keep only rows with Date after 2022-06-15.
# df_All = df_All[df_All['Date'] > '2022-06-15 00:00:00']

In [22]:
# Columns removed from df_All before feature selection.
# NOTE(review): the original comment referred to XGBoost feature selection, but
# the cells below use sequential selection with linear regression — confirm intent.
# Several names end with a trailing space; presumably they mirror the CSV headers
# exactly — verify against the file before "fixing" them.
exclude_columns = ['Unnamed: 0', '%Al2O3_bM', 'Octanol PPM ', 'Hexanol PPM ',
       'Ethanol PPM ', 'Dodecanol PPM ', 'Decanol PPM ', '%nC8OH', '%nC6OH', '%nC12OH', '%nC10OH ',
       'TI52014', 'TI55013', 'TI55014', 'TI55015', 'TI55016', 'TI55017', 'TI55021', 'TI55023'
                   ]

# Report any excluded column that is not present so a misspelled name is still
# visible, then drop with errors='ignore' so that re-running this cell (after the
# columns are already gone) does not raise KeyError.
missing_columns = [col for col in exclude_columns if col not in df_All.columns]
if missing_columns:
    print(f"Excluded columns not found in df_All (skipped): {missing_columns}")

# Rebind df_All to a new DataFrame without the excluded columns.
df_All = df_All.drop(columns=exclude_columns, errors='ignore')

In [23]:
# Column list produced by an earlier stepwise regression: the response
# ('Butanol PPM') followed by the retained predictors.
selected_columns = [
    'Butanol PPM', '425 SAO Al', 'FC42428', 'FC55003', 'FC55102', 'FC55152',
    'FFC55555', 'LC55553', 'LC55557', 'LC55568', 'LC90366', 'TC55555',
]

# Keep, in the listed order, only the names that are actually present in df_All;
# names that are absent are skipped silently.
existing_columns = list(
    filter(lambda name: name in df_All.columns, selected_columns)
)

# Narrow df_All to those columns.
df_All = df_All[existing_columns]

In [24]:
# Display the column labels that remain in df_All after the selection step.
df_All.columns

Index(['Butanol PPM', '425 SAO Al', 'FC42428', 'FC55003', 'FC55102', 'FC55152',
       'FFC55555', 'LC55553', 'LC55557', 'LC55568', 'LC90366', 'TC55555'],
      dtype='object')

In [25]:
# Correlation heatmap of the columns kept in df_All, showing only strong pairs.
# plotly.express (px) is imported with the other libraries in the first cell.

# Pairs whose |correlation| is at or below this cutoff are blanked to 0.
threshold = 0.6

correlation_matrix = df_All.corr(numeric_only=True)
# Boolean-mask indexing leaves NaN wherever |r| <= threshold; fillna(0) turns
# those cells (and any correlations that were already NaN) into 0.
correlation_matrix = correlation_matrix[abs(correlation_matrix) > threshold].fillna(0)

fig = px.imshow(correlation_matrix,
                color_continuous_scale='balance',
                # Pin the color range to the full correlation range so that the
                # zero-filled cells sit at the midpoint of the diverging scale,
                # regardless of the smallest/largest value in the matrix.
                zmin=-1,
                zmax=1,
                labels=dict(x="Features", y="Features", color="Correlation"),
                title="Correlation Matrix")

# Put the feature labels under the heatmap and use a fixed square canvas.
fig.update_xaxes(title_text='Features', side='bottom')
fig.update_layout(width=800, height=800)

fig.show()


In [26]:
# Separate the response ('Butanol PPM') from the predictors, then hold out 20%
# of the rows for testing; the fixed random_state keeps the split repeatable.
y = df_All['Butanol PPM']
X = df_All.drop(columns=['Butanol PPM'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Ordinary least-squares estimator; used both inside the selector and for the
# final fit below.
linreg = LinearRegression()

# Forward floating sequential selection, scored by R^2 with 5-fold
# cross-validation. k_features='best' asks the selector to settle on the subset
# size itself rather than a fixed count.
sfs = SFS(
    linreg,
    k_features='best',
    forward=True,
    floating=True,
    scoring='r2',
    cv=5,
)

# Run the search on the training split only.
sfs = sfs.fit(X_train, y_train)

# Translate the chosen positional indices back into column labels.
selected_features = X_train.columns[list(sfs.k_feature_idx_)]

# Refit the estimator on just the chosen columns; fit() returns the estimator
# itself, so final_model and linreg refer to the same fitted object.
linreg.fit(X_train[selected_features], y_train)
final_model = linreg

# Predictions for the held-out rows, using the same chosen columns.
y_pred = final_model.predict(X_test[selected_features])

In [28]:
# Recover the chosen column labels from the fitted selector.
selected_features = X_train.columns[list(sfs.k_feature_idx_)]

# R^2 of the final model on the training rows and on the held-out rows.
r_squared_train = final_model.score(X_train[selected_features], y_train)
r_squared_test = final_model.score(X_test[selected_features], y_test)

# Report: chosen features first ...
print("Selected Features:")
print(selected_features)

# ... then one "name: coefficient" line per feature (coef_ follows the column
# order the model was fitted with) ...
print("\nModel Coefficients:")
for feature, coefficient in zip(selected_features, final_model.coef_):
    print(f"{feature}: {coefficient}")

# ... and finally the two R^2 values.
print(f"\nR-squared on Training Set: {r_squared_train}")
print(f"R-squared on Test Set: {r_squared_test}")


Selected Features:
Index(['425 SAO Al', 'FC42428', 'FC55003', 'FC55102', 'FC55152', 'FFC55555',
       'LC55553', 'LC55557', 'LC55568', 'LC90366', 'TC55555'],
      dtype='object')

Model Coefficients:
425 SAO Al: 25.17775577108802
FC42428: -0.0053502077679223785
FC55003: -0.08707793224835761
FC55102: 0.009690441504333165
FC55152: -0.00645465176065053
FFC55555: -1225.499580989331
LC55553: -3.790123484445094
LC55557: 7.708962492130608
LC55568: 22.260326246079796
LC90366: -0.9876116671266065
TC55555: 11.143149287396222

R-squared on Training Set: 0.019324370488962517
R-squared on Test Set: -0.16325654916601962


In [None]:
# Selected Features:

# Input
# Index(['425 SAO Al', 'Butanol PPM', 'AYC55580', 'DI55102', 'DI55152',
#        'DI55580', 'FC42428', 'FC52018', 'FC55003', 'FC55009', 'FC55102',
#        'FC55152', 'FC55552', 'FC55555', 'FC55569', 'FC55576', 'FFC55553',
#        'FFC55555', 'FYC55553', 'II52554', 'LC52572', 'LC55553', 'LC55555',
#        'LC55557', 'LC55568', 'LC90366', 'LC90368', 'PI55004', 'PI55020',
#        'PI55560', 'TC52015', 'TC55552', 'TC55553', 'TC55555', 'TC55566',
#        'TI40050', 'VI52558B'],
#       dtype='object')

# Output
# Index(['425 SAO Al', 'FC42428', 'FC55003', 'FC55102', 'FC55152', 'FFC55555',
#        'LC55553', 'LC55557', 'LC55568', 'LC90366', 'TC55555'],
#       dtype='object')

# Model Coefficients:
# 425 SAO Al: 25.17775577108802
# FC42428: -0.0053502077679223785
# FC55003: -0.08707793224835761
# FC55102: 0.009690441504333165
# FC55152: -0.00645465176065053
# FFC55555: -1225.499580989331
# LC55553: -3.790123484445094
# LC55557: 7.708962492130608
# LC55568: 22.260326246079796
# LC90366: -0.9876116671266065
# TC55555: 11.143149287396222

# R-squared on Training Set: 0.019324370488962517
# R-squared on Test Set: -0.16325654916601962