## Libraries and Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

## Preprocessing

### Mervyn's Preprocessing - some complicated sh*t

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.feature_selection import RFE

# File path and variables
# CSV file that PreProcessing reads in its constructor.
filename = 'Concrete_Data_Yeh_final.csv'
# Names of the eight input features (the target column is not listed here).
# NOTE(review): `variables` is a module-level name that later cells rebind to a
# nine-item list with different spellings, so its value depends on which cells
# have been run.
variables = ['cement', 'slag', 'flyash', 'water', 'superplasticizer', 'coarseaggregate', 'fineaggregate', 'age']

# Preprocessing class
class PreProcessing:
    """Handles data preprocessing tasks for regression modeling.

    The instance owns one DataFrame (``self.data``); every public method
    mutates it and returns it.  Log and square-root transformations are
    recorded per column, so chaining ``FillNaN`` and ``removeNaN`` (or calling
    either twice) never transforms the same column twice.
    """

    # Columns that receive a natural-log transformation.
    _LOG_COLUMNS = ['age', 'cement', 'water', 'fineaggregate', 'coarseaggregate']
    # Columns that receive a square-root transformation (applied after the log).
    # NOTE(review): 'cement' is in both lists, so it ends up as sqrt(log(x));
    # confirm that this is intended.
    _SQRT_COLUMNS = ['cement', 'superplasticizer']
    # Columns imputed with their own mean by FillNaN(method='mean').
    _MEAN_FILL_COLUMNS = ['cement', 'water', 'coarseaggregate', 'fineaggregate',
                          'slag', 'flyash', 'superplasticizer']

    def __init__(self, file):
        """Initializes the class by reading the data from the file.

        Args:
            file: Anything accepted by ``pd.read_csv`` (path or file-like object).
        """
        self.data = pd.read_csv(file)  # Read CSV file into a DataFrame
        self._applied = set()  # (transform tag, column) pairs already applied

    def _transform_once(self, func, tag, columns):
        """Applies ``func`` to ``columns``, skipping columns already transformed under ``tag``."""
        # setdefault keeps this working on instances built without __init__.
        applied = self.__dict__.setdefault('_applied', set())
        pending = [col for col in columns if (tag, col) not in applied]
        if pending:
            self.data[pending] = func(self.data[pending])
            applied.update((tag, col) for col in pending)

    def checkNaN(self):
        """Returns the number of missing values (NaN) in each column."""
        return self.data.isnull().sum()

    def FillNaN(self, method='mean'):
        """Fills missing values with the specified method and applies transformations.

        Args:
            method: ``'mean'`` (default) fills the seven mixture columns with
                their column mean; ``'median'`` fills every column with its
                median; ``'knn'`` imputes every column with ``KNNImputer``.

        Returns:
            The preprocessed DataFrame (``self.data``).

        Raises:
            ValueError: If ``method`` is not one of the three supported names.
        """
        if method == 'mean':
            # Assign the filled column back instead of calling fillna(inplace=True)
            # on ``self.data[col]``: the chained in-place form may act on a
            # temporary object and leave the frame unchanged.
            # Only feature columns are imputed here; the target column is left
            # untouched so that imputation never rewrites the labels.
            for col in self._MEAN_FILL_COLUMNS:
                self.data[col] = self.data[col].fillna(self.data[col].mean())
        elif method == 'median':
            self.data.fillna(self.data.median(), inplace=True)
        elif method == 'knn':
            imputer = KNNImputer()
            self.data = pd.DataFrame(imputer.fit_transform(self.data), columns=self.data.columns)
        else:
            raise ValueError("Invalid imputation method.")

        # Vectorized transformations; each column is transformed at most once.
        self._transform_once(np.log, 'log', self._LOG_COLUMNS)
        self._transform_once(np.sqrt, 'sqrt', self._SQRT_COLUMNS)

        return self.data  # Return the preprocessed DataFrame

    def handle_outliers(self, method='capping', threshold=3):
        """Handles outliers using the specified method.

        Args:
            method: ``'capping'`` clips every column to its 5th-95th percentile
                range; ``'winsorizing'`` clips every column to
                ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``.
            threshold: IQR multiplier, used by ``'winsorizing'`` only.

        Returns:
            The DataFrame (``self.data``) with clipped values.

        Raises:
            ValueError: If ``method`` is not a supported name.
        """
        if method == 'capping':
            for col in self.data.columns:
                lower, upper = self.data[col].quantile([0.05, 0.95])
                self.data[col] = self.data[col].clip(lower=lower, upper=upper)
        elif method == 'winsorizing':
            for col in self.data.columns:
                q1, q3 = self.data[col].quantile([0.25, 0.75])
                IQR = q3 - q1
                if IQR != 0:
                    self.data[col] = self.data[col].clip(lower=q1 - threshold * IQR,
                                                         upper=q3 + threshold * IQR)
                else:
                    print(f"IQR for {col} is zero, skipping winsorizing.")
        else:
            raise ValueError("Invalid outlier handling method.")

        return self.data  # Return the preprocessed DataFrame

    def removeNaN(self) -> pd.DataFrame:
        """Removes rows with missing values and applies the log transformation.

        The log is applied to the log columns found among all but the last
        column, and only to columns that have not been log-transformed already.
        Taking the log a second time would map ``log(1) == 0`` to ``-inf``.

        Returns:
            The preprocessed DataFrame (``self.data``).
        """
        # Drop rows with missing values
        self.data.dropna(inplace=True)

        present = [col for col in self.data.columns[:-1] if col in self._LOG_COLUMNS]
        self._transform_once(np.log, 'log', present)

        return self.data  # Return the preprocessed DataFrame

# Example usage
data = PreProcessing(filename)
concdata = data.FillNaN(method='mean')  # Fill the mixture columns with their column means
#print(data)
#concretedata = data.handle_outliers(method= 'capping')  # Handle outliers using capping
# The 'mean' method does not fill 'age' or 'csMPa', so rows that still contain
# NaN in any column are dropped here.
concdata = data.removeNaN()  # Remove rows with missing values
print(concdata.head(16))

### Aakash's Preprocessing - by inspection

If values before and after are the same, this value replaces the NaN. No problems with this.

If values before and after are different, things get messy. There are three scenarios:

1. Needs to be filled with the value before. Most frequent scenario. Example from database is water row 16; looking at excel file, the NaN should obviously be filled with 228. This is done in the code below (1).

2. Needs to be filled with a mean. An example from the database is age row 4; looking at the Excel file, it's not obvious what the NaN should be filled with, so we use the average of the values immediately before and after it (can this be changed to something better?). This is done in the code below (2).

3. Needs to be filled with the value after. This is the least frequent scenario; there are only 5 such cases. An example from the database is slag rows 185 and 186. This HAS NOT been implemented in the code: I'm not sure what conditions would distinguish the NaNs that should be replaced with the value before from those that should be replaced with the value after. Because there are only 5, does this matter that much?

This code is really messy, and I apologise to you three good coders who may have a stroke seeing what I've done. I just could not get the `fillna` method to work for the "one and two before" rule (if you ask Copilot, `fillna` is what it recommends using), so this is something I'm sure one of you will be able to do.

In [None]:
class InspPreProcessing:
    """Fills NaN values by inspecting the rows around each gap.

    Rules applied to every NaN that is neither in the first nor the last row:

    * the values before and after are equal -> use that value;
    * they differ and the two preceding values are equal -> use the value before;
    * otherwise -> use the average of the values before and after.

    The DataFrame passed to the constructor is modified in place.
    """

    def __init__(self, df):
        """Stores the DataFrame and the module-level list of new column names."""
        self.df = df
        self.variables = variables  # module-level `variables` at construction time

    def rename_columns(self):
        """Renames the columns, in order, to the names in ``self.variables``."""
        self.df.rename(columns=dict(zip(self.df.columns, self.variables)),
            inplace=True)
        return self.df

    def replaceNaN(self) -> pd.DataFrame:
        """Fills interior NaN values column by column using the class rules.

        NaN values in the first or last row are left as they are.  If a
        neighbour is itself NaN, the average rule yields NaN and the gap stays.

        Returns:
            The same DataFrame (``self.df``) with gaps filled.
        """
        for col in self.df.columns:
            column = self.df[col]
            if not column.isna().any():
                continue  # nothing to fill; leave the column untouched

            # Work on a private array and assign the whole column back once.
            # Writing through ``self.df[col].iloc[i] = ...`` is chained
            # assignment and may modify a temporary instead of the frame.
            values = column.to_numpy(copy=True)
            for i in range(1, len(values) - 1):
                if not pd.isna(values[i]):
                    continue

                before, after = values[i - 1], values[i + 1]
                if before == after:
                    # values BEFORE AND AFTER are the SAME
                    values[i] = before
                elif i >= 2 and before == values[i - 2]:
                    # 1 - values ONE AND TWO BEFORE are the SAME.  The ``i >= 2``
                    # guard stops index i-2 from wrapping to the last row.
                    values[i] = before
                else:
                    # 2 - fall back to the average of the two neighbours
                    values[i] = (before + after) / 2

            self.df[col] = values

        return self.df
    
# Reload the raw CSV so this cell does not depend on earlier preprocessing.
df = pd.read_csv("Concrete_Data_Yeh_final.csv")
# Short column names, in file column order; the last entry names the target.
# This rebinds the module-level `variables` that InspPreProcessing reads.
variables = ['cement', 'slag', 'ash', 'water', 'superplastic','coarseagg','fineagg', 'age', 'strength']

preprocessor = InspPreProcessing(df)

# Both calls return the same DataFrame object that was passed in (`df`).
concdata = preprocessor.rename_columns()
concdata = preprocessor.replaceNaN()

# Notebook display of the first 190 rows.
concdata.head(190)

### Other Preprocessing - by mean

In [None]:
class MeanPreProcessing:
    """Replaces all NaN values in the original database with the mean of the
    corresponding variable.

    The DataFrame passed to the constructor is modified in place.
    """

    def __init__(self, df):
        """Stores the DataFrame and the module-level list of new column names."""
        self.df = df
        self.variables = variables  # module-level `variables` at construction time

    def renameColumns(self):
        """Renames the columns, in order, to the names in ``self.variables``."""
        #simple column names
        self.df.rename(columns=dict(zip(self.df.columns, self.variables)),
            inplace=True)
        return self.df

    def replaceNaN(self) -> pd.DataFrame:
        """Fills NaN values in numeric columns with that column's mean.

        Non-numeric columns are skipped rather than raising an error.

        Returns:
            The same DataFrame (``self.df``) with gaps filled.
        """
        mean_values = self.df.mean(numeric_only=True)
        self.df.fillna(value=mean_values, inplace=True)
        return self.df

    def meanValues(self) -> pd.DataFrame:
        """Returns the numeric column means as a one-column DataFrame named 'mean'."""
        return self.df.mean(numeric_only=True).to_frame(name='mean')

# Reload the raw CSV so this cell starts from unprocessed data.
df = pd.read_csv("Concrete_Data_Yeh_final.csv")
# Short column names, in file column order; the last entry names the target.
variables = ['cement', 'slag', 'ash', 'water', 'superplastic','coarseagg','fineagg', 'age', 'strength']

preprocessor = MeanPreProcessing(df)

# Running this cell rebinds `concdata`, replacing the result of any
# preprocessing cell run before it.
concdata = preprocessor.renameColumns()
concdata = preprocessor.replaceNaN()
# Means are computed after filling, i.e. on the already imputed frame.
mean_df = preprocessor.meanValues()

# Notebook display of the per-column means.
mean_df

### Processed Database

In [None]:
# Notebook display of whichever `concdata` the last-run preprocessing cell produced.
concdata

### Statistical Summary

In [None]:
# Summary statistics, transposed so that each row is one column of `concdata`.
concdata.describe().transpose()

### Correlation Matrix

In [None]:
# Pairwise correlation between all columns of `concdata`.
corr = concdata.corr()
# Render the matrix as a colour-graded table (notebook display).
corr.style.background_gradient(cmap='coolwarm')

## Regression Model

### Scaling

In [None]:
scaler = MinMaxScaler()

trainconcdata, testconcdata = train_test_split(concdata, test_size=0.2, random_state=42)

# The scaler is fitted on the training split only and reused for the test split.
# Column labels come from the split itself rather than from the module-level
# `variables`, whose length and spelling depend on which cells have been run.
# The result gets a fresh 0..n-1 row index.
scaled_trainconcdata = pd.DataFrame(scaler.fit_transform(trainconcdata), columns=trainconcdata.columns)
scaled_testconcdata = pd.DataFrame(scaler.transform(testconcdata), columns=testconcdata.columns)

### Main Functions

In [None]:
def RegressionModel(regression_type, regression_name, metriclist, xaxis=None):
    """Fit one single-feature regression per input column and plot each fit.

    Reads the module-level ``scaled_trainconcdata`` / ``scaled_testconcdata``
    frames.  Every column except the last is used as the sole feature; the
    ``'strength'`` column is the target.

    Args:
        regression_type: Estimator class, instantiated with no arguments; it
            must provide ``fit``, ``predict``, ``score`` and ``coef_``.
        regression_name: Label used in the plot titles.
        metriclist: Callables ``metric(y_true, y_pred)``; each result is printed.
        xaxis: Axis labels, one per feature in column order.  Defaults to the
            feature column names.  The default is resolved at call time because
            the module-level ``xaxis`` list is defined after this function.

    Returns:
        A tuple of two dicts with identical content, each mapping feature name
        to the fitted gradient.  Two dicts are returned because callers index
        the result.
    """
    feature_names = list(scaled_trainconcdata.columns[:-1])
    if xaxis is None:
        xaxis = feature_names

    # The target vectors are the same for every feature, so build them once.
    y_train = scaled_trainconcdata['strength'].to_numpy().reshape(-1, 1)
    y_test = scaled_testconcdata['strength'].to_numpy().reshape(-1, 1)

    linearheatmap = {}
    ridgeheatmap = {}

    for index, feature in enumerate(feature_names):
        x_train = scaled_trainconcdata[feature].to_numpy().reshape(-1, 1)
        x_test = scaled_testconcdata[feature].to_numpy().reshape(-1, 1)

        regressor = regression_type()
        regressor.fit(x_train, y_train)
        y_pred = regressor.predict(x_test)

        # ravel() works for both 1-D and 2-D ``coef_`` and avoids calling
        # float() on a multi-dimensional array.
        gradient = float(np.ravel(regressor.coef_)[0])
        linearheatmap[feature] = gradient
        ridgeheatmap[feature] = gradient

        metrics = [(metric.__name__, metric(y_test, y_pred)) for metric in metriclist]
        print (pd.DataFrame(metrics, columns=['Metric', 'Value']))

        plt.figure(figsize=(8, 4))
        plt.scatter(x_test, y_test, color='blue', label='Actual', s=5)
        plt.plot(x_test, y_pred, color='red', label='Predicted')
        plt.title(f'{regression_name} | Score = {regressor.score(x_test, y_test):.3f} | Gradient = {gradient:.3f}')
        plt.xlabel(xaxis[index])
        plt.ylabel('Compression Strength (MPa)')
        plt.legend()
        plt.show()
    return linearheatmap, ridgeheatmap

def Heatmap(heatmap, regression_name):
    """Draw a one-row annotated heat map of the values in ``heatmap``.

    Args:
        heatmap: Dict mapping a column label to a value (callers pass the
            per-feature gradients returned by ``RegressionModel``).
        regression_name: Label used in the plot title.
    """
    # Single row labelled "strength", one column per key of `heatmap`.
    gradient_row = pd.DataFrame(heatmap, index=["strength"])
    annotation_style = {'size': 12}

    plt.figure(figsize=(8, 1))
    sns.heatmap(
        gradient_row,
        cmap='coolwarm',
        annot=True,
        annot_kws=annotation_style,
    )
    plt.title(f'{regression_name} | Heat map')
    plt.show()

# Metric callables, each called as metric(y_test, y_pred) inside RegressionModel.
metriclist = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]
# x-axis labels, one per feature column in column order (indexed by position
# in RegressionModel).
xaxis = ['Cement (kg/m3)', 'Blast Furnace Slag (kg/m3)', 'Fly Ash (kg/m3)', 'Water (kg/m3)', 'Superplasticizer (kg/m3)', 'Coarse Aggregate (kg/m3)', 'Fine Aggregate (kg/m3)', 'Age (days)']  

### Linear Regression Plots

In [None]:
# RegressionModel returns a tuple of two dicts, so `linearheatmap` holds that tuple.
linearheatmap = RegressionModel(LinearRegression, "Linear Regression", metriclist, xaxis)

### Ridge Regression Plots

In [None]:
# RegressionModel returns a tuple of two dicts, so `ridgeheatmap` holds that tuple.
ridgeheatmap = RegressionModel(Ridge, "Ridge Regression", metriclist, xaxis)

### Linear Regression Heatmap

In [None]:
# Element 0 of the returned tuple is the feature -> gradient dict.
Heatmap(linearheatmap[0], "Linear Regression")

### Ridge Regression Heatmap

In [None]:
# Element 0 of the returned tuple is the feature -> gradient dict.
Heatmap(ridgeheatmap[0], "Ridge Regression")

### Regression Types

In [None]:
# Feature matrix: every column of `concdata` except the last one.
X = concdata.iloc[:,:-1].to_numpy() 
#y = concdata['csMPa'].to_numpy()  # IF USING MERVYN PREPROCESSING
# Target vector: the column named "strength" (the name set by the renaming cells).
y = concdata["strength"].to_numpy()  # IF USING AAKASH PREPROCESSING

print(f'y:\n{y}\n')
print(f'X:\n{pd.DataFrame(X)}\n') # to check if the data is split correctly (X = 2D array, y = 1D array).
# 85 % / 15 % split on the unscaled data, with a fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

class RegressionTypes:
    """Observes the correlation between predicted and actual values of
    compressive strength for one regression model.

    The model is built, fitted and used for prediction in the constructor, so
    an instance always carries ``y_pred`` for the given test set.
    """

    def __init__(self, x_train, x_test, y_train, y_test, regression, **kwargs): #input if needed: metriclist = list
        """Builds ``regression(**kwargs)``, fits it on the training data and predicts ``x_test``.

        Args:
            x_train, x_test: Feature arrays for training and testing.
            y_train, y_test: Target arrays for training and testing.
            regression: Estimator class providing ``fit``, ``predict`` and ``score``.
            **kwargs: Passed unchanged to the ``regression`` constructor.
        """
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        self.regression = regression(**kwargs)
        self.regfit = self.regression.fit(x_train, y_train)
        # One predicted value of y for each row of x_test, from the fitted model.
        self.y_pred = self.regression.predict(x_test)

    def Metric(self, metric, **kwargs):
        """Evaluates ``metric`` on the test predictions.

        Args:
            metric: Callable ``metric(y_true, y_pred, **kwargs)``.
            **kwargs: Forwarded to ``metric``.

        Returns:
            Tuple ``(metric value, estimator score on the test set)``.
        """
        # TODO: optionally report a cross_val_score summary on the training data.
        return (
            metric(self.y_test, self.y_pred, **kwargs),
            self.regression.score(self.x_test, self.y_test),
        )

    def Plot(self, **kwargs):
        """Scatter-plots predicted against actual values with a dashed y = x line.

        ``**kwargs`` is accepted for call compatibility and is currently unused.
        """
        # NOTE(review): the reference line spans a fixed 0..80 range; confirm it
        # covers the target values being plotted.
        ref_x = np.linspace(0, 80, 1000)
        ref_y = ref_x
        plt.figure(figsize=(6, 4))
        plt.scatter(self.y_test, self.y_pred, color='red', s=5)
        plt.title(f'Score: {self.regression.score(self.x_test, self.y_test)}')
        plt.plot(ref_x, ref_y, color='black', linestyle = 'dashed')
        plt.xlabel('Actual compressive strength (Mpa)')
        plt.ylabel('Predicted compressive strength (Mpa)')
        plt.show()
 

In [None]:
regressors = [LinearRegression, Ridge, Lasso, RandomForestRegressor]


def rmse(y_true, y_pred):
    """Return the root mean squared error.

    Computed with ``np.sqrt`` because the ``squared=False`` argument of
    ``mean_squared_error`` is deprecated and absent from recent scikit-learn.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


metrics = [rmse, mean_squared_error, r2_score, mean_absolute_error, explained_variance_score]

# Constructor arguments per estimator class; classes not listed get none.
regressor_kwargs = {
    Ridge: {'alpha': 0.1, 'random_state': 42},
    Lasso: {'alpha': 0.1, 'random_state': 42},
    RandomForestRegressor: {'n_estimators': 200, 'max_depth': 30, 'random_state': 42},
}

for r in regressors:
    print(f'Using {r.__name__}')
    RegressionTypes(x_train, x_test, y_train, y_test, r, **regressor_kwargs.get(r, {})).Plot()

### Thoughts

These regression scores are pretty bad (except random forest <3). We need to get them up if we want to use this form of preprocessing (which I personally think is the best).

Mervyn's preprocessing still raises the "input contains NaN or infinity" error for me.