In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score

## Notes to consider for the challenge

1. Pre-processing of the data is mostly complete, though we still need to confirm within the group whether this is the best representation of the data.
 - We replaced the missing values with the mean of each column of the dataset (Concrete_Data_Yeh_final.csv) - but is this the right approach?

2. Write a function that runs through all the regression types and produces a demo plot, to see whether a given regression is appropriate for one column.
 - If the results are satisfactory, we need to settle on this type of regression for each column.


In [3]:
# CSV file holding the raw data set (path is relative to the working directory).
filename = 'Concrete_Data_Yeh_final.csv'

# Column names read from the file; 'csMPa' is the output variable,
# the remaining names are the inputs.
variables = [
    'cement',
    'slag',
    'flyash',
    'water',
    'superplasticizer',
    'coarseaggregate',
    'fineaggregate',
    'age',
    'csMPa',
]

class PreProcessing:
    """Load the data file, fill its missing values and expose single columns.

    The aim of this class is to replace all the empty values from the file
    given, and to hand each column back as its own NumPy array.
    """

    def __init__(self, file):
        """Read ``file`` into ``self.data``.

        ``file`` is passed straight to ``pd.read_csv``, so it may be a path
        or an open file-like object.
        """
        self.data = pd.read_csv(file)

    def FillNaN(self):
        """Replace NaN values in every numeric column with that column's mean.

        Prints the mean, standard deviation and length of each processed
        column, and returns ``self`` so calls can be chained.

        (Please check this as we may use an alternative fillna method.)
        """
        # Only numeric columns have a meaningful mean; other columns are left as they are.
        for column in self.data.select_dtypes(include='number').columns:
            series = self.data[column]
            # Assign the result back to the frame. Calling fillna(inplace=True) on a
            # selected column may act on a temporary copy and leave the frame unchanged.
            self.data[column] = series.fillna(series.mean())
            print(self.data[column].mean(), self.data[column].std(), len(self.data[column]))
        return self

    def SplitColumns(self, variable) -> np.ndarray:
        """Return the column named ``variable`` as a NumPy array."""
        return self.data[variable].to_numpy()

test = PreProcessing(filename).FillNaN()
# Publish every column as a module-level array named after the column (cement, slag, ..., csMPa).
# These can be recalled for scaling and modelling; note that an array on its own
# does not carry the name of the column it came from.
for variable in variables:
    globals()[variable] = test.SplitColumns(variable)

# just for show
#print(cement, '\n slag:', slag, '\n flyash:', flyash, '\n water:', water, '\n superplasticizer:', superplasticizer, 
# '\n coarseaggregate:', coarseaggregate, '\n fineaggregate:', fineaggregate, '\n age:', age, '\n csMPa', csMPa)

281.1678640776696 104.50636449481543 1030
73.66865234374998 85.97389236764685 1030
54.122837706511156 63.962457657715774 1030
181.50626223091973 21.039881144231543 1030
6.219881889763779 5.925527321572695 1030
972.6539589442809 77.65912972100936 1030
773.6981499513134 79.89686334327651 1030
45.42731707317073 62.324091303247805 1030
35.81796116504851 16.705741961912505 1030


### Scaling the Data
We will scale the variables here so that they are easier to understand and compare, as they will all be on the same scale. It is better to scale such data before performing regression or any other machine learning algorithm that may follow.
Extra: I have made a simple plot to help understand the scaling - it will range between 0 and 1 using the MinMaxScaler from sklearn.

In [7]:
#fig, ax = plt.subplots()
def BasicGraph(x_variable, x_name, y_name = 'Compressive Stength (MPa)', y_variable = csMPa):
    """Scatter-plot ``y_variable`` against ``x_variable`` and show the figure.

    x_variable -- values for the horizontal axis
    x_name     -- label used for the horizontal axis and in the title
    y_name     -- label used for the vertical axis and in the title
    y_variable -- values for the vertical axis; the default is the ``csMPa``
                  array as it exists when this function is defined

    Returns None.
    """
    # Create the axes here: the function previously relied on an `ax` that was never defined.
    fig, ax = plt.subplots()
    ax.scatter(x_variable, y_variable, marker='o', s=1)
    ax.set_xlabel(f'{x_name}')
    ax.set_ylabel(f'{y_name}')
    ax.set_title(f'Relationship between {y_name} and {x_name}')
    plt.show()
    return

def MinMaxScaling(ivariable,ivariablename, dvariablename = 'csMPa', dvariable = csMPa):
    """Plot two variables against each other after min-max scaling each of them.

    ivariable     -- array plotted on the horizontal axis
    ivariablename -- label for the horizontal axis
    dvariablename -- label for the vertical axis
    dvariable     -- array plotted on the vertical axis; the default is the
                     ``csMPa`` array as it exists when this function is defined

    Each array is scaled on its own (the scaler is re-fitted for each one).
    Returns None.
    """
    scaler = MinMaxScaler()
    x_scaled = scaler.fit_transform(ivariable.reshape(-1,1))
    y_scaled = scaler.fit_transform(dvariable.reshape(-1,1))

    # Create the axes here: the function previously relied on an `ax` that was never defined.
    fig, ax = plt.subplots()
    ax.scatter(x_scaled, y_scaled, marker='o', s=10)
    ax.set_xlabel(f'{ivariablename}')
    ax.set_ylabel(f'{dvariablename}')
    ax.set_title(f'MinMax Scaling of Variables: {dvariablename} and {ivariablename}')
    plt.show()
    return 

def MinMaxScaling2(variable):
    """Return ``variable`` min-max scaled, as a column array of shape (n, 1)."""
    scaler = MinMaxScaler()
    return scaler.fit_transform(variable.reshape(-1,1))


def StandardScaling(ivariable,ivariablename, dvariablename = 'csMPa', dvariable = csMPa):
    """Plot two variables against each other after standard-scaling each of them.

    The body of this function used to sit, unreachable, after the ``return`` of
    ``MinMaxScaling2`` because its ``def`` line had been commented out.

    ivariable     -- array plotted on the horizontal axis
    ivariablename -- label for the horizontal axis
    dvariablename -- label for the vertical axis
    dvariable     -- array plotted on the vertical axis; the default is the
                     ``csMPa`` array as it exists when this function is defined

    Returns None.
    """
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(ivariable.reshape(-1,1))
    y_scaled = scaler.fit_transform(dvariable.reshape(-1,1))

    fig, ax = plt.subplots()
    ax.scatter(x_scaled, y_scaled, marker='o', s=10)
    ax.set_xlabel(f'{ivariablename}')
    ax.set_ylabel(f'{dvariablename}')
    ax.set_title(f'Standard Scaling of Variables: {dvariablename} and {ivariablename}')
    plt.show()
    return

# Min-max scale the output first, then the 'age' input; each call fits its own scaler.
csMPascaled = MinMaxScaling2(csMPa)
agescaled = MinMaxScaling2(age)
print(agescaled, csMPascaled)
#test2 = BasicGraph(flyash, variables[2])
#test3 = StandardScaling(slag, variables[1])

[[0.07417582]
 [0.07417582]
 [0.73901099]
 ...
 [0.07417582]
 [0.07417582]
 [0.07417582]] [[0.96748474]
 [0.74199576]
 [0.47265479]
 ...
 [0.26622649]
 [0.37922013]
 [0.37461069]]


### Building the Regression Model
Now that we got all the variables split into individual arrays (and more importantly scaled!), this will be processed such that:
1. Each input variable (named `dvariable` in the code below) and csMPa (the output variable) will be split into a training and a test set - the training ratio can change depending on variance vs bias.
2. We will go through each type of regression technique (Linear, Ridge, Lasso and ElasticNet) by calling .fit() on the training data - this will determine which regression technique is best. To describe the fit, .coef_ and .intercept_ are read from the fitted model. The fitted model is then evaluated on the test set with .predict() and .score().
3. To visually understand what we are dealing with, we will plot a demo graph to examine how well the model describes the data. We repeat this for each type of regression - ideally, the higher the .score() value and the better the spread of .predict(), the better the final representation when making the interactive graph.
 
To Note: The original x and y data (x and y) will be saved for future reference if they need to be used, e.g. plotting the graphs.


In [12]:
class RegressionModel:
    """Fit one regressor on a single variable and evaluate it on a hold-out slice.

    Naming follows the rest of the notebook: ``dvariable`` is the array fed to
    the model as input (X) and ``ivariable`` is the array being predicted (y).
    """

    def __init__(self, trainratio, regressor, dvariable, ivariable = csMPascaled):
        """Set up the inputs used by the methods below.

        trainratio -- fraction of the rows, counted from the start, used for training
        regressor  -- estimator instance providing fit / predict / score
        dvariable  -- model input, one value per row
        ivariable  -- values to predict; the default is the ``csMPascaled`` array
                      as it exists when this class is defined
        """
        self.dvariable = dvariable
        self.ivariable = ivariable
        self.trainratio = trainratio
        self.regressor = regressor

    def TrainandTest(self):
        """Split both arrays into a leading training part and a trailing test part.

        The split is positional (no shuffling): the first
        ``int(len(ivariable) * trainratio)`` rows are the training set.

        Returns (dtrain, dtest, itrain, itest) and stores them on the instance.
        Raises ValueError if the arrays differ in length or if either part
        would be empty.
        """
        n_total = len(self.ivariable)
        if len(self.dvariable) != n_total:
            raise ValueError(
                f'dvariable has {len(self.dvariable)} rows but ivariable has {n_total}')

        n_rows = int(n_total * self.trainratio)
        if not 0 < n_rows < n_total:
            raise ValueError(
                f'trainratio={self.trainratio!r} leaves an empty training or test set '
                f'({n_rows} of {n_total} rows would be used for training)')

        self.dtrain = self.dvariable[:n_rows]
        self.dtest = self.dvariable[n_rows:]
        self.itrain = self.ivariable[:n_rows]
        self.itest = self.ivariable[n_rows:]

        return self.dtrain, self.dtest, self.itrain, self.itest

    def Regression(self):
        """Fit the regressor on the training part and evaluate it on the test part.

        Training set: fit, coefficients and intercept.
        Test set: predictions and ``score`` (as returned by the regressor's
        own ``score`` method).

        Returns (fitted regressor, intercept, coefficients, predictions, score)
        and stores the last four on the instance.
        """
        self.TrainandTest()

        # The model input must be 2-D (rows x 1 feature) for fit / predict / score.
        x_train = np.asarray(self.dtrain).reshape(-1,1)
        x_test = np.asarray(self.dtest).reshape(-1,1)

        fit = self.regressor.fit(x_train, self.itrain)
        self.intercept = self.regressor.intercept_
        self.coefficients = self.regressor.coef_
        self.predictivity = self.regressor.predict(x_test)
        self.score = self.regressor.score(x_test, self.itest)

        return fit, self.intercept, self.coefficients, self.predictivity, self.score

    def DemoPlot(self, dname, iname = 'csMPa'):
        """Plot the test data together with the model's predictions.

        The actual test values are drawn as points and the predictions as a red
        line. ``Regression()`` is run first if it has not been called yet.

        dname -- label for the horizontal axis (the model input)
        iname -- label for the vertical axis (the predicted variable)

        Returns (fig, ax).
        """
        if not hasattr(self, 'predictivity'):
            self.Regression()

        x_test = np.asarray(self.dtest).ravel()
        y_test = np.asarray(self.itest).ravel()
        y_pred = np.asarray(self.predictivity).ravel()
        # Sort by x so the prediction line is drawn left to right instead of zigzagging.
        order = np.argsort(x_test)

        fig, ax = plt.subplots()
        ax.scatter(x_test, y_test, marker='o', s=10, label='test data')
        ax.plot(x_test[order], y_pred[order], color='red', linewidth=1, label='prediction')
        ax.set_xlabel(f'{dname}')
        ax.set_ylabel(iname)
        ax.set_title(f'Relationship between {iname} and {dname}, score = {self.score:.2f}')
        ax.legend()
        plt.show()
        return fig, ax
    
# Note: if the results are not convincing, changing one of the variables to a logarithmic scale may help.
# It all depends on the general trend shown by the regression model.
    
# Trial run: Lasso on the scaled 'age' input, training on the first 20% of the rows.
# NOTE(review): the recorded output shows a zero coefficient and constant predictions -
# consider a smaller Lasso alpha and/or a larger training ratio.
test5 = RegressionModel(trainratio=0.2, regressor=Lasso(), dvariable=agescaled).Regression()
print(test5)

(Lasso(), array([0.56840082]), array([0.]), array([0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082, 0.56840082, 0.56840082, 0.56840082, 0.56840082,
       0.56840082,

Now that the test and training sets are made, we can start applying our regressors to find which one is best for the model. To compare them, we will use .fit()/.predict(), followed by scoring with several metrics (explained variance, mean absolute error, mean squared error and, especially, the r2 score).

In [None]:
# create class to compare different regression techniques
class Regressor:
    """Container for analysing different metrics for a single regression class"""

    def __init__(self, regressmode, corrected_data, variablelist: list = None, trainratio: float = None, **kwargs):
        """Build the regressor, split the data, fit and predict.

        regressmode    -- regressor class; it is instantiated as ``regressmode(**kwargs)``
        corrected_data -- data passed on to ``trainandtest``
        variablelist   -- passed on to ``trainandtest``; must be provided
        trainratio     -- passed on to ``trainandtest``; must be provided
        **kwargs       -- keyword arguments for the regressor's constructor

        Raises TypeError if ``variablelist`` or ``trainratio`` is omitted.
        """
        # The previous defaults were the type objects `list` and `float`
        # (annotations written as default values), which are not usable data.
        if variablelist is None or trainratio is None:
            raise TypeError('Regressor() requires both variablelist and trainratio')

        # construct regressor object
        self.regressor = regressmode(**kwargs)

        # NOTE(review): `trainandtest` is not defined anywhere in the visible notebook
        # (RegressionModel.TrainandTest is a method with a different signature).
        # It has to exist as a module-level function returning the six values
        # unpacked below before this class can be used.
        self.X, self.y, self.X_train, self.y_train, self.X_test, self.y_test = trainandtest(corrected_data, variablelist, trainratio)

        # fit data
        self.regressor.fit(self.X_train, self.y_train)

        # get predicted data
        self.y_pred = self.regressor.predict(self.X_test)

    def metric(self, regressmode, **kwargs) -> float:
        """Return ``regressmode(self.y_test, self.y_pred, **kwargs)``.

        regressmode -- a metric function taking (y_true, y_pred), for example
                       one from ``sklearn.metrics``
        **kwargs    -- extra keyword arguments forwarded to the metric function
        """
        return regressmode(self.y_test, self.y_pred, **kwargs)

# Regressor classes to compare, and the metric functions used to score each of them.
ListofRegressors = [LinearRegression, Ridge, Lasso, ElasticNet, RandomForestRegressor]
MeasureScoreList = [explained_variance_score, mean_absolute_error, mean_squared_error, r2_score]

# Fit every regressor once, then report each metric on its test predictions.
for reg in ListofRegressors:

    # `variables` (lower case) is the column list defined next to `filename`;
    # the name `Variables` used here before does not exist.
    # NOTE(review): `b` was never defined either; `test.data` (the NaN-filled DataFrame)
    # is assumed to be the intended input - confirm.
    # NOTE(review): Regressor.__init__ calls `trainandtest`, which must be defined
    # before this cell can run.
    regressor = Regressor(reg, test.data, variables, 0.2)
    for metric in MeasureScoreList:
        score = regressor.metric(metric)
        # Label each number so the output can be read without counting lines.
        print(f'{reg.__name__} - {metric.__name__}: {score}')

# Error somewhere here - the loop is not working? Need help with diagnosing the problem. Maybe ask Copilot for assistance for further guidance.