<h1> Concrete Strength, Model Selection, and Error Estimation</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from joblib import dump

In [13]:
#read CSV file for concrete
df = pd.read_csv("./ai1/datasets/dataset_concrete.csv")

#shuffle dataset and reorder it
df = df.sample(frac = 1, random_state = 5)
df.reset_index(drop = True, inplace = True)

#the qualities needed to predict strength
qualities = ["cement", "slag", "fly_ash", "water", "superplasticizer", "coarse_aggregate", "fine_aggregate", "age"]

In [39]:
#split data into development set and the test set
dev_df, test_df = train_test_split(df, train_size = 0.8, random_state = 4)

In [15]:
#extract qualities
dev_X = dev_df[qualities]
test_X = test_df[qualities]

dev_Y = dev_df["strength"].values
test_Y = test_df["strength"].values

In [26]:
#split further to get the validation dataset
ss = ShuffleSplit(n_splits = 1, train_size = 0.75, random_state = 3)

#nmake the preprocessor
preprocessor = ColumnTransformer([
    ("scaler", StandardScaler(), qualities)],
    remainder = "passthrough")

#knn pipeline model
knn_model = Pipeline ([
    ("preprocessor", preprocessor),
    ("predictor", KNeighborsRegressor())])

#grid search method to try multiple hyperparameters
knn_grid = {"predictor__n_neighbors" : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

#create grid search object to find the best hyperparameter
grid_search = GridSearchCV(knn_model, knn_grid, scoring = "neg_mean_absolute_error")

#run it
grid_search.fit(dev_X, dev_Y)

In [27]:
#find the best hyperparam and the score using this param
grid_search.best_params_, grid_search.best_score_

({'predictor__n_neighbors': 1}, -6.776850258684405)

In [40]:
#linear model
linear_model = Pipeline([
    ("preprocessor", preprocessor),
    ("predictor", LinearRegression())])

#error estimate for linear, used k fold because its small
np.mean(cross_val_score(linear_model, dev_X, dev_Y, scoring="neg_mean_absolute_error", cv=10))

-8.024851773324496

<p> The knn produced something I would show my boss</p>

In [36]:
#test sets!!
mean_absolute_error(test_Y, grid_search.predict(test_X))

7.40752427184466

In [41]:
linear_model.fit(dev_X, dev_Y)
mean_absolute_error(test_Y, linear_model.predict(test_X))

9.3230976870831

<h1>Deployment</h1>

In [None]:
grid_search.fit(X, Y)
#if x and y represeant whole dataset