In [283]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from ucimlrepo import fetch_ucirepo 


from sklearn.metrics import r2_score 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

In [263]:
# fetch dataset 
concrete_compressive_strength = fetch_ucirepo(id=165) 


In [264]:
# data (as pandas dataframes) 
X = concrete_compressive_strength.data.features 
y = concrete_compressive_strength.data.targets 

type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

In [265]:
# Metadata and variable info 
print(concrete_compressive_strength.metadata)  
print(concrete_compressive_strength.variables)  

{'uci_id': 165, 'name': 'Concrete Compressive Strength', 'repository_url': 'https://archive.ics.uci.edu/dataset/165/concrete+compressive+strength', 'data_url': 'https://archive.ics.uci.edu/static/public/165/data.csv', 'abstract': 'Concrete is the most important material in civil engineering. The concrete compressive strength is a highly nonlinear function of age and ingredients. ', 'area': 'Physics and Chemistry', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 1030, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Concrete compressive strength'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Sun Feb 11 2024', 'dataset_doi': '10.24432/C5PK67', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'ID': 383, 'type': 'NATIVE', 'title': 'Modeling of strength of high-performance concrete using artificial neural networks', 'authors': 'I. Yeh', 'venue': 'C

In [266]:
entire_data = pd.concat([X,y], axis=1)
entire_data

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.30
...,...,...,...,...,...,...,...,...,...
1025,276.4,116.0,90.3,179.6,8.9,870.1,768.3,28,44.28
1026,322.2,0.0,115.6,196.0,10.4,817.9,813.4,28,31.18
1027,148.5,139.4,108.6,192.7,6.1,892.4,780.0,28,23.70
1028,159.1,186.7,0.0,175.6,11.3,989.6,788.9,28,32.77


In [267]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24)
X_train.shape, y_train.shape

((721, 8), (721, 1))

# linear regression

In [268]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.intercept_, lr.coef_


(array([-35.44918333]),
 array([[ 0.12726535,  0.11257926,  0.09835135, -0.13691381,  0.23733402,
          0.02020436,  0.02616505,  0.12251105]]))

# Ridge Regression (L2 Norm)

In [269]:
rr = Ridge()
rr.fit(X_train, y_train)
rr.intercept_, lr.coef_

(array([-35.44730826]),
 array([[ 0.12726535,  0.11257926,  0.09835135, -0.13691381,  0.23733402,
          0.02020436,  0.02616505,  0.12251105]]))

### With Polynomial Features

In [270]:
ploy  = PolynomialFeatures(degree=3, include_bias=False).set_output(transform='pandas')

In [271]:
# degree: 2
poly = PolynomialFeatures(degree=2, include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)
rr.fit(X_poly_trn, y_train)
y_pred = rr.predict(X_poly_tst)
r2_score(y_test, y_pred)

0.7787728514693077

In [272]:
# degree: 3
poly = PolynomialFeatures(degree=3, include_bias=False).set_output(transform='pandas')
X_poly_trn = poly.fit_transform(X_train)
X_poly_tst = poly.transform(X_test)
rr.fit(X_poly_trn, y_train)
y_pred = rr.predict(X_poly_tst)
r2_score(y_test, y_pred)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


0.8695065063940556

In [273]:
print(X_poly_trn)

     Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
538   480.0                 0.0      0.0  192.0               0.0   
434   178.0               129.8    118.6  179.9               3.6   
454   250.0                 0.0     95.7  191.8               5.3   
804   393.0                 0.0      0.0  192.0               0.0   
518   202.0                11.0    141.0  206.0               1.7   
..      ...                 ...      ...    ...               ...   
145   469.0               117.2      0.0  137.8              32.2   
343   297.2                 0.0    117.5  174.8               9.5   
192   233.8                 0.0     94.6  197.9               4.6   
899   145.0                 0.0    134.0  181.0              11.0   
418   251.4                 0.0    118.3  192.9               5.8   

     Coarse Aggregate  Fine Aggregate    Age   Cement^2  \
538             936.2           712.2    7.0  230400.00   
434            1007.3           746.8   28.0   31684.

In [274]:
df_coef = pd.DataFrame({'col_names': list(X_poly_trn.columns), 'coef':list(rr.coef_[0])})
print(df_coef.shape)
df_coef[df_coef['coef'] > 0.0001]

(164, 2)


Unnamed: 0,col_names,coef
1,Blast Furnace Slag,0.636144
5,Coarse Aggregate,2.612272
6,Fine Aggregate,0.909094
7,Age,0.123618
8,Cement^2,0.010445
9,Cement Blast Furnace Slag,0.031224
10,Cement Fly Ash,0.008666
14,Cement Fine Aggregate,0.023399
16,Blast Furnace Slag^2,0.004927
17,Blast Furnace Slag Fly Ash,0.015132


In [275]:
# Different alpha values
rr = Ridge(alpha=2)
rr.fit(X_train, y_train)
y_pred = rr.predict(X_test)
r2_score(y_test, y_pred)

0.5771745422643093

**Hyperparameter** is a setting in a machine learning model that is not learned from data but set before training (e.g., learning rate, tree depth). 

**Hyperparameter tuning** is the process of selecting the best values for these settings to improve model performance.

Tuning fir alpha = [0.01, 0.1, 0.3, 0.6, 1.0, 3.10]

In [276]:
alphas = [0.01, 0.1, 0.3, 0.6, 1, 1.5, 2, 4, 10]
scores = []
for a in alphas:
    rr = Ridge(alpha=a)
    rr.fit(X_train, y_train)
    y_pred = rr.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

print(scores)

idxmax = np.argmax(scores)

print("Best alpha: ",alphas[idxmax])
print("Best score: ",scores[idxmax])

[0.5771752740273375, 0.5771752409296148, 0.577175167380108, 0.5771750570584018, 0.5771749099675627, 0.5771747261116779, 0.5771745422643093, 0.5771738069600132, 0.5771716018651835]
Best alpha:  0.01
Best score:  0.5771752740273375


Hyperparameters optimization

In [277]:
alphas = np.linspace(0.0001, 10, 20)  
scores = []
for a in alphas:
    rr = Ridge(alpha=a)
    rr.fit(X_train, y_train)
    y_pred = rr.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

idxmax = np.argmax(scores)

print("Best alpha: ",alphas[idxmax])
print("Best score: ",scores[idxmax])

Best alpha:  0.0001
Best score:  0.5771752776681036
 0.0001
Best score:  0.5771752776681036


# Lasso Regression (L1 Norm)

In [279]:
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_score(y_test, y_pred)

0.576333587787164

Hyper parameters optimization

In [280]:
alphas = np.linspace(0.0001, 10, 20)  
scores = []
for a in alphas:
    lassor = Lasso(alpha=a)
    lassor.fit(X_train, y_train)
    y_pred = lassor.predict(X_test)
    scores.append(r2_score(y_test, y_pred))

idxmax = np.argmax(scores)

print("Best alpha: ",alphas[idxmax])
print("Best score: ",scores[idxmax])

Best alpha:  0.0001
Best score:  0.5771752162488388


# Elastic Regression

In [284]:
er = ElasticNet()
er.fit(X_train, y_train)
y_pred = er.predict(X_test)
r2_score(y_test, y_pred)

0.5766806310401154

In [291]:
alphas = np.linspace(0.0001, 10, 20)  
l1 = np.linspace(0.0001, 1, 10)
scores = []
for a in alphas:
    for l in l1:
        er = ElasticNet(alpha=a, l1_ratio=l)
        er.fit(X_train, y_train)
        y_pred = er.predict(X_test)
        scores.append({
            'alpha': a,
            'l1_ratio': l,
            'score': r2_score(y_test, y_pred)
        })

df_scores = pd.DataFrame(scores)
df_scores.sort_values('score', ascending=False)
df_scores


Unnamed: 0,alpha,l1_ratio,score
0,0.0001,0.0001,0.577175
1,0.0001,0.1112,0.577175
2,0.0001,0.2223,0.577175
3,0.0001,0.3334,0.577175
4,0.0001,0.4445,0.577175
...,...,...,...
195,10.0000,0.5556,0.572413
196,10.0000,0.6667,0.572674
197,10.0000,0.7778,0.572891
198,10.0000,0.8889,0.573061
