In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [2]:
def SelectFeature(feature_candidates,features_used,targets,df):
    N = len(feature_candidates)
    R2 = np.zeros(N)
    for k in range(N):   
        features_current = features_used.copy()
        features_current.append(feature_candidates[k])
        features = df[features_current]
        results = cross_validate(lr,features,targets,n_jobs=-1)
        R2[k] = results['test_score'].mean()
        
    R2_max = R2.max()
    feature_selected = feature_candidates[R2.argmax()] 
    return (feature_selected,R2_max)

# Use cross-validation, forward feature selection and/or feature engineering to predict the compressive strength of concrete using the features contained in the file *Concrete_train.csv* in the following ways

## (A)
### Develop a regressor that stresses interpretability

In [5]:
df = pd.read_csv('./Concrete_train.csv')
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,233.81,0.0,94.58,197.89,4.567,947.04,852.16,28,22.835445
1,251.81,0.0,99.94,146.14,12.35,1006.0,899.76,56,44.140254
2,252.0,0.0,0.0,186.0,0.0,1111.0,784.0,7,11.465986
3,325.6,166.4,0.0,174.0,8.9,881.6,790.0,28,61.235811
4,250.0,0.0,95.69,191.84,5.33,948.9,857.2,14,24.655662


In [6]:
features = df.drop(df.columns[-1],axis=1)
targets  = df.iloc[:,-1]
features = (features - features.mean())/features.std()

In [7]:
lr = LinearRegression()
feature_candidates = list(features.columns)
features_used = []
R2 = []

while (len(feature_candidates) > 0):
    print('candidates left:',len(feature_candidates),end='')
    (feature_selected,R2_max) = SelectFeature(feature_candidates,features_used,targets,features)
    features_used.append(feature_selected)
    feature_candidates.remove(feature_selected)
    R2.append(R2_max)
    print('  selected:',feature_selected)
    
results = pd.DataFrame()
results['features'] = features_used
results['test R-squared'] = R2

print()
print(results)
ix = results['test R-squared'].idxmax()
print()
print('use features 0 to',ix,end='')
R2_max = results['test R-squared'].max()
print('    maximum test R-squared =',R2_max.round(3))

candidates left: 8



  selected: Cement (component 1)(kg in a m^3 mixture)
candidates left: 7  selected: Superplasticizer (component 5)(kg in a m^3 mixture)
candidates left: 6  selected: Age (day)
candidates left: 5  selected: Blast Furnace Slag (component 2)(kg in a m^3 mixture)
candidates left: 4  selected: Water  (component 4)(kg in a m^3 mixture)
candidates left: 3  selected: Fly Ash (component 3)(kg in a m^3 mixture)
candidates left: 2  selected: Coarse Aggregate  (component 6)(kg in a m^3 mixture)
candidates left: 1  selected: Fine Aggregate (component 7)(kg in a m^3 mixture)

                                            features  test R-squared
0          Cement (component 1)(kg in a m^3 mixture)        0.243398
1  Superplasticizer (component 5)(kg in a m^3 mix...        0.358201
2                                          Age (day)        0.477806
3  Blast Furnace Slag (component 2)(kg in a m^3 m...        0.547278
4          Water  (component 4)(kg in a m^3 mixture)        0.592691
5         Fly Ash

In [8]:
lr.fit(features[features_used],targets)
coef = pd.Series(lr.coef_,index=features.columns)
coef.sort_values(ascending=False)

Cement (component 1)(kg in a m^3 mixture)                13.159616
Water  (component 4)(kg in a m^3 mixture)                 9.806000
Fly Ash (component 3)(kg in a m^3 mixture)                7.056381
Coarse Aggregate  (component 6)(kg in a m^3 mixture)      6.446821
Age (day)                                                 2.136015
Fine Aggregate (component 7)(kg in a m^3 mixture)         2.117328
Blast Furnace Slag (component 2)(kg in a m^3 mixture)     1.644456
Superplasticizer (component 5)(kg in a m^3 mixture)      -3.031667
dtype: float64

#### We must drop the *Fine Aggregate* feature. We care about the amount of cement. We can see that the Amount of *Superplasticizer* has a negative effect on concrete strength (-3.031667).

## (B)
### Develop a regressor that stresses prediction accuracy.

In [9]:
features = df.drop(df.columns[-1],axis=1)
targets  = df.iloc[:,-1]
poly = PolynomialFeatures(2)
features_engineered = poly.fit_transform(features)
cols = poly.get_feature_names(features.columns)
features_engineered = pd.DataFrame(features_engineered,columns=cols)
features_engineered.head(3)
print('features shape =',features.shape)
print('features (engineered) shape =',features_engineered.shape)

features shape = (700, 8)
features (engineered) shape = (700, 45)


In [10]:
ix = (features_engineered.std() == 0)
drop_cols = features_engineered.columns[ix]
features_engineered = features_engineered.drop(drop_cols,axis=1)
print('features (engineered) shape =',features_engineered.shape)

features (engineered) shape = (700, 44)


In [11]:
features_engineered = (features_engineered - features_engineered.mean())/features_engineered.std()
features_engineered.describe()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),Cement (component 1)(kg in a m^3 mixture)^2,Cement (component 1)(kg in a m^3 mixture) Blast Furnace Slag (component 2)(kg in a m^3 mixture),...,Superplasticizer (component 5)(kg in a m^3 mixture)^2,Superplasticizer (component 5)(kg in a m^3 mixture) Coarse Aggregate (component 6)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture) Fine Aggregate (component 7)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture) Age (day),Coarse Aggregate (component 6)(kg in a m^3 mixture)^2,Coarse Aggregate (component 6)(kg in a m^3 mixture) Fine Aggregate (component 7)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture) Age (day),Fine Aggregate (component 7)(kg in a m^3 mixture)^2,Fine Aggregate (component 7)(kg in a m^3 mixture) Age (day),Age (day)^2
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,...,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,-4.726378e-16,8.754902e-17,-4.004733e-16,4.365793e-15,1.297375e-15,2.24541e-14,2.258194e-15,1.0071310000000001e-17,5.900042000000001e-17,-5.12923e-16,...,2.127663e-16,-9.44324e-16,-9.714451000000001e-17,-1.583257e-16,-9.766156e-15,3.610921e-15,-4.27991e-16,-1.035997e-15,-1.573345e-16,6.28862e-17
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.705731,-0.8519567,-0.8645892,-2.753839,-1.065575,-2.179938,-2.196943,-0.6991254,-1.204853,-0.8007647,...,-0.6282452,-1.088724,-1.0236,-0.6789082,-2.027543,-2.57652,-0.7047443,-2.016521,-0.7309118,-0.308366
25%,-0.8508288,-0.8519567,-0.8645892,-0.7748583,-1.065575,-0.5043832,-0.5498518,-0.5223177,-0.8045954,-0.8007647,...,-0.6282452,-1.088724,-1.0236,-0.6789082,-0.5339743,-0.6733841,-0.5134557,-0.5930749,-0.5315415,-0.2987839
50%,-0.1545651,-0.6182263,-0.8645892,0.1469845,0.086606,-0.05569256,0.06954236,-0.2972896,-0.2994707,-0.4859822,...,-0.2374297,0.08583567,0.02450493,-0.3516462,-0.09568767,0.06124728,-0.3085203,0.0172151,-0.3044342,-0.2686542
75%,0.6777092,0.839959,0.9587934,0.5118232,0.664376,0.7318183,0.6343291,0.1527665,0.5151788,0.5489637,...,0.2527996,0.6774174,0.5936573,0.2423668,0.7127135,0.7984935,0.1306575,0.6094322,0.1794333,-0.1481356
max,2.520774,3.348178,2.217792,2.99048,4.342621,2.083134,2.677981,5.119457,3.13697,3.258547,...,7.982392,3.853199,4.496183,8.085767,2.216101,2.210361,5.466905,3.037131,6.294158,6.517746


In [12]:
lr = LinearRegression()
feature_candidates = list(features_engineered.columns)
features_used = []
R2 = []

while (len(feature_candidates) > 0):
    print('candidates left:',len(feature_candidates),end='')
    (feature_selected,R2_max) = SelectFeature(feature_candidates,features_used,targets,features_engineered)
    features_used.append(feature_selected)
    feature_candidates.remove(feature_selected)
    R2.append(R2_max)
    print('  selected:',feature_selected)
    
results = pd.DataFrame()
results['features'] = features_used
results['test R-squared'] = R2

print()
print(results)
ix = results['test R-squared'].idxmax()
print()
print('use features 0 to',ix,end='')
R2_max = results['test R-squared'].max()
print('    maximum test R-squared =',R2_max.round(3))

candidates left: 44



  selected: Superplasticizer (component 5)(kg in a m^3 mixture) Age (day)
candidates left: 43  selected: Cement (component 1)(kg in a m^3 mixture)
candidates left: 42  selected: Blast Furnace Slag (component 2)(kg in a m^3 mixture) Coarse Aggregate  (component 6)(kg in a m^3 mixture)
candidates left: 41  selected: Fly Ash (component 3)(kg in a m^3 mixture) Age (day)
candidates left: 40  selected: Coarse Aggregate  (component 6)(kg in a m^3 mixture) Age (day)
candidates left: 39  selected: Water  (component 4)(kg in a m^3 mixture)
candidates left: 38  selected: Age (day)^2
candidates left: 37  selected: Cement (component 1)(kg in a m^3 mixture) Fly Ash (component 3)(kg in a m^3 mixture)
candidates left: 36  selected: Blast Furnace Slag (component 2)(kg in a m^3 mixture) Age (day)
candidates left: 35  selected: Blast Furnace Slag (component 2)(kg in a m^3 mixture) Superplasticizer (component 5)(kg in a m^3 mixture)
candidates left: 34  selected: Superplasticizer (component 5)(kg in a m^3

#### The engineered features increase the R-squared: 0.627 (from previous loop) to 0.796

In [13]:
lr.fit(features_engineered[features_used],targets)
coef = pd.Series(lr.coef_,index=features_engineered.columns)
coef.sort_values(ascending=False)

Fly Ash (component 3)(kg in a m^3 mixture) Superplasticizer (component 5)(kg in a m^3 mixture)                273.938489
Coarse Aggregate  (component 6)(kg in a m^3 mixture)                                                          260.490521
Blast Furnace Slag (component 2)(kg in a m^3 mixture)                                                         252.501233
Coarse Aggregate  (component 6)(kg in a m^3 mixture)^2                                                        223.032798
Water  (component 4)(kg in a m^3 mixture) Fine Aggregate (component 7)(kg in a m^3 mixture)                   136.899941
Fine Aggregate (component 7)(kg in a m^3 mixture) Age (day)                                                   116.770381
Blast Furnace Slag (component 2)(kg in a m^3 mixture) Coarse Aggregate  (component 6)(kg in a m^3 mixture)     35.257012
Blast Furnace Slag (component 2)(kg in a m^3 mixture)^2                                                        15.592522
Blast Furnace Slag (component 2)