In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\ChemicalProcess.csv")
df.head(10)

Unnamed: 0,Yield,BiologicalMaterial01,BiologicalMaterial02,BiologicalMaterial03,BiologicalMaterial04,BiologicalMaterial05,BiologicalMaterial06,BiologicalMaterial07,BiologicalMaterial08,BiologicalMaterial09,...,ManufacturingProcess36,ManufacturingProcess37,ManufacturingProcess38,ManufacturingProcess39,ManufacturingProcess40,ManufacturingProcess41,ManufacturingProcess42,ManufacturingProcess43,ManufacturingProcess44,ManufacturingProcess45
0,38.0,6.25,49.58,56.97,12.74,19.51,43.73,100.0,16.66,11.44,...,0.019,0.5,3,7.2,,,11.6,3.0,1.8,2.4
1,42.44,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.019,2.0,2,7.2,0.1,0.15,11.1,0.9,1.9,2.2
2,42.03,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,0.7,2,7.2,0.0,0.0,12.0,1.0,1.8,2.3
3,41.42,8.01,60.97,67.48,14.65,19.36,53.14,100.0,19.04,12.55,...,0.018,1.2,2,7.2,0.0,0.0,10.6,1.1,1.8,2.1
4,42.49,7.47,63.33,72.25,14.02,17.91,54.66,100.0,18.22,12.8,...,0.017,0.2,2,7.3,0.0,0.0,11.0,1.1,1.7,2.1
5,43.57,6.12,58.36,65.31,15.17,21.79,51.23,100.0,18.3,12.13,...,0.018,0.4,2,7.2,0.0,0.0,11.5,2.2,1.8,2.0
6,43.12,7.48,64.47,72.41,13.82,17.71,54.45,100.0,18.72,12.95,...,0.019,0.8,2,7.3,0.0,0.0,11.7,0.7,2.0,2.2
7,43.06,6.94,63.6,72.06,15.7,19.42,54.72,100.0,18.85,13.13,...,0.019,1.0,2,7.3,0.0,0.0,11.4,0.8,2.0,2.2
8,41.49,6.94,63.6,72.06,15.7,19.42,54.72,100.0,18.85,13.13,...,0.019,1.2,3,7.4,0.0,0.0,11.4,0.9,1.9,2.1
9,42.45,6.94,63.6,72.06,15.7,19.42,54.72,100.0,18.85,13.13,...,0.019,1.8,3,7.1,0.0,0.0,11.3,0.8,1.9,2.4


In [4]:
X = df.drop(['Yield'], axis=1)
y = df['Yield']

Using  Pipeline to poerform operation

In [5]:
mean_imputer = SimpleImputer(strategy='mean').set_output(transform='pandas')
linear_model = LinearRegression()

linear_pipe = Pipeline([
    ('imputer', mean_imputer),
    ('model', linear_model)
])

scores = cross_val_score(linear_pipe, X, y, cv=5)

print(f'Mean score: {scores.mean()}')

Mean score: -94.5774859579349


In [6]:
median_imputer = SimpleImputer(strategy='median').set_output(transform='pandas')
linear_model = LinearRegression()

linear_pipe = Pipeline([
    ('imputer', median_imputer),
    ('model', linear_model)
])

scores = cross_val_score(linear_pipe, X, y, cv=5)

print(f'mean score: {scores.mean()}')

mean score: -86.99645845798084


Combining it into short code

In [7]:
imputer = SimpleImputer()
linear_model = LinearRegression()
pipe_lr = Pipeline([
    ('imputer', imputer),
    ('model', linear_model)
])
pipe_lr.get_params()

{'memory': None,
 'steps': [('imputer', SimpleImputer()), ('model', LinearRegression())],
 'verbose': False,
 'imputer': SimpleImputer(),
 'model': LinearRegression(),
 'imputer__add_indicator': False,
 'imputer__copy': True,
 'imputer__fill_value': None,
 'imputer__keep_empty_features': False,
 'imputer__missing_values': nan,
 'imputer__strategy': 'mean',
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__n_jobs': None,
 'model__positive': False}

In [8]:
pipe_lr = Pipeline([
    ('imputer', SimpleImputer()),
    ('model', LinearRegression())
])
pramas = {
    'imputer__strategy': ['mean', 'median'],
}
gcv = GridSearchCV(pipe_lr, pramas)
gcv.fit(X, y)
print(f'Best parameters: \n{gcv.best_params_}')
print(f'Best score: {gcv.best_score_}')


Best parameters: 
{'imputer__strategy': 'median'}
Best score: -86.99645845798084


In [10]:
pipe_lr = Pipeline([
    ('imputer', SimpleImputer()),
    ('model', ElasticNet())
])
pramas = {
    'imputer__strategy': ['mean', 'median'],
    'model__alpha': np.linspace(0.0001, 10, 20),
    'model__l1_ratio': np.linspace(0.0001, 1, 10)
}
gcv = GridSearchCV(pipe_lr, pramas)
gcv.fit(X, y)
print(f'Best parameters: \n{gcv.best_params_}')
print(f'Best score: {gcv.best_score_}')

Best parameters: 
{'imputer__strategy': 'median', 'model__alpha': 10.0, 'model__l1_ratio': 1.0}
Best score: -1.55172620877767


Grid View

In [12]:
results = pd.DataFrame(gcv.cv_results_)
results.shape

(400, 16)

Exporting the DataFrame to a CSV file

In [13]:
results.to_csv('KFoldCrossValScoreOnChemicalDatasetResults.csv')