In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from seaborn import regplot
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def SelectFeature(feature_candidates,features_used,targets,df):
    N = len(feature_candidates)
    R2 = np.zeros(N)
    for k in range(N):   
        features_current = features_used.copy()
        features_current.append(feature_candidates[k])
        features = df[features_current]
        results = cross_validate(lr,features,targets,n_jobs=-1)
        R2[k] = results['test_score'].mean()
        
    R2_max = R2.max()
    feature_selected = feature_candidates[R2.argmax()] 
    return (feature_selected,R2_max)

In [3]:
df = pd.read_csv("./bacteria.csv")
print(df.shape)
df.head()

(100, 4)


Unnamed: 0,temperature,humidity,surface,CFU
0,23.2,0.7,rough,33.0
1,21.7,0.7,rough,15.0
2,15.1,0.5,smooth,28.0
3,30.8,0.2,rough,17.0
4,11.2,0.5,rough,14.0


In [4]:
features = df.drop('CFU',axis=1)
features.head(3)

Unnamed: 0,temperature,humidity,surface
0,23.2,0.7,rough
1,21.7,0.7,rough
2,15.1,0.5,smooth


In [5]:
features = pd.get_dummies(features)

In [6]:
features.head(3)

Unnamed: 0,temperature,humidity,surface_rough,surface_smooth
0,23.2,0.7,1,0
1,21.7,0.7,1,0
2,15.1,0.5,0,1


In [7]:
target = df.CFU

In [8]:
target.head(3)

0    33.0
1    15.0
2    28.0
Name: CFU, dtype: float64

In [9]:
#features = (features - features.mean()) / features.std()
lr = LinearRegression()
poly = PolynomialFeatures(2)
features_engineered = poly.fit_transform(features)
cols = poly.get_feature_names(features.columns)
features_engineered = pd.DataFrame(features_engineered,columns=cols)
features_engineered.head(3)
print('features shape =',features.shape)
print('features (engineered) shape =',features_engineered.shape)

features shape = (100, 4)
features (engineered) shape = (100, 15)


In [10]:
ix = (features_engineered.std() == 0)
drop_cols = features_engineered.columns[ix]
features_engineered = features_engineered.drop(drop_cols,axis=1)
print('features (engineered) shape =',features_engineered.shape)

features (engineered) shape = (100, 13)


In [11]:
features_engineered = (features_engineered - features_engineered.mean())/features_engineered.std()
features_engineered.describe()

Unnamed: 0,temperature,humidity,surface_rough,surface_smooth,temperature^2,temperature humidity,temperature surface_rough,temperature surface_smooth,humidity^2,humidity surface_rough,humidity surface_smooth,surface_rough^2,surface_smooth^2
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,9.325873e-17,1.534328e-15,1.287859e-16,-1.287859e-16,-5.77316e-17,7.804868e-16,-3.663736e-17,-8.881784e-18,-4.440892e-18,-1.7763570000000002e-17,1.3322680000000001e-17,1.287859e-16,-1.287859e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.617928,-1.901657,-1.01509,-0.9752828,-1.180563,-1.442449,-0.8580065,-0.7971138,-1.338537,-0.8945165,-0.8241727,-1.01509,-0.9752828
25%,-0.8739583,-0.5181764,-1.01509,-0.9752828,-0.8897355,-0.7547051,-0.8580065,-0.7971138,-0.6500425,-0.8945165,-0.8241727,-1.01509,-0.9752828
50%,-0.04759467,0.1106784,0.9752828,-0.9752828,-0.2709141,-0.2964492,-0.3822388,-0.7971138,-0.09145281,-0.3534783,-0.8241727,0.9752828,-0.9752828
75%,0.9629438,1.116846,0.9752828,1.01509,0.9092876,0.69488,0.7859064,0.7820739,1.155631,0.9089442,0.6091712,0.9752828,1.01509
max,1.75538,1.61993,0.9752828,1.01509,2.160652,2.473949,2.081697,2.195823,1.935058,1.991021,2.042515,0.9752828,1.01509


In [12]:
lr = LinearRegression()
feature_candidates = list(features_engineered.columns)
features_used = []
R2 = []

while (len(feature_candidates) > 0):
    print('candidates left:',len(feature_candidates),end='')
    (feature_selected,R2_max) = SelectFeature(feature_candidates,features_used,target,features_engineered)
    features_used.append(feature_selected)
    feature_candidates.remove(feature_selected)
    R2.append(R2_max)
    print('  selected:',feature_selected)
    
results = pd.DataFrame()
results['features'] = features_used
results['test R-squared'] = R2

print()
print(results)
ix = results['test R-squared'].idxmax()
print()
print('use features 0 to',ix,end='')
R2_max = results['test R-squared'].max()
print('    maximum test R-squared =',R2_max.round(3))

candidates left: 13  selected: temperature surface_smooth
candidates left: 12  selected: temperature surface_rough
candidates left: 11  selected: surface_rough
candidates left: 10  selected: surface_smooth
candidates left: 9  selected: temperature
candidates left: 8  selected: surface_rough^2
candidates left: 7  selected: surface_smooth^2
candidates left: 6  selected: temperature^2
candidates left: 5  selected: temperature humidity
candidates left: 4  selected: humidity^2
candidates left: 3  selected: humidity surface_rough
candidates left: 2  selected: humidity surface_smooth
candidates left: 1  selected: humidity

                      features  test R-squared
0   temperature surface_smooth        0.438293
1    temperature surface_rough        0.521104
2                surface_rough        0.618322
3               surface_smooth        0.618322
4                  temperature        0.618322
5              surface_rough^2        0.618322
6             surface_smooth^2        0.618322


In [13]:
features = (features - features.mean()) / features.std()
lr.fit(features,target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [14]:
R2 = lr.score(features, target)
print('R-squared = ', round(R2,3))

R-squared =  0.671


### The features engineered does not increase the R-squared value. 0.671 > 0.618. So, I will use the normal features, rather than engineered ones

In [15]:
coef = pd.DataFrame()
coef['feature'] = features.columns
coef['coef'] = lr.coef_.round(2)
coef.sort_values('coef',ascending=False)

Unnamed: 0,feature,coef
0,temperature,9.99
3,surface_smooth,4.63
1,humidity,0.62
2,surface_rough,-4.63


### From here, we see for every one CFU, temperature increases by almost 10 units, and humidity goes up 0.62 units. As you increase CFU, the likelihood that the surface is smooth increases, and likewise the likelihood that the surface is rough decreases

In [16]:
df = pd.read_csv("./bacteria_predict.csv")
df.head(3)

Unnamed: 0,temperature,humidity,surface,CFU
0,18.2,0.5,smooth,0
1,7.3,0.4,smooth,0
2,6.0,0.2,smooth,0


In [17]:
features = df.drop('CFU',axis=1)
features = pd.get_dummies(features)
features = (features - features.mean()) / features.std()

In [18]:
predictions = lr.predict(features)

In [19]:
df['CFU'] = predictions
df

Unnamed: 0,temperature,humidity,surface,CFU
0,18.2,0.5,smooth,38.170834
1,7.3,0.4,smooth,26.683532
2,6.0,0.2,smooth,24.739695
3,26.5,0.1,rough,27.033898
4,30.6,0.2,smooth,49.976615
5,16.9,0.4,smooth,36.532086
6,27.0,0.1,rough,27.546844
7,31.1,0.5,smooth,51.404829
8,8.9,0.6,rough,10.503663
9,9.0,0.8,rough,11.216431


In [21]:
df.to_csv("bactera_rileyma.csv",index=False)
df.head(3)

Unnamed: 0,temperature,humidity,surface,CFU
0,18.2,0.5,smooth,38.170834
1,7.3,0.4,smooth,26.683532
2,6.0,0.2,smooth,24.739695
