In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.externals import joblib

In [9]:
df = pd.read_csv("recipe_1_out.csv", index_col=0)

## Mean model

In [11]:
df['class'].value_counts()

below      439
optimal     85
above       12
Name: class, dtype: int64

In [12]:
(df['class'] == df['class'].shift(1)).mean()

0.6884328358208955

In [13]:
np.sqrt(((df['efficiency'] - df['efficiency'].mean())**2).mean())

10.99868088219504

## Regresja liniowa

In [25]:
cols_to_fit = ['sifter_speed_nominal_pct', 'water_correction', 'steam_preasure', 'dd_speed', 'temp_out', 
               'water_pct', 'particles_grp1', 'particles_grp2', 'particles_grp3', 'moisture', 'elems']

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor

In [27]:
df.columns

Index(['bigbag_filling_duration', 'bigbag_weight', 'efficiency', 'class',
       'sifter_speed_nominal_pct', 'steam_preasure', 'dd_speed', 'temp_out',
       'water_pct', 'water_correction', 'id', 'particles_grp1',
       'particles_grp2', 'particles_grp3', 'moisture', 'usage_pct', 'elems'],
      dtype='object')

In [28]:
df.columns

Index(['bigbag_filling_duration', 'bigbag_weight', 'efficiency', 'class',
       'sifter_speed_nominal_pct', 'steam_preasure', 'dd_speed', 'temp_out',
       'water_pct', 'water_correction', 'id', 'particles_grp1',
       'particles_grp2', 'particles_grp3', 'moisture', 'usage_pct', 'elems'],
      dtype='object')

In [29]:
rf = RandomForestRegressor(n_estimators=10, max_depth=1, min_samples_split=3,
                           min_samples_leaf=2)

In [30]:
svc = SVC(gamma="auto",kernel="rbf", C=0.01, degree=2)

In [31]:
lr = LogisticRegression(solver='lbfgs', max_iter=10000, multi_class="auto", C=10)

In [32]:
df['class'] = df['class'].astype('category').cat.codes

In [33]:
lr.fit(df[cols_to_fit], df['class'])

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [34]:
svc.fit(df[cols_to_fit], df['class'])

SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
rf.fit(df[cols_to_fit], df['efficiency'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=1,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [39]:
#df['preds_stack'] = (lr.predict_proba(df[cols_to_fit]) + rf.predict_proba(df[cols_to_fit])).argmax(axis=1)
df['preds'] = rf.predict(df[cols_to_fit])

In [40]:
from sklearn.metrics import confusion_matrix

In [41]:
sns.heatmap(confusion_matrix(df['class'], df['preds']),annot=True )

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [43]:
rf.fit(df[cols_to_fit], df['efficiency'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=1,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [57]:
cross_val_score(rf, df[cols_to_fit], df['efficiency'], cv = 5).mean()

-0.07920925801160036

In [58]:
joblib.dump(rf, "recepta_1_model_efficiency.h5")

['recepta_1_model_efficiency.h5']

In [59]:
import json

In [60]:
with open("recepta_1_cols_eff.json", "w") as f:
    json.dump(cols_to_fit, f)