In [45]:
import pandas as pd
from sklearn.externals import joblib
import json
import seaborn as sns
import numpy as np

In [46]:
df = pd.read_csv("data/recipe_1_out_reg.csv", index_col=0)

In [80]:
val = pd.read_csv("data/val_rec1.csv", index_col=0)

In [81]:
df=df.dropna()

## Moisture models

In [82]:
cols_to_fit = ['steam_preasure',
 'dd_speed',
 'temp_out',
 'particles_grp1',
 'particles_grp2',
 'particles_grp3',
 'water_correction']

## Mean model

In [83]:
df.columns

Index(['line_dd', 'process_order', 'testing_time', 'moisture_x',
       'bulk_density', 'orders_details_id_y', 'bigbag_number',
       'bigbag_filling_time_end', 'sifter_speed_nominal_pct',
       'orders_details_id_x', 'steam_preasure', 'dd_speed', 'temp_out',
       'orders_details_id_y.1', 'slurry_process_order', 'slurry_line',
       'slurry_start_time', 'water_pct', 'water_correction', 'Unnamed: 0.1',
       'id', 'particles_grp1', 'particles_grp2', 'particles_grp3',
       'moisture_y', 'usage_pct', 'elems'],
      dtype='object')

In [84]:
(((df['moisture_x'] - df['moisture_x'].mean())**2).mean())

0.2037574759798959

#### from xgboost import XGBRegressor

In [85]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
import sklearn
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [86]:
xgb = XGBRegressor(booster='gblinear')

In [87]:
rf = RandomForestRegressor(n_estimators=50, max_depth=3, min_samples_split=3,
                           min_samples_leaf=2)

In [88]:
rf.fit(df[cols_to_fit], df['moisture_x'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [89]:
xgb.fit(df[cols_to_fit],df['moisture_x'])

XGBRegressor(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [90]:
mean_squared_error(df['moisture_x'], rf.predict(df[cols_to_fit]))

0.1301482892866359

In [91]:
val['moisture_pred'] = rf.predict(val[cols_to_fit])

In [93]:
lr = LinearRegression()

In [94]:
rf.fit(df[cols_to_fit], df['moisture_x'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [95]:
joblib.dump(rf, "recepta_1_model_moisture.h5")

['recepta_1_model_moisture.h5']

In [96]:
with open("recepta_1_cols_moisture.json", "w") as f:
    json.dump(cols_to_fit, f)

In [20]:
from sklearn.feature_selection import RFE

In [21]:
rfe = RFE(rf, 7)

In [22]:
rfe.fit(df[cols_to_fit], df['moisture_x'])

RFE(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
  n_features_to_select=7, step=1, verbose=0)

In [316]:
cols_to_fit2 = df[cols_to_fit].columns[rfe.support_]

In [331]:
list(cols_to_fit2) + ['water_correction']

['steam_pressure',
 'dd_speed',
 'temp_out',
 'fat_pct',
 'particles_grp1',
 'particles_grp2',
 'particles_grp3',
 'water_correction']

In [340]:
-cross_val_score(xgb, df[cols_to_fit2], df['moisture_x'], cv = 4, scoring="neg_mean_squared_error").mean()

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


0.39129908100128585

## Bulk density

In [97]:
(((df['bulk_density'] - df['bulk_density'].mean())**2).mean())

71.52337483202207

In [98]:
rf = RandomForestRegressor(n_estimators=30, max_depth=2, min_samples_split=3,
                           min_samples_leaf=2)

In [99]:
-cross_val_score(rf, df[cols_to_fit], df['bulk_density'], cv = 3, scoring="neg_mean_squared_error").mean()

89.26273380975556

In [100]:
rf.fit(df[cols_to_fit], df['bulk_density'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [101]:
val['bulk_density_pred'] = rf.predict(val[cols_to_fit])

In [104]:
val[['moisture_pred', 'bulk_density_pred', 'orders_details_id_x', 'bigbag_number']].to_csv("val_1_rec1.csv")

In [None]:
val

In [103]:
val['orders_details_id_x'].unique()

array([22, 24, 11, 23], dtype=int64)

In [77]:
val[val['orders_details_id_x'].isin([11,22,23,24])]

Unnamed: 0,line_dd,process_order,testing_time,moisture_x,bulk_density,orders_details_id_y,bigbag_number,bigbag_filling_time_end,sifter_speed_nominal_pct,orders_details_id_x,...,Unnamed: 0.1,id,particles_grp1,particles_grp2,particles_grp3,moisture_y,usage_pct,elems,moisture_pred,bulk_density_pred


In [55]:
joblib.dump(rf, "recepta_1_model_bulk_density.h5")

['recepta_1_model_bulk_density.h5']

In [56]:
with open("recepta_1_cols_bulk_density.json", "w") as f:
    json.dump(cols_to_fit, f)

In [433]:
rf.feature_importances_

array([0.09060306, 0.01664019, 0.32474186, 0.03618916, 0.01480306,
       0.        , 0.10248593, 0.18750827, 0.01565855, 0.15166917,
       0.05970076, 0.        ])