In [1]:
import pandas as pd
from sklearn.svm import SVR
from tqdm import tqdm_notebook as tqdm
import numpy as np
from sklearn.metrics import r2_score,mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler
import warnings;warnings.simplefilter('ignore')
from sklearn.pipeline import Pipeline
import joblib

In [2]:
def mape(a, b):
    """Mean absolute percentage error of ``b`` against the reference ``a``.

    Parameters
    ----------
    a : array-like
        Reference (true) values. Entries equal to zero are excluded because
        the relative error is undefined there.
    b : array-like
        Predicted values, broadcastable against ``a``.

    Returns
    -------
    float
        Mean of ``|a - b| / |a|`` over the non-zero entries of ``a``, expressed
        as a fraction (not multiplied by 100). ``nan`` when ``a`` has no
        non-zero entry.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    mask = a != 0
    if not mask.any():
        # Nothing to average; return nan explicitly rather than via an
        # empty-slice mean warning.
        return float('nan')
    # Mask before dividing so zero references never reach the division, and
    # divide by |a| so negative references still yield a non-negative error.
    return (np.abs(a - b)[mask] / np.abs(a[mask])).mean()

def metric(y_t,y_p,name):
  """Return a one-row DataFrame of scores for a single target.

  The row is labelled ``name`` and has the columns ``R2`` (clipped below at 0),
  ``RMSE`` (square root of the mean squared error) and ``MAPE``.
  """
  r2 = max(r2_score(y_t, y_p), 0)
  rmse = sqrt(mean_squared_error(y_t, y_p))
  scores = {'R2': r2, 'RMSE': rmse, 'MAPE': mape(y_t, y_p)}
  return pd.DataFrame(scores, index=[name])

# load data

In [3]:
def load_data(train_path='../data/phase_1/train_4565.csv',
              test_path='../data/phase_1/test_170.csv'):
  """Read the train/test CSV files and split them into features and targets.

  Parameters
  ----------
  train_path, test_path : str
      Locations of the CSV files. The first column of each file is used as
      the index. The defaults are the paths this notebook was written against.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(X_train, y_train, X_test, y_test)``. The ``X_*`` frames hold the 4
      feature columns and the ``y_*`` frames the 23 target columns listed
      below. Each frame is an independent copy, so adding columns to it later
      does not write into a slice of the loaded data.
  """
  train = pd.read_csv(train_path,index_col=0)
  test = pd.read_csv(test_path,index_col=0)

  # define columns
  x_cols = ['T10','T50','T90','N+A']
  y_cols = ['C5NP','C5IP','C5N','C6NP','C6IP','C6N','C6A','C7NP','C7IP','C7N','C7A',
            'C8NP','C8IP','C8N','C8A','C9NP','C9IP','C9N','C9A','C10NP','C10IP','C10N','C10A']

  # split_data train and test
  X_train = train[x_cols].copy()
  X_test = test[x_cols].copy()
  y_train = train[y_cols].copy()
  y_test = test[y_cols].copy()

  return X_train,y_train,X_test,y_test

In [4]:
# Load both splits and keep the feature / target column names as plain lists.
X_train, y_train, X_test, y_test = load_data()
x_cols = list(X_train.columns)
y_cols = list(y_train.columns)
(len(x_cols), len(y_cols))

(4, 23)

# define model

In [5]:
class custom_model(object):
  """Chain of per-target regressors with group-wise renormalisation.

  One ``StandardScaler`` + ``SVR`` pipeline is kept per target column. Targets
  are processed in the order of ``y_cols``; after each target is handled, its
  prediction is appended to the feature frame as an extra column, so every
  later pipeline also sees the predictions of the earlier ones.

  At prediction time the columns in ``N_col`` are rescaled so each row sums to
  the input ``N+A`` value, and the columns in ``P_col`` so each row sums to
  ``100 - N+A``.
  """

  def __init__(self,x_cols,y_cols,C=0.3):
    """Create one unfitted pipeline per target.

    Parameters
    ----------
    x_cols : list of str
        Feature column names; must include ``'N+A'`` for ``predict``.
    y_cols : list of str
        Target column names, in the order they are chained.
    C : float, default 0.3
        Regularisation parameter passed to every ``SVR``.
    """
    self.x_cols = list(x_cols)
    self.y_cols = list(y_cols)
    self.N_col = ['C5N','C6N','C6A','C7N','C7A','C8N','C8A','C9N','C9A','C10N','C10A']
    self.P_col = ['C5NP','C5IP','C6NP','C6IP','C7NP','C7IP','C8NP','C8IP','C9NP','C9IP','C10NP','C10IP']
    self.model_23 = {}
    for y_name in self.y_cols:
      self.model_23[y_name] = Pipeline([('scaler',StandardScaler()),('reg',SVR(C=C))])

  def fit(self,X,y):
    """Fit the chain of pipelines.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns in ``x_cols``. It is not modified.
    y : pandas.DataFrame
        Must contain one column per name in ``y_cols``.
    """
    # Work on a private copy restricted to the feature columns. The chain
    # appends one prediction column per target, and those columns must not
    # leak into the caller's frame (a second fit would otherwise start from
    # the wrong feature set).
    X = X[self.x_cols].copy()
    for y_name in tqdm(self.y_cols):
      self.model_23[y_name].fit(X,y[y_name])
      # Sequence prediction: add the in-sample prediction to X for later targets.
      X[y_name] = self.model_23[y_name].predict(X)

  def predict(self,data):
    """Predict all targets and renormalise the two column groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns in ``x_cols``; extra columns are ignored and
        the frame is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(data), len(y_cols))``, columns ordered as
        ``y_cols``.
    """
    X = data[self.x_cols].copy()
    preds = {}
    for y_name in self.y_cols:
      y_pred = self.model_23[y_name].predict(X)
      preds[y_name] = y_pred
      # Sequence prediction: add y_pred to X so later pipelines see the same
      # feature layout they were fitted on.
      X[y_name] = y_pred
    # Build the frame in one go so it is numeric, not object-dtype.
    results = pd.DataFrame(preds,columns=self.y_cols,dtype=float)

    # normalize depending on N+A and P (= 100 - N+A)
    n_total = X['N+A'].values.reshape(-1,1)
    p_total = 100 - n_total
    results[self.N_col] = self._normalize(results[self.N_col])*n_total
    results[self.P_col] = self._normalize(results[self.P_col])*p_total

    return results.values

  @staticmethod
  def _normalize(x):
    """Divide every row of DataFrame ``x`` by its row sum."""
    return x/x.sum(axis=1).values.reshape(-1,1)

# fit model

In [6]:
# Build one pipeline per target column, then train the whole chain.
model = custom_model(x_cols=x_cols, y_cols=y_cols)
model.fit(X=X_train, y=y_train)

HBox(children=(FloatProgress(value=0.0, max=23.0), HTML(value='')))




# make predictions

In [7]:
# Predict on the test features and wrap the raw array in a labelled frame
# with a 0..n-1 index and the target names as columns.
y_pred = pd.DataFrame(
    model.predict(X_test),
    index=list(range(len(X_test))),
    columns=y_cols,
)

In [8]:
# Preview the first rows of the prediction frame.
y_pred.head()

Unnamed: 0,C5NP,C5IP,C5N,C6NP,C6IP,C6N,C6A,C7NP,C7IP,C7N,...,C8N,C8A,C9NP,C9IP,C9N,C9A,C10NP,C10IP,C10N,C10A
0,1.209752,0.820608,0.162814,3.629996,2.262529,3.564507,0.663135,8.043998,6.372113,9.585444,...,7.061958,5.898694,5.227786,9.39187,5.004867,3.514526,1.145609,5.280571,0.677961,0.696586
1,0.138718,0.116942,0.103009,3.29655,1.107806,2.168482,0.399578,10.535609,7.66555,6.35268,...,4.993367,5.599062,6.804757,10.80068,4.458786,3.901046,1.196897,6.083509,0.599076,0.707979
2,0.131251,0.108848,0.104651,3.291999,1.103766,2.084426,0.368182,10.589725,7.666725,6.100797,...,4.790881,5.45394,6.924056,10.863445,4.332582,3.936101,1.329882,6.306067,0.599456,0.850151
3,0.117772,0.107004,0.103334,3.433861,1.182606,2.100733,0.411735,10.569027,7.545721,6.061024,...,4.826696,5.566143,6.945439,10.92056,4.361541,3.936076,1.208936,6.234332,0.542879,0.637361
4,0.151668,0.124574,0.103356,3.257977,1.128871,2.156847,0.409227,10.365443,7.468116,6.306395,...,5.019695,5.70637,6.835227,10.833081,4.45181,4.040815,1.238906,6.25569,0.589187,0.739539


In [9]:
# Preview the first rows of the test targets for a side-by-side look at y_pred.
y_test.head()

Unnamed: 0,C5NP,C5IP,C5N,C6NP,C6IP,C6N,C6A,C7NP,C7IP,C7N,...,C8N,C8A,C9NP,C9IP,C9N,C9A,C10NP,C10IP,C10N,C10A
0,1.142,0.616,0.217,3.745,2.577,4.028,0.557,7.669,5.99,10.206,...,7.039,5.438,5.263,9.537,4.877,3.481,1.088,5.652,0.618,0.597
1,0.122,0.078,0.029,3.955,1.563,2.407,0.4,10.016,7.2633,6.573,...,5.135,5.482,6.793,10.85,4.308,3.963,1.069,6.025,0.585,0.484
2,0.098,0.064,0.02,3.92,1.447,2.296,0.396,10.236,7.348,6.259,...,4.942,5.453,6.947,10.996,4.269,3.997,1.034,6.107,0.543,0.456
3,0.122,0.085,0.021,3.9,1.437,2.274,0.397,10.259,7.356,6.234,...,4.928,5.476,6.953,11.026,4.264,3.982,1.013,6.054,0.545,0.449
4,0.072,0.049,0.015,3.873,1.369,2.206,0.489,10.213,7.319,6.009,...,4.769,6.584,6.866,10.892,4.169,4.227,0.93,5.828,0.519,0.463


# metrics

In [10]:
# Sanity check: the predicted N_col columns should sum, row by row, to the
# N+A value supplied as input.
N_col = ['C5N','C6N','C6A','C7N','C7A','C8N','C8A','C9N','C9A','C10N','C10A']
temp = pd.DataFrame({
    'real_N+A': X_test['N+A'].values,
    'pred_N+A': y_pred[N_col].sum(axis=1).values,
})
temp

Unnamed: 0,real_N+A,pred_N+A
0,40.133,40.133
1,32.175,32.175
2,31.428,31.428
3,31.390,31.390
4,32.418,32.418
...,...,...
165,53.841,53.841
166,53.442,53.442
167,52.549,52.549
168,49.827,49.827


In [11]:
# One row of scores per target, stacked in a single concat.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the frame on every iteration.
res = pd.concat([metric(y_test[y_name], y_pred[y_name], y_name) for y_name in y_cols])
# Extra summary row: column-wise mean over all targets.
res.loc['AVG'] = res.mean()

In [12]:
# Show the per-target scores together with the AVG summary row.
res

Unnamed: 0,R2,RMSE,MAPE
C5NP,0.871972,0.305779,0.302849
C5IP,0.738419,0.261722,0.60706
C5N,0.791745,0.062473,2.211423
C6NP,0.277911,0.343793,0.063333
C6IP,0.882919,0.357659,0.093149
C6N,0.886192,0.558672,0.05233
C6A,0.526214,0.117599,0.170503
C7NP,0.97256,0.364311,0.042933
C7IP,0.854131,0.422033,0.048774
C7N,0.982764,0.547111,0.033794


# check acceptance criteria (no RMSE above 0.6, average RMSE below 0.4)

In [13]:
# c1: no row of `res` (the AVG row included) has an RMSE above 0.6.
# c2: the RMSE in the AVG row is below 0.4.
c1 = res.loc[res['RMSE'] > 0.6].shape[0] == 0
c2 = res.at['AVG', 'RMSE'] < 0.4
print(c1, c2)

True True


# save model

In [14]:
# Persist the fitted custom_model instance with joblib (pickle-based, so
# loading it later requires the custom_model class to be importable/defined).
joblib.dump(model,'../model/SVR(5_to_23).pkl')
print('save done')

save done
