In [1]:
import pandas as pd
from sklearn.svm import SVR
from tqdm import tqdm_notebook as tqdm
import numpy as np
from sklearn.metrics import r2_score,mean_squared_error
from math import sqrt
from sklearn.preprocessing import StandardScaler
import warnings;warnings.simplefilter('ignore')
from sklearn.pipeline import Pipeline
import joblib

In [2]:
def mape(a, b):
    """Mean absolute percentage error of ``b`` measured against ``a``.

    Args:
        a: reference (true) values; any array-like convertible to float.
        b: predicted values, same length as ``a``.

    Returns:
        The mean of ``|a - b| / |a|`` over the entries where ``a`` is non-zero,
        expressed as a fraction (0.05 means 5 %). ``nan`` when every reference
        value is zero, because the error is undefined in that case.
    """
    # Coerce to float so object-dtype columns are accepted by the numpy ufuncs.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # Drop zero references *before* dividing, so no division by zero is ever evaluated.
    mask = a != 0
    if not mask.any():
        return np.nan
    a, b = a[mask], b[mask]
    # Divide by |a|, not a: a negative reference must not yield a negative error.
    return (np.abs(a - b) / np.abs(a)).mean()

def metric(y_t,y_p,name):
  """Build a one-row score table for a single target.

  Args:
    y_t: true values.
    y_p: predicted values.
    name: label used as the index of the returned row.

  Returns:
    A DataFrame with one row (indexed by ``name``) and the columns
    'R2', 'RMSE' and 'MAPE'. R2 is clipped at 0 from below.
  """
  r2 = r2_score(y_t, y_p)
  rmse = sqrt(mean_squared_error(y_t, y_p))
  scores = {
      'R2': max(r2, 0),  # negative R2 is reported as 0
      'RMSE': rmse,
      'MAPE': mape(y_t, y_p),
  }
  return pd.DataFrame(scores, index=[name])

# load data

In [3]:
def load_data(train_path='../data/phase_1/train_4565.csv',
              test_path='../data/phase_1/test_170.csv'):
  """Read the train and test CSV files and split each into features and targets.

  Args:
    train_path: CSV file with the training rows (first column is used as index).
    test_path: CSV file with the test rows (first column is used as index).

  Returns:
    ``(X_train, y_train, X_test, y_test)`` where the X frames hold the 4 feature
    columns and the y frames hold the 23 target columns listed below.

  Raises:
    KeyError: if a file lacks one of the expected columns.
  """
  train = pd.read_csv(train_path,index_col=0)
  test = pd.read_csv(test_path,index_col=0)

  # define columns
  x_cols = ['T10','T50','T90','N+A']
  y_cols = ['C5NP','C5IP','C5N','C6NP','C6IP','C6N','C6A','C7NP','C7IP','C7N','C7A',
            'C8NP','C8IP','C8N','C8A','C9NP','C9IP','C9N','C9A','C10NP','C10IP','C10N','C10A']

  # split_data train and test
  # .copy() returns independent frames, so later in-place column writes by callers
  # never act on a selection of the raw tables.
  X_train = train[x_cols].copy()
  X_test = test[x_cols].copy()
  y_train = train[y_cols].copy()
  y_test = test[y_cols].copy()

  return X_train,y_train,X_test,y_test

In [4]:
# Load the train/test split and record the feature and target column names.
X_train, y_train, X_test, y_test = load_data()
x_cols = list(X_train.columns)
y_cols = list(y_train.columns)
(len(x_cols), len(y_cols))

(4, 23)

# define model

In [5]:
class custom_svr(object):
  """Chained multi-target regressor built from one scaled SVR per target column.

  Targets are processed in the order of ``y_cols``. Each target owns a
  ``StandardScaler`` -> ``SVR`` pipeline, and the prediction made for a target is
  appended to the feature frame so that every later target sees it as an extra
  input. ``predict`` finally rescales two fixed column groups (``N_col`` and
  ``P_col``) so that, per row, they sum to ``N+A`` and ``100 - N+A`` respectively.
  """

  def __init__(self,x_cols,y_cols,C=0.3):
    """Create one unfitted pipeline per target.

    Args:
      x_cols: names of the input feature columns; must contain 'N+A', which
        predict() reads for the final rescaling.
      y_cols: names of the target columns, in chaining order; predict() expects
        every name of ``N_col`` and ``P_col`` to be among them.
      C: value passed as ``C`` to every SVR (default 0.3, the value that was
        previously hard-coded).
    """
    self.x_cols = list(x_cols)
    self.y_cols = list(y_cols)
    self.N_col = ['C5N','C6N','C6A','C7N','C7A','C8N','C8A','C9N','C9A','C10N','C10A']
    self.P_col = ['C5NP','C5IP','C6NP','C6IP','C7NP','C7IP','C8NP','C8IP','C9NP','C9IP','C10NP','C10IP']
    self.model_23 = {
      y_name: Pipeline([('scaler',StandardScaler()),('reg',SVR(C=C))])
      for y_name in self.y_cols
    }

  def fit(self,X,y):
    """Fit the per-target pipelines one after another.

    Args:
      X: DataFrame containing at least the ``x_cols`` columns.
      y: DataFrame containing one column per name in ``y_cols``.

    The caller's ``X`` is left untouched: the chain works on a private copy, so
    calling fit() again with the same frame trains on the same inputs.
    """
    # Private copy restricted to the declared features; prediction columns are
    # appended to this copy only, never to the frame owned by the caller.
    X = X[self.x_cols].copy()
    for y_name in tqdm(self.y_cols):
      self.model_23[y_name].fit(X,y[y_name])
      # Sequence prediction: the in-sample prediction becomes a feature for later targets
      X[y_name] = self.model_23[y_name].predict(X)

  def predict(self,data):
    """Predict every target for ``data`` and rescale the two column groups.

    Args:
      data: DataFrame containing at least the ``x_cols`` columns. Not modified.

    Returns:
      A float DataFrame with the columns ``y_cols`` and a positional index
      ``0..len(data)-1``. Per row, the ``N_col`` columns sum to the 'N+A' input
      and the ``P_col`` columns sum to ``100 - 'N+A'``.
    """
    # Same column set and order as used during fit().
    X = data[self.x_cols].copy()
    predictions = {}
    for y_name in self.y_cols:
      y_pred = np.asarray(self.model_23[y_name].predict(X), dtype=float)
      predictions[y_name] = y_pred
      # Sequence prediction: feed this prediction to the models that follow
      X[y_name] = y_pred

    # Build the frame from the collected arrays so every column is float64; an
    # empty pre-allocated frame has object dtype.
    results = pd.DataFrame(predictions,index=pd.RangeIndex(len(X)),
                           columns=self.y_cols,dtype=float)

    # normalize depending on N+A and P
    # NOTE(review): P = 100 - (N+A) presumes the inputs are percentages — confirm.
    n_plus_a = X['N+A'].to_numpy(dtype=float)
    results[self.N_col] = self._normalize(results[self.N_col]).mul(n_plus_a,axis=0)
    results[self.P_col] = self._normalize(results[self.P_col]).mul(100 - n_plus_a,axis=0)

    return results

  @staticmethod
  def _normalize(x):
    """Scale each row of DataFrame ``x`` so that the row sums to 1.

    A row whose sum is zero is not special-cased and yields non-finite values.
    """
    return x.div(x.sum(axis=1),axis=0)

# fit model

In [6]:
# Create one pipeline per target column and train the chain.
# fit() receives a copy so that X_train keeps exactly its original feature columns
# whatever fit() does to its input; this keeps the cell safe to re-run.
model = custom_svr(x_cols,y_cols)
model.fit(X_train.copy(),y_train)

HBox(children=(IntProgress(value=0, max=23), HTML(value='')))




# make predictions

In [7]:
# Predict every target column for the test features and preview the first rows.
y_pred = model.predict(X_test)
y_pred.head()

Unnamed: 0,C5NP,C5IP,C5N,C6NP,C6IP,C6N,C6A,C7NP,C7IP,C7N,...,C8N,C8A,C9NP,C9IP,C9N,C9A,C10NP,C10IP,C10N,C10A
0,1.209731,0.820594,0.162816,3.629933,2.26249,3.564549,0.663143,8.043859,6.372003,9.585558,...,7.062309,5.898643,5.227595,9.39163,5.004924,3.514413,1.145865,5.281013,0.677925,0.696174
1,0.138719,0.116943,0.103014,3.296569,1.107812,2.168575,0.399596,10.535667,7.665592,6.352953,...,4.993275,5.599124,6.804876,10.800482,4.458611,3.901144,1.197221,6.082854,0.598902,0.707746
2,0.131253,0.108849,0.104655,3.292032,1.103778,2.084511,0.368197,10.589832,7.666803,6.101048,...,4.790723,5.454066,6.924106,10.86328,4.332312,3.936163,1.330298,6.30507,0.599414,0.849962
3,0.117773,0.107005,0.103338,3.433892,1.182617,2.10081,0.411751,10.569122,7.545788,6.061247,...,4.82659,5.56633,6.945549,10.920459,4.361244,3.936268,1.209216,6.233512,0.54271,0.63713
4,0.151669,0.124575,0.10336,3.257996,1.128878,2.156921,0.409241,10.365505,7.468161,6.306612,...,5.01961,5.706443,6.8354,10.832916,4.451698,4.040933,1.239252,6.254867,0.589096,0.739227


In [8]:
# First rows of the true test targets, for a visual comparison with the predictions.
y_test.head()

Unnamed: 0,C5NP,C5IP,C5N,C6NP,C6IP,C6N,C6A,C7NP,C7IP,C7N,...,C8N,C8A,C9NP,C9IP,C9N,C9A,C10NP,C10IP,C10N,C10A
0,1.142,0.616,0.217,3.745,2.577,4.028,0.557,7.669,5.99,10.206,...,7.039,5.438,5.263,9.537,4.877,3.481,1.088,5.652,0.618,0.597
1,0.122,0.078,0.029,3.955,1.563,2.407,0.4,10.016,7.2633,6.573,...,5.135,5.482,6.793,10.85,4.308,3.963,1.069,6.025,0.585,0.484
2,0.098,0.064,0.02,3.92,1.447,2.296,0.396,10.236,7.348,6.259,...,4.942,5.453,6.947,10.996,4.269,3.997,1.034,6.107,0.543,0.456
3,0.122,0.085,0.021,3.9,1.437,2.274,0.397,10.259,7.356,6.234,...,4.928,5.476,6.953,11.026,4.264,3.982,1.013,6.054,0.545,0.449
4,0.072,0.049,0.015,3.873,1.369,2.206,0.489,10.213,7.319,6.009,...,4.769,6.584,6.866,10.892,4.169,4.227,0.93,5.828,0.519,0.463


# metrics

In [9]:
# Sanity check: put the 'N+A' input next to the row-wise sum of the predicted N_col columns.
N_col = ['C5N','C6N','C6A','C7N','C7A','C8N','C8A','C9N','C9A','C10N','C10A']
temp = pd.DataFrame({
    'real_N+A': X_test['N+A'].values,
    'pred_N+A': y_pred[N_col].sum(axis=1).values,
})
temp

Unnamed: 0,real_N+A,pred_N+A
0,40.133,40.133
1,32.175,32.175
2,31.428,31.428
3,31.390,31.390
4,32.418,32.418
...,...,...
165,53.841,53.841
166,53.442,53.442
167,52.549,52.549
168,49.827,49.827


In [10]:
# Score every target column, then add an 'AVG' row with the column-wise mean of the scores.
# The rows are joined with one pd.concat: DataFrame.append was removed in pandas 2.0
# and re-copied the accumulated frame on every iteration.
res = pd.concat([metric(y_test[y_name],y_pred[y_name],y_name) for y_name in y_cols])
res.loc['AVG'] = res.mean()

In [11]:
# Show the per-target score table; the final 'AVG' row is the column-wise mean.
res

Unnamed: 0,R2,RMSE,MAPE
C5NP,0.871979,0.305771,0.302849
C5IP,0.738431,0.261716,0.607063
C5N,0.791738,0.062474,2.211511
C6NP,0.277918,0.343791,0.063332
C6IP,0.882911,0.357671,0.093153
C6N,0.886192,0.558671,0.052323
C6A,0.526197,0.117601,0.170512
C7NP,0.97256,0.364309,0.042931
C7IP,0.85413,0.422033,0.048774
C7N,0.982765,0.547101,0.033793


# check acceptance criteria: no RMSE above 0.6 and average RMSE below 0.4

In [12]:
# Acceptance check on the score table:
#   c1 - no row (the 'AVG' row included) has an RMSE above 0.6
#   c2 - the RMSE of the 'AVG' row is below 0.4
c1 = not (res['RMSE'] > 0.6).any()
c2 = res.at['AVG', 'RMSE'] < 0.4
print(c1, c2)

True True


# save model

In [13]:
# Persist the fitted custom_svr instance (with all of its per-target pipelines) via joblib.
# NOTE(review): pickles reference classes by name, so loading this file presumably needs a
# matching `custom_svr` definition to be importable — confirm in the consuming code.
joblib.dump(model,'../model/SVR(4_to_23).pkl')
print('save done')

save done
