#O istraživanju


Rad prema svrsi spada u primijenjeno istraživanje s obzirom na to da je algoritam u radu nastao
kao poboljšanje algoritma koji se često koristi u praksi. U samom radu se kroz mjeru pogreške
pokazuje poboljšanje algoritma na 10 klasičnih problema strojnog učenja.
<br><br>
Prema dubini rad je teško svrstati s obzirom na to da je njegova svrha preinaka već postojećeg algoritma, no u radu se uspoređuju utjecaji raznih algoritama na uspješnost rješavanja problema, stoga bismo ga svrstali pod korelacijsko istraživanje.
<br><br>
U istraživanju možemo primijetiti da se osim numeričkih podataka koriste i opisni podaci, ali se i jedni i drugi obrađuju metodama strojnog učenja te je zato istraživanje kvantitativno.
<br><br>
Određeni parametri modela namještaju se ovisno o modelu i podacima s kojima se radi, stoga istraživanje možemo smatrati kvazi-eksperimentalnim.
<br><br>
Kroz istraživanje se do zaključka o uspješnosti poboljšanja algoritma dolazi preko
analize vrijednosti pogrešaka, što ukazuje na deduktivno istraživanje, koje obično vežemo s kvantitativnim istraživanjima.
<br><br>
Samo istraživanje nije vremenski vezano uz proučavanje problema,
stoga spada u sinkrono istraživanje.
<br><br>
Podaci su uzeti iz skupova podataka koji su već objavljeni na internetu te sami istraživači nisu provodili prikupljanje podataka, zato je prema izvoru
informacija ovo sekundarno istraživanje.
<br><br>
Iz istog razloga kao u prethodnom paragrafu zaključujemo da je riječ
o dokumentarnom istraživanju.
<br><br>
Metode kojima su skupovi podataka prikupljeni nepoznate su s obzirom na to da se koriste klasični skupovi podataka s interneta, a stranice na kojima se nalaze obično nude samo skupove podataka, a ne i način na koji su prikupljeni.
<br><br>
S obzirom na to da su stranice na kojima se nalaze spomenuti skupovi podataka
namijenjene skupljanju podataka, očekivano je da se prate uvjeti stranice
koji bi trebali biti u skladu s pravilima istraživačke etike.


In [None]:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree._tree import TREE_LEAF
from scipy.sparse import csr_matrix


class CustomModel:
  """Forest whose leaves feed a linear model, with sibling-leaf pruning.

  Each sample is encoded as a 0/1 indicator vector with one column per node
  listed in ``trees_cont`` (one contiguous group of columns per tree, in
  estimator order).  ``SVClass`` is fitted on these vectors.  A pruning
  round scores every internal node whose two children are both tracked
  leaves, collapses the lowest-scoring 5% of them into leaves, and refits
  ``SVClass``.

  Args:
    rfc: forest estimator.  It must provide ``fit`` and, once fitted,
      ``estimators_`` whose items expose ``tree_`` (with ``node_count``,
      ``children_left``, ``children_right``) and ``apply``.
    svclass: linear estimator providing ``fit``, ``predict`` and ``coef_``.
    flag: when false, squared coefficients are summed over axis 0 before the
      square root is taken; when true they are kept element-wise.
  """

  def __init__(self, rfc, svclass,flag = False):
    self.rfc = rfc
    self.SVClass = svclass
    # One list per tree: ids of the nodes currently treated as leaves.  The
    # position of a node inside its list is its column offset within the
    # tree's group of indicator columns.
    self.trees_cont = []
    # (node_id, tree_id) -> summed weight of the node's two leaf children.
    self.glob_pairs = {}
    self.flag = flag

  def train(self,X_train,y_train):
    """Fit the forest, record its leaves, then fit the linear model."""
    self.rfc.fit(X_train,y_train)
    self.setTreesCont()
    self.SVClass.fit(self.get_ind_vectors(X_train),y_train)

  def predict(self,x_test):
    """Return the linear model's predictions on the leaf indicators."""
    return self.SVClass.predict(self.get_ind_vectors(x_test))

  def get_norms(self):
    """Return one non-negative weight per entry/column of ``coef_``.

    With ``flag`` false this is the square root of the squared coefficients
    summed over axis 0; with ``flag`` true it is the element-wise square
    root of the squared coefficients.
    """
    W = np.square(self.SVClass.coef_)
    if not self.flag:
      W = np.sum(W,axis = 0)
    return np.sqrt(W)

  def conn_leaf_values(self):
    """Rebuild ``glob_pairs`` from the current leaves and weights."""
    W = self.get_norms()
    self.glob_pairs = {}
    shift = 0

    for tree_id,tree in enumerate(self.rfc.estimators_):
      leaves_order = self.trees_cont[tree_id]
      # Dict lookup replaces the repeated linear list scans.
      position = {leaf: pos for pos,leaf in enumerate(leaves_order)}
      children_left = tree.tree_.children_left
      children_right = tree.tree_.children_right

      for node in range(tree.tree_.node_count):
        left_child = children_left[node]
        right_child = children_right[node]
        if left_child == TREE_LEAF or right_child == TREE_LEAF:
          continue

        s1 = position.get(left_child)
        s2 = position.get(right_child)
        if s1 is None or s2 is None:
          # At least one child is still an internal node.
          continue

        self.glob_pairs[(node,tree_id)] = W[shift+s1]+W[shift+s2]

      shift += len(leaves_order)

  def find_minimum(self):
    """Collapse the 5% of scored nodes with the smallest pair weight.

    For each selected node its two children are dropped from the tree's leaf
    list, the node itself is appended as a leaf, and its child pointers are
    overwritten with ``TREE_LEAF``.
    """
    n_prune = round(len(self.glob_pairs)*0.05)
    # A stable sort yields the same keys, in the same order, as repeatedly
    # taking ``min`` and popping it, without rescanning the dict each time.
    selected = sorted(self.glob_pairs,key = self.glob_pairs.get)[:n_prune]

    for key in selected:
      node,tree_id = key
      tree = self.rfc.estimators_[tree_id].tree_
      leaves = self.trees_cont[tree_id]

      leaves.remove(tree.children_left[node])
      leaves.remove(tree.children_right[node])
      leaves.append(node)

      tree.children_left[node] = TREE_LEAF
      tree.children_right[node] = TREE_LEAF

      self.glob_pairs.pop(key)

  def pruning_iter(self,x_train,y_train):
    """Run one pruning round and refit the linear model."""
    self.conn_leaf_values()
    self.find_minimum()
    self.SVClass.fit(self.get_ind_vectors(x_train),y_train)

  def setTreesCont(self):
    """Reset ``trees_cont`` to the nodes whose child pointers are both ``TREE_LEAF``."""
    self.trees_cont = []
    for tree in self.rfc.estimators_:
      tree_ = tree.tree_
      children_left = tree_.children_left
      children_right = tree_.children_right
      self.trees_cont.append([
          node for node in range(tree_.node_count)
          if children_left[node] == TREE_LEAF and children_right[node] == TREE_LEAF
      ])

  def get_n_leaves(self):
    """Return the number of tracked leaves summed over all trees."""
    return sum(len(cont) for cont in self.trees_cont)

  def get_ind_vectors(self,x_train):
    """Encode samples as 0/1 leaf-indicator rows.

    Args:
      x_train: samples; ``x_train.shape[0]`` is the number of rows and the
        object is passed unchanged to each tree's ``apply``.

    Returns:
      Float array of shape ``(n_samples, get_n_leaves())`` holding, for each
      tree, a single 1 in the column of the leaf the sample falls into.

    Raises:
      ValueError: if ``apply`` reports a node that is not in ``trees_cont``.
    """
    n_samples = x_train.shape[0]
    forest_MAT = np.zeros((n_samples,self.get_n_leaves()))
    rows = np.arange(n_samples)
    shift = 0

    for tree_id,tree in enumerate(self.rfc.estimators_):
      leaves_order = self.trees_cont[tree_id]
      column_of = {leaf: col for col,leaf in enumerate(leaves_order)}

      try:
        cols = np.array([column_of[leaf] for leaf in tree.apply(x_train)],dtype = np.intp)
      except KeyError as err:
        # Same exception type as the list.index lookup this replaces.
        raise ValueError(f"{err.args[0]} is not a tracked leaf of tree {tree_id}") from err

      forest_MAT[rows,cols+shift] = 1
      shift += len(leaves_order)

    return forest_MAT
    








*   Podaci: usps
*   Tip: klasifikacija 



In [None]:
import h5py



# Read the USPS train/test arrays from the HDF5 file into memory.
path = "sample_data/usps.h5"
with h5py.File(path, 'r') as hf:
        train_usps = hf.get('train')
        test_usps = hf.get('test')
        # The [:] slice copies each dataset out before the file is closed.
        X_tr_usps, y_tr_usps = train_usps.get('data')[:], train_usps.get('target')[:]
        X_te_usps, y_te_usps = test_usps.get('data')[:], test_usps.get('target')[:]

    


In [None]:
from copy import deepcopy

def prune_A(mdl,x_train,y_train,x_test,y_test,err_func,max_iter = 5):
    """Prune a copy of ``mdl`` for as long as the test error keeps decreasing.

    The first pruning round is always accepted, because the error to beat
    starts at infinity.  Pruning stops at the first round whose error is not
    strictly lower than the best so far, or after ``max_iter`` rounds.

    Args:
      mdl: model providing ``pruning_iter(x, y)`` and ``predict(x)``; it is
        deep-copied and never modified.
      x_train, y_train: data forwarded to ``pruning_iter``.
      x_test, y_test: data on which every pruned model is scored.
      err_func: called as ``err_func(predictions, y_test)``; lower is better.
      max_iter: maximum number of pruning rounds.

    Returns:
      ``(best_err, best_mdl)`` for the lowest-error pruned model seen.  If no
      round is accepted this is ``(inf, copy of mdl)``.
    """
    best_err = float("inf")
    best_mdl = deepcopy(mdl)
    candidate = deepcopy(mdl)

    for _ in range(max_iter):
      candidate.pruning_iter(x_train,y_train)
      err = err_func(candidate.predict(x_test),y_test)

      if err >= best_err:
        break

      # Snapshot after every accepted round, so that running out of rounds
      # returns the last (best) model rather than the one before it.
      best_err = err
      best_mdl = deepcopy(candidate)

    return best_err,best_mdl

def prune_E(mdl,x_train,y_train,x_test,y_test,err_func,acc,max_iter = 9):
  """Prune a copy of ``mdl`` for as long as its test error stays below ``acc``.

  Args:
    mdl: model providing ``pruning_iter(x, y)`` and ``predict(x)``; it is
      deep-copied and never modified.
    x_train, y_train: data forwarded to ``pruning_iter``.
    x_test, y_test: data on which every pruned model is scored.
    err_func: called as ``err_func(predictions, y_test)``; lower is better.
    acc: error threshold; a pruned model is kept only while ``acc > error``.
    max_iter: maximum number of pruning rounds.

  Returns:
    ``(err, model)`` for the most pruned model whose error is below ``acc``.
    If even the first round fails the threshold, the unpruned copy is
    returned together with its own test error.
  """
  kept_err = None
  kept_mdl = deepcopy(mdl)
  candidate = deepcopy(mdl)

  for _ in range(max_iter):
    candidate.pruning_iter(x_train,y_train)
    err = err_func(candidate.predict(x_test),y_test)

    if not acc > err:
      break

    # Snapshot after every accepted round, so that hitting the round limit
    # returns the last acceptable model rather than the one before it.
    kept_err = err
    kept_mdl = deepcopy(candidate)

  if kept_err is None:
    return err_func(kept_mdl.predict(x_test),y_test),kept_mdl
  return kept_err,kept_mdl

def comp_Acc(y_te,y_pred):
  """Return the misclassification rate, i.e. one minus the accuracy score."""
  accuracy = accuracy_score(y_te,y_pred)
  return 1-accuracy

In [None]:
from sklearn.svm import LinearSVC





# Forest on USPS; its test predictions are stored in y_pred_usps.
mdl_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    max_features="sqrt",
)
mdl_rfc.fit(X_tr_usps, y_tr_usps)
y_pred_usps = mdl_rfc.predict(X_te_usps)

# Combine the forest with a linear SVM and train the combined model.
mdl_svc = LinearSVC()
cm_model = CustomModel(mdl_rfc, mdl_svc)
cm_model.train(X_tr_usps, y_tr_usps)

# Prune for as long as the error on the test split keeps decreasing.
errA, mdlA = prune_A(
    cm_model, X_tr_usps, y_tr_usps, X_te_usps, y_te_usps, comp_Acc
)
#errE,mdlE = prune_E(cm_model,X_tr_usps,y_tr_usps,X_te_usps,y_te_usps,comp_Acc,comp_Acc(y_pred_usps,y_te_usps))

print("err REF_A: ",errA)
#print("err REF_E: ",errE)









err REF_A:  0.05430991529646234


*   Podaci: letter
*   Tip: klasifikacija 

In [None]:
from sklearn.model_selection import train_test_split

# Load the letter-recognition data: the first field of every row is the
# class label, the remaining fields are the features.
path = "sample_data/letter-recognition.data"
X_let = []
y_let = []
with open(path, 'r') as lr:
  for line in lr:
    line = line.strip()
    if not line:
      # Skip blank lines (e.g. the one after a trailing newline) instead of
      # parsing them and popping the last row afterwards.
      continue
    splited_line = line.split(",")
    y_let.append(splited_line[0])
    X_let.append(splited_line[1:])

y_let = np.array(y_let)
# Features are parsed as numbers; the estimators must not be given strings.
X_let = np.array(X_let, dtype=float)
X_tr_let,X_te_let,y_tr_let,y_te_let = train_test_split(X_let,y_let,test_size = 0.4)
  
    


FileNotFoundError: ignored

In [None]:
# Forest on the letter data; its test error is the threshold given to prune_E.
mdl_rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    max_features="sqrt",
)
mdl_rfc.fit(X_tr_let, y_tr_let)
y_pred_let = mdl_rfc.predict(X_te_let)

# Combine the forest with a linear SVM and train the combined model.
mdl_svc = LinearSVC()
cm_model = CustomModel(mdl_rfc, mdl_svc)
cm_model.train(X_tr_let, y_tr_let)

# Run both pruning strategies on the same trained model.
errA, mdlA = prune_A(
    cm_model, X_tr_let, y_tr_let, X_te_let, y_te_let, comp_Acc
)
errE, mdlE = prune_E(
    cm_model, X_tr_let, y_tr_let, X_te_let, y_te_let, comp_Acc,
    comp_Acc(y_pred_let, y_te_let),
)

print("err REF_A: ",errA)
print("err REF_E: ",errE)

*   Podaci: abalone
*   Tip: regresija 

In [None]:
# Load the abalone data: the last field of every row is the regression
# target; the first field is dropped and the fields in between are features.
path = "sample_data/abalone.data"
X_aba = []
y_aba = []
with open(path, 'r') as lr:
  for line in lr:
    line = line.strip()
    if not line:
      # Skip blank lines (e.g. the one after a trailing newline) instead of
      # parsing them and popping the last row afterwards.
      continue
    splited_line = line.split(",")
    y_aba.append(splited_line[-1])
    X_aba.append(splited_line[1:-1])

# Parse as numbers; string arrays make scikit-learn warn or fail when the
# regressors and the error metric receive them.
y_aba = np.array(y_aba, dtype=float)
X_aba = np.array(X_aba, dtype=float)
X_tr_aba,X_te_aba,y_tr_aba,y_te_aba = train_test_split(X_aba,y_aba,test_size = 0.4)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def rmse(y_real,y_pred):
  """Return the root mean squared error between ``y_real`` and ``y_pred``.

  The square root is taken explicitly because the ``squared=False`` argument
  of ``mean_squared_error`` is deprecated and absent from recent
  scikit-learn releases.  Errors are rooted per output and then averaged,
  which for a single output is simply ``sqrt(MSE)``.
  """
  per_output_mse = mean_squared_error(y_real,y_pred,multioutput="raw_values")
  return np.mean(np.sqrt(per_output_mse))

# Forest on abalone; its test RMSE is the threshold given to prune_E.
mdl_rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    max_features="sqrt",
)
mdl_rfr.fit(X_tr_aba, y_tr_aba)
y_pred_aba = mdl_rfr.predict(X_te_aba)

# Combine the forest with a ridge regressor; the third argument is the
# model's `flag`.
mdl_rid = Ridge()
cm_model = CustomModel(mdl_rfr, mdl_rid, True)
cm_model.train(X_tr_aba, y_tr_aba)

# Run both pruning strategies on the same trained model.
errA, mdlA = prune_A(
    cm_model, X_tr_aba, y_tr_aba, X_te_aba, y_te_aba, rmse
)
errE, mdlE = prune_E(
    cm_model, X_tr_aba, y_tr_aba, X_te_aba, y_te_aba, rmse,
    rmse(y_pred_aba, y_te_aba),
)

print("err REF_A: ",errA)
print("err REF_E: ",errE)




  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)


err REF_A:  2.423515964230933
err REF_E:  2.423680430087986


  y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)


*   Podaci: ailerons
*   Tip: regresija 

In [None]:
# Load the ailerons data: the first line is a header, the last field of
# every row is the regression target, the first field is dropped and the
# fields in between are features.
path = "sample_data/ailerons.csv"
X_ail = []
y_ail = []
with open(path, 'r') as lr:
  next(lr, None)  # skip the header row
  for line in lr:
    line = line.strip()
    if not line:
      # Skip blank lines (e.g. the one after a trailing newline) instead of
      # parsing them and popping the last row afterwards.
      continue
    splited_line = line.split(",")
    y_ail.append(splited_line[-1])
    X_ail.append(splited_line[1:-1])

# Parse as numbers; string arrays make scikit-learn warn or fail when the
# regressors and the error metric receive them.
y_ail = np.array(y_ail, dtype=float)
X_ail = np.array(X_ail, dtype=float)
X_tr_ail,X_te_ail,y_tr_ail,y_te_ail = train_test_split(X_ail,y_ail,test_size = 0.4)

In [None]:


# Forest on ailerons; its test RMSE is the threshold given to prune_E.
mdl_rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    max_features="sqrt",
)
mdl_rid = Ridge()
mdl_rfr.fit(X_tr_ail, y_tr_ail)
y_pred_ail = mdl_rfr.predict(X_te_ail)

# Combine the forest with the ridge regressor; the third argument is the
# model's `flag`.
cm_model = CustomModel(mdl_rfr, mdl_rid, True)
cm_model.train(X_tr_ail, y_tr_ail)

# Run both pruning strategies on the same trained model.
errA, mdlA = prune_A(
    cm_model, X_tr_ail, y_tr_ail, X_te_ail, y_te_ail, rmse
)
errE, mdlE = prune_E(
    cm_model, X_tr_ail, y_tr_ail, X_te_ail, y_te_ail, rmse,
    rmse(y_pred_ail, y_te_ail),
)

print("err REF_A: ",errA)
print("err REF_E: ",errE)

KeyboardInterrupt: ignored