In [1]:
import pandas as pd
import numpy as np
import joblib
import scipy.stats as stats
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
class SimulatedAnnealing():
    """Simulated-annealing style search for a fixed-size feature subset.

    Every candidate is a fresh, uniformly random subset of ``max_feat`` column
    indices (not a perturbation of the current subset).  A candidate is scored by
    the validation MSE of ``model`` (see ``compute_mse``) and accepted if it is no
    worse than the current subset, or otherwise with a probability that shrinks
    as the temperature cools from ``t_init`` towards ``t_fin``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    t_init : starting temperature.
    t_fin : the search stops once the temperature drops below this value.
    n_iter : number of candidates evaluated at each temperature.
    rate : multiplicative cooling factor applied after each temperature step.
    max_feat : number of features in every candidate subset.
    seed : stored on the instance only.  Sampling uses NumPy's *global* random
        state, so call ``np.random.seed(...)`` before ``fit`` for reproducible runs.
    """

    def __init__(self, model, t_init=100, t_fin=1, n_iter=100, rate=0.9, max_feat=5, seed=42):
        self.model = model
        self.t_init = t_init
        self.t_fin = t_fin
        self.n_iter = n_iter
        # NOTE(review): kept for interface compatibility but not used to seed anything;
        # fit() draws from the global np.random state.
        self.seed = seed
        self.max_feat = max_feat
        self.rate = rate
        # Best feature subset found by the most recent fit() call (empty before fitting).
        self.idx = []

    def stack_data(self, X, idx):
        """Return the columns of the 2-D array ``X`` listed in ``idx``, in that order.

        The result has shape ``(n_rows, len(idx))``; an empty ``idx`` yields an
        array with zero columns instead of failing.
        """
        return np.asarray(X)[:, list(idx)]

    def check(self, idx, list_idx):
        """Return True if any entry of ``list_idx`` has all of its elements in ``idx``."""
        return any(all(elem in idx for elem in subset) for subset in list_idx)

    def compute_mse(self, X, y, idx):
        """Validation MSE of ``self.model`` trained on the columns ``idx`` of ``X``.

        ``X`` must support ``.iloc`` column selection (e.g. a pandas DataFrame).
        The data is split 80/20 with a fixed ``random_state=42``, so every call
        scores against the same hold-out rows.  The min-max scaler is fitted on the
        training part only.  ``self.model`` is refitted in place on every call.
        """
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.20, random_state=42, shuffle=True)
        X_train_ = X_train.iloc[:, idx]
        X_val_ = X_val.iloc[:, idx]
        scaler = MinMaxScaler().fit(X_train_)
        X_train_ = scaler.transform(X_train_)
        X_val_ = scaler.transform(X_val_)
        self.model.fit(X_train_, y_train)
        y_pred = self.model.predict(X_val_)
        return mean_squared_error(y_val, y_pred)

    def _draw_subset(self):
        """Draw a sorted list of ``max_feat`` distinct column indices from the global RNG."""
        drawn = np.random.choice(np.arange(self.n_feat), self.max_feat, replace=False)
        return np.sort(drawn).tolist()

    def fit(self, X, y):
        """Run the annealing schedule and return the best subset encountered.

        Returns
        -------
        t_list : temperatures, starting with ``t_init`` and followed by the value
            after each cooling step.
        mse_list : best validation MSE found so far, recorded at the same points
            as ``t_list`` (so ``mse_list[-1]`` is the MSE of ``best_idx``).
        best_idx : sorted list of column indices of the best subset seen.

        Raises
        ------
        ValueError : if ``t_fin <= 0`` or ``rate`` is outside ``(0, 1)``; such
            schedules would never cool below ``t_fin``.
        """
        if self.t_fin <= 0 or not 0 < self.rate < 1:
            raise ValueError(
                "t_fin must be > 0 and rate must be in (0, 1); got t_fin={}, rate={}".format(
                    self.t_fin, self.rate))

        self.n_feat = X.shape[1]
        cur_idx = self._draw_subset()
        cur_mse = self.compute_mse(X, y, cur_idx)
        # The walk is allowed to move to worse subsets, so the best subset ever
        # seen is tracked separately from the current state.
        best_idx, best_mse = cur_idx, cur_mse

        t = self.t_init
        mse_list = [best_mse]
        t_list = [t]
        while t >= self.t_fin:
            for _ in range(self.n_iter):
                new_idx = self._draw_subset()
                new_mse = self.compute_mse(X, y, new_idx)
                if new_mse <= cur_mse:
                    cur_mse, cur_idx = new_mse, new_idx
                    if new_mse < best_mse:
                        best_mse, best_idx = new_mse, new_idx
                else:
                    err_ = new_mse - cur_mse
                    # k_ rescales the error so that a worse candidate is accepted with
                    # probability 0.8 at t == t_init.  Algebraically err_ cancels, i.e.
                    # proba == 0.8 ** (t_init / t) regardless of how much worse it is.
                    k_ = -(self.t_init * np.log(0.8)) / err_
                    proba = np.exp(-(k_ * err_) / t)
                    if np.random.rand() < proba:
                        cur_mse, cur_idx = new_mse, new_idx
            # Cool down and log the state reached at this temperature.
            t *= self.rate
            t_list.append(t)
            mse_list.append(best_mse)

        self.idx = best_idx
        return t_list, mse_list, best_idx

In [3]:
# Load the training table and discard its first CSV column.
dataset = (
    pd.read_csv("./train.csv")
    .iloc[:, 1:]
)
dataset.head()

Unnamed: 0,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,nC,...,ATSC6m,ATSC7m,ATSC8m,ATSC0v,ATSC1v,ATSC2v,ATSC3v,ATSC4v,ATSC5v,pIC50
0,2.1672,4.696756,167.339,102.476443,6,6,93,42,51,36,...,492.329643,257.277683,-1003.763457,4804.673166,309.490433,683.435674,-1314.555369,-1414.870378,-83.483807,5.920819
1,2.5305,6.40343,166.4448,101.292064,6,6,92,44,48,36,...,1145.348381,650.526364,-892.140056,4674.607833,259.817836,819.607455,-1319.513358,-1271.063588,16.303179,5.327902
2,0.0449,0.002016,171.3895,118.688752,0,0,111,47,64,40,...,153.958754,262.77065,-1344.813986,5615.82144,208.483498,421.946492,-488.632286,-1042.240171,-750.982981,5.178486
3,0.6994,0.48916,170.7727,104.676443,0,0,95,44,51,36,...,723.616083,447.499595,-880.969219,4830.277113,246.299411,758.514759,-1439.57983,-1684.939504,-200.347089,5.69897
4,2.4186,5.849626,173.0249,104.559857,0,0,94,45,49,37,...,963.491174,524.469445,-610.738168,4803.928569,273.108429,866.440986,-1291.406174,-1355.834353,80.38824,5.761954


In [4]:
# Features are every column but the last; the target is the last column,
# selected with a list so it stays a one-column DataFrame.
X_train, y_train = dataset.iloc[:, :-1], dataset.iloc[:, [-1]]

In [5]:
# Estimator that scores each candidate feature subset: ordinary least-squares linear regression.
model = LinearRegression()

In [6]:
max_feat = 5 # this value is varied over the range 5 - 10
n_trial = 50
# SimulatedAnnealing samples its candidate subsets from NumPy's global RNG, so seed it
# once here: the whole batch becomes reproducible while the trials still differ from
# one another (seeding inside the loop would make every trial identical).
np.random.seed(42)
best_mse = np.inf
mse_trial = []
for i in tqdm(range(n_trial)):
    featSelection = SimulatedAnnealing(model=model, t_init=100, t_fin=1, n_iter=100, rate=0.9, max_feat=max_feat, seed=42)
    temp, mse, feat = featSelection.fit(X_train,y_train)
    # The last entry of the MSE history is the score this trial ended with.
    fin_mse = mse[-1]
    mse_trial.append(fin_mse)
    # Keep the full history and feature subset of the best trial so far.
    if fin_mse < best_mse:
        best_mse = fin_mse
        best_temp_list = temp
        best_mse_list = mse
        best_feat_list = feat

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [7]:
# Column names as a NumPy array, so the winning index list can be used to look up
# the names of the selected descriptors in one indexing step.
desc_name = np.array(dataset.columns.values.tolist())
sel_desc = desc_name[best_feat_list]

In [8]:
# Save the per-trial final MSEs together with the temperature/MSE history of the *best*
# trial, so the saved history matches the trial that produced sel_desc (the bare
# `temp`/`mse` names hold whatever the last trial returned).
joblib.dump([mse_trial, best_temp_list, best_mse_list, sel_desc], "./sel_desc_{}.p".format(max_feat))

['./sel_desc_5.p']