In [1]:
import torch
import catboost
from torch import nn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier
import gc
import numpy as np
from tqdm import tqdm
from torch.utils.data import TensorDataset,DataLoader
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import warnings 
warnings.filterwarnings('ignore')

In [2]:
def preprocessing_Y(file_path):
    Y = pd.read_csv(file_path).iloc[:,:]
    Y = Y[Y.Filename != 'train_01046']
    enc = OneHotEncoder().fit(Y[['Label']])
    Y_one_hot = enc.transform(Y[['Label']]).toarray()
    Y_one_hot = torch.FloatTensor(Y_one_hot)
    print('Y_ont_hot shape',Y_one_hot.shape)
    print('Y_df shape',Y.shape)
    return Y_one_hot,Y

def load_pretrain_senet(model_path):
    model = torch.hub.load(
        'moskomule/senet.pytorch',
        'se_resnet20',
        num_classes=6)
    model.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    model = torch.nn.DataParallel(model, device_ids=[0, 1, 2 ,3])
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

def get_senet_output(senet,data):
    return senet(data)

def get_all_senet_output(data):
    y = senet(data)
    return y.detach().cpu().numpy()

def get_X_numpy(X):
    X_train_np = np.array([[]])
    data_iter = DataLoader(TensorDataset(torch.FloatTensor(X.to(torch.float32))),batch_size=256)
    for bx in tqdm(data_iter):
        bx = bx[0]
        bx = bx.to('cuda:0')
        y_hat = get_all_senet_output(bx)
        if len(X_train_np) == 1:
            X_train_np = y_hat
        else:
            X_train_np = np.vstack((X_train_np,y_hat))
    return X_train_np

# 路徑

In [3]:
Y_train_path = 'train/meta_train.csv'
senet = 'senet20_2021_0604_1230_randomseed84_validacc_0.900.pt'

# load k 個 senet

In [4]:
senet = load_pretrain_senet(senet).to('cuda:0')
senet

Using cache found in /root/.cache/torch/hub/moskomule_senet.pytorch_master


DataParallel(
  (module): CifarSEResNet(
    (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (layer1): Sequential(
      (0): CifarSEBasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (se): SELayer(
          (avg_pool): AdaptiveAvgPool2d(output_size=1)
          (fc): Sequential(
            (0): Linear(in_features=16, out_features=1, bias=False)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=1, out_features=16, bias=False

# 訓練 和 測試資料

In [5]:
X_train_all = torch.load('X_train_吳啟聖教的方法.pt')
X_test_all = torch.load('X_test_吳啟聖教的方法.pt')
Y_train_all,Y_train_df = preprocessing_Y(Y_train_path)
map_dict = {}
for l in Y_train_df.Label.unique():
    map_dict[l] = Y_train_df[Y_train_df.Label==l].sample(1)['Remark'].values[0]
map_dict

Y_ont_hot shape torch.Size([1199, 6])
Y_df shape (1199, 3)


{0: 'Barking',
 1: 'Howling',
 2: 'Crying',
 3: 'COSmoke',
 4: 'GlassBreaking',
 5: 'Vacuum'}

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_valid , y_train, y_valid = train_test_split(X_train_all, Y_train_all, 
                                                       test_size=0.2, 
                                                       random_state=42*2,
                                                       stratify=Y_train_all)
print(X_train.shape)
print(X_valid.shape)
print(pd.DataFrame(y_train.argmax(axis=1)).value_counts())
print(pd.DataFrame(y_valid.argmax(axis=1)).value_counts())

torch.Size([959, 1, 80, 157])
torch.Size([240, 1, 80, 157])
0    160
1    160
2    160
3    160
5    160
4    159
dtype: int64
0    40
1    40
2    40
3    40
4    40
5    40
dtype: int64


In [7]:
from torch.utils.data import TensorDataset,DataLoader
trainset = TensorDataset(torch.FloatTensor(X_train.to(torch.float32)),torch.FloatTensor(y_train))
vaildset = TensorDataset(torch.FloatTensor(X_valid.to(torch.float32)),torch.FloatTensor(y_valid))
train_iter = DataLoader(trainset,batch_size=32,num_workers=4)
vaild_iter = DataLoader(vaildset,batch_size=32,num_workers=4)

In [8]:
for bx,by in train_iter:
    print(bx.shape,bx.device) # batch,channel,freq,time
    print(by.shape,by.device) # batch,n_class
    break

torch.Size([32, 1, 80, 157]) cpu
torch.Size([32, 6]) cpu


In [9]:
y_hat = senet(bx)
print(bx.shape,y_hat.shape)
y_hat.shape

torch.Size([32, 1, 80, 157]) torch.Size([32, 6])


torch.Size([32, 6])

In [10]:
from sklearn.metrics import confusion_matrix,accuracy_score
def plot_confusion_matrix(model,data_iter,map_dict=map_dict):
    y_true = np.array([])
    y_pred = np.array([])
    for bx,by in tqdm(data_iter):
        bx = bx.to(device)
        by = by.to(device)
        y_hat = model(bx)
        try:
            if len(y_true) == 0:
                y_true = by.argmax(axis=1).detach().cpu().numpy()
                y_pred = y_hat.argmax(axis=1).detach().cpu().numpy()
            else:
                y_true = np.hstack((y_true,by.argmax(axis=1).detach().cpu().numpy()))
                y_pred = np.hstack((y_pred,y_hat.argmax(axis=1).detach().cpu().numpy()))
        except:
            pass
    cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
    cm.columns = list(map_dict.values())
    acc = accuracy_score(y_pred,y_true)
    return cm,acc

In [11]:
senet.eval()
device = 'cuda:0'
cm,acc = plot_confusion_matrix(senet.to(device),train_iter)
print(acc)
cm

100%|██████████| 30/30 [00:01<00:00, 16.78it/s]


0.9384775808133472


Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Vacuum
0,155,2,6,2,0,0
1,2,154,18,6,0,2
2,2,2,134,1,0,3
3,0,0,0,151,0,0
4,1,0,0,0,158,7
5,0,2,2,0,1,148


# 有辦法在提升驗證集的表現嗎?

In [12]:
cm,acc = plot_confusion_matrix(senet.to(device),vaild_iter)
print(acc)
cm

100%|██████████| 8/8 [00:01<00:00,  7.01it/s]

0.9





Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Vacuum
0,35,2,3,2,1,2
1,3,37,4,0,0,0
2,2,1,33,0,0,1
3,0,0,0,38,0,0
4,0,0,0,0,38,2
5,0,0,0,0,1,35


# 接 LogisticRegression 修正senet的弱點

In [13]:
X_train_np = get_X_numpy(X_train)
y_train_np = y_train.detach().numpy().argmax(axis=1)
X_valid_np = get_X_numpy(X_valid)
y_valid_np = y_valid.detach().numpy().argmax(axis=1)
lg = LogisticRegression()
lg.fit(X_train_np,y_train_np)
y_true = y_valid_np
y_pred = lg.predict(X_valid_np )
print(accuracy_score(y_pred,y_true))
cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
cm.columns = list(map_dict.values())
cm

100%|██████████| 4/4 [00:00<00:00, 20.36it/s]
100%|██████████| 1/1 [00:00<00:00, 26.87it/s]


0.9125


Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Vacuum
0,34,2,3,1,2,2
1,3,37,2,0,0,0
2,3,1,34,0,0,0
3,0,0,0,39,0,0
4,0,0,0,0,37,0
5,0,0,1,0,1,38


# cma-es優化

In [14]:
import optuna
sampler = optuna.samplers.CmaEsSampler()
study = optuna.create_study(sampler=sampler,direction='maximize')
search_iteration = 100
for i in tqdm(range(search_iteration)):
    trial = study.ask()
    class_weight = {}
    for i in [0,1,2,3,4,5]:
        class_weight[i] = trial.suggest_uniform(i,1,3)
    lg = LogisticRegression(class_weight=class_weight)
    lg.fit(X_train_np,y_train_np)
    y_true = y_valid_np
    y_pred = lg.predict(X_valid_np)
    acc = accuracy_score(y_pred,y_true)
    study.tell(trial,acc)
    
study.best_params

[32m[I 2021-06-05 01:02:37,776][0m A new study created in memory with name: no-name-1f577882-3309-444f-9715-11a1d0501592[0m
100%|██████████| 100/100 [00:07<00:00, 13.82it/s]


{0: 1.744215628742431,
 1: 2.055098147413133,
 2: 2.306272220485257,
 3: 2.2322739581197233,
 4: 2.049733830529668,
 5: 2.081262191947032}

In [21]:
X_train_np = get_X_numpy(X_train)
y_train_np = y_train.detach().numpy().argmax(axis=1)
X_valid_np = get_X_numpy(X_valid)
y_valid_np = y_valid.detach().numpy().argmax(axis=1)
lg = LogisticRegression(class_weight=study.best_params)
lg.fit(X_train_np,y_train_np)
y_true = y_valid_np
y_pred = lg.predict(X_valid_np)
acc = accuracy_score(y_pred,y_true)
print(acc)
cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
cm.columns = list(map_dict.values())
cm

100%|██████████| 4/4 [00:00<00:00, 14.20it/s]
100%|██████████| 1/1 [00:00<00:00, 24.07it/s]

0.925





Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Vacuum
0,34,2,3,1,0,2
1,3,37,2,0,0,0
2,3,1,35,0,0,0
3,0,0,0,39,0,0
4,0,0,0,0,39,0
5,0,0,0,0,1,38


# 載入測試數據

In [22]:
X_test_np = get_X_numpy(X_test_all)
X_test_np.shape

100%|██████████| 40/40 [00:01<00:00, 26.04it/s]


(10000, 6)

In [23]:
final_prob = lg.predict_proba(X_test_np)

In [24]:
final_prob.sum(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [25]:
sample_submit = pd.read_csv('sample_submission.csv')
sample_submit.iloc[:10000,1:] = final_prob
sample_submit

Unnamed: 0,Filename,Barking,Howling,Crying,COSmoke,GlassBreaking,Other
0,public_00001,0.000347,0.000949,0.012644,0.000028,4.901327e-03,0.981131
1,public_00002,0.095730,0.033180,0.456154,0.053999,3.095493e-04,0.360628
2,public_00003,0.889983,0.000526,0.010087,0.073262,2.291437e-02,0.003228
3,public_00004,0.000362,0.002510,0.022394,0.000112,6.882923e-04,0.973934
4,public_00005,0.904855,0.000694,0.093102,0.000792,4.950151e-07,0.000557
...,...,...,...,...,...,...,...
29995,private_19996,0.166600,0.166600,0.166600,0.166600,1.666000e-01,0.167000
29996,private_19997,0.166600,0.166600,0.166600,0.166600,1.666000e-01,0.167000
29997,private_19998,0.166600,0.166600,0.166600,0.166600,1.666000e-01,0.167000
29998,private_19999,0.166600,0.166600,0.166600,0.166600,1.666000e-01,0.167000


In [26]:
acc

0.925

In [27]:
sample_submit.to_csv(f'submit_valid_acc_{acc}.csv',index=False)
print('done')

done
