In [1]:
import torch
import catboost
from torch import nn
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier
import gc
import numpy as np
from tqdm import tqdm
from torch.utils.data import TensorDataset,DataLoader
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import warnings 
warnings.filterwarnings('ignore')

In [2]:
def preprocessing_Y(file_path):
    """Read the label CSV and turn the ``Label`` column into one-hot targets.

    Parameters
    ----------
    file_path : str
        Path of a CSV file containing at least ``Filename`` and ``Label`` columns.

    Returns
    -------
    tuple
        ``(Y_one_hot, Y)`` where ``Y_one_hot`` is a ``torch.FloatTensor`` holding
        the one-hot encoded labels of the kept rows and ``Y`` is the filtered
        DataFrame itself.
    """
    meta = pd.read_csv(file_path)
    # The row for 'train_01046' is always discarded; presumably that clip is
    # absent from the feature tensor -- TODO confirm.
    Y = meta.loc[meta.Filename != 'train_01046']
    label_column = Y[['Label']]
    encoder = OneHotEncoder()
    encoder.fit(label_column)
    dense_one_hot = encoder.transform(label_column).toarray()
    Y_one_hot = torch.FloatTensor(dense_one_hot)
    print('Y_ont_hot shape',Y_one_hot.shape)
    print('Y_df shape',Y.shape)
    return Y_one_hot,Y

def load_pretrain_senet(model_path, device_ids=None):
    """Rebuild the SE-ResNet20 classifier and restore trained weights into it.

    Parameters
    ----------
    model_path : str
        Path of the saved ``state_dict`` file.
    device_ids : list[int] or None, optional
        GPU indices handed to ``torch.nn.DataParallel``. When ``None`` (default)
        the first four GPUs are used, clipped to the number of GPUs that are
        actually visible, so the loader also works on hosts with fewer than
        four devices.

    Returns
    -------
    torch.nn.DataParallel
        The wrapped network in evaluation mode. It is not moved to any device
        here; the caller does that.
    """
    model = torch.hub.load(
        'moskomule/senet.pytorch',
        'se_resnet20',
        num_classes=6)
    # Replace the stem with a single-input-channel convolution.
    model.conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    if device_ids is None:
        # Same default as before ([0, 1, 2, 3]) on a 4+ GPU host; an empty list
        # is turned into None so DataParallel picks its own default.
        device_ids = list(range(min(4, torch.cuda.device_count()))) or None
    # The wrapper is applied before loading; presumably the checkpoint was saved
    # from a DataParallel model and its keys carry the "module." prefix -- TODO confirm.
    model = torch.nn.DataParallel(model, device_ids=device_ids)
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    # map_location='cpu' lets a GPU-saved checkpoint be restored on any host;
    # load_state_dict then copies the values into the existing parameters.
    state_dict = torch.load(model_path, map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()
    return model

def get_senet_output(senet,data):
    """Call ``senet`` on ``data`` and return the result unchanged."""
    prediction = senet(data)
    return prediction

def get_all_senet_output(data):
    """Run the module-level ``senet`` on ``data`` and return a NumPy array.

    Parameters
    ----------
    data : torch.Tensor
        Input batch passed straight to ``senet``.

    Returns
    -------
    numpy.ndarray
        The network output, detached and copied to host memory.
    """
    # Inference only: disabling autograd avoids building a graph that would be
    # thrown away by the detach() below.
    with torch.no_grad():
        y = senet(data)
    return y.detach().cpu().numpy()

def get_X_numpy(X):
    """Feed ``X`` through the network in batches and stack the outputs.

    Parameters
    ----------
    X : torch.Tensor
        Input samples; cast to ``float32`` before batching.

    Returns
    -------
    numpy.ndarray
        Row-wise stack of ``get_all_senet_output`` results, one row per sample.
        When ``X`` yields no batches the original empty placeholder
        ``np.array([[]])`` is returned.
    """
    data_iter = DataLoader(TensorDataset(X.to(torch.float32)),batch_size=256)
    # Collect per-batch outputs and stack once at the end. This avoids both the
    # fragile "len(...) == 1" first-batch sentinel and re-copying the whole
    # result on every iteration.
    batch_outputs = []
    for bx in tqdm(data_iter):
        # TensorDataset yields 1-tuples; unwrap and move to the first GPU.
        bx = bx[0].to('cuda:0')
        batch_outputs.append(get_all_senet_output(bx))
    if not batch_outputs:
        return np.array([[]])
    return np.vstack(batch_outputs)

# 路徑

In [3]:
# Checkpoint file of the pretrained network. Note that this name is later
# rebound to the loaded model itself.
senet = 'senet20_20210605_random_state84_validacc_0.905.pt'
# CSV with the training labels.
Y_train_path = 'train/meta_train.csv'

# 載入預訓練的 senet(單一模型)

In [4]:
# Build the network from the checkpoint path currently held in `senet`, move it
# to the first GPU, and rebind `senet` to the resulting model. Because the name
# is reused, re-running this cell would pass the model (not the path) to the loader.
senet = load_pretrain_senet(senet).to('cuda:0')
# Bare last expression: displays the module structure.
senet

Using cache found in /root/.cache/torch/hub/moskomule_senet.pytorch_master


DataParallel(
  (module): CifarSEResNet(
    (conv1): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (layer1): Sequential(
      (0): CifarSEBasicBlock(
        (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (se): SELayer(
          (avg_pool): AdaptiveAvgPool2d(output_size=1)
          (fc): Sequential(
            (0): Linear(in_features=16, out_features=1, bias=False)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=1, out_features=16, bias=False

# 訓練 和 測試資料

In [5]:
# Precomputed feature tensors for the training and test clips.
X_train_all = torch.load('X_train_吳啟聖教的方法.pt')
X_test_all = torch.load('X_test_吳啟聖教的方法.pt')

# One-hot targets plus the filtered label table.
Y_train_all,Y_train_df = preprocessing_Y(Y_train_path)

# Label id -> human-readable name. The name is read from one randomly sampled
# row of each label, so this presumes 'Remark' is constant within a label.
map_dict = {}
for l in Y_train_df.Label.unique():
    rows_for_label = Y_train_df[Y_train_df.Label==l]
    picked_row = rows_for_label.sample(1)
    map_dict[l] = picked_row['Remark'].values[0]
map_dict

Y_ont_hot shape torch.Size([1199, 6])
Y_df shape (1199, 3)


{0: 'Barking',
 1: 'Howling',
 2: 'Crying',
 3: 'COSmoke',
 4: 'GlassBreaking',
 5: 'Dishes'}

In [7]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split with a fixed seed; the fraction below produced
# 839 training and 360 validation samples in the recorded run.
split_options = dict(
    test_size=0.29949336482812183,
    random_state=84,
    stratify=Y_train_all,
)
X_train, X_valid , y_train, y_valid = train_test_split(X_train_all, Y_train_all, **split_options)

# Shapes of both feature tensors, then the per-class counts of each split.
for split_features in (X_train, X_valid):
    print(split_features.shape)
for split_targets in (y_train, y_valid):
    print(pd.DataFrame(split_targets.argmax(axis=1)).value_counts())

torch.Size([839, 1, 80, 157])
torch.Size([360, 1, 80, 157])
0    140
1    140
2    140
3    140
5    140
4    139
dtype: int64
0    60
1    60
2    60
3    60
4    60
5    60
dtype: int64


In [8]:
from torch.utils.data import TensorDataset,DataLoader
# Wrap features/targets as float tensors for each split.
train_features = torch.FloatTensor(X_train.to(torch.float32))
train_targets = torch.FloatTensor(y_train)
valid_features = torch.FloatTensor(X_valid.to(torch.float32))
valid_targets = torch.FloatTensor(y_valid)
trainset = TensorDataset(train_features, train_targets)
vaildset = TensorDataset(valid_features, valid_targets)

# Both loaders share the same settings and keep the dataset order (no shuffling).
loader_options = dict(batch_size=32, num_workers=4)
train_iter = DataLoader(trainset, **loader_options)
vaild_iter = DataLoader(vaildset, **loader_options)

In [9]:
# Look at the first batch only, to check tensor shapes and devices.
# `bx` stays bound after the loop and is fed to the model in the next cell.
for bx,by in train_iter:
    print(bx.shape,bx.device) # batch,channel,freq,time
    print(by.shape,by.device) # batch,n_class
    break

torch.Size([32, 1, 80, 157]) cpu
torch.Size([32, 6]) cpu


In [10]:
# Forward the sample batch `bx` through the network as a smoke test.
y_hat = senet(bx)
# Input and output shapes side by side.
print(bx.shape,y_hat.shape)
# The output shape is shown once more as the cell's displayed value.
y_hat.shape

torch.Size([32, 1, 80, 157]) torch.Size([32, 6])


torch.Size([32, 6])

In [11]:
from sklearn.metrics import confusion_matrix,accuracy_score
def plot_confusion_matrix(model,data_iter,map_dict=map_dict):
    """Evaluate ``model`` on ``data_iter`` and return a confusion table and accuracy.

    Despite the name nothing is plotted; the table is returned as a DataFrame.

    Parameters
    ----------
    model : callable
        Network called on each input batch; the class is taken as the argmax
        over axis 1 of its output.
    data_iter : iterable
        Yields ``(inputs, one_hot_targets)`` batches. Both are moved to the
        module-level ``device`` before use.
    map_dict : dict, optional
        Its values, in insertion order, become the column names of the table.

    Returns
    -------
    tuple
        ``(cm, acc)``. ``confusion_matrix`` is called as ``(y_pred, y_true)``,
        so rows index the predicted class and the named columns index the
        true class.
    """
    true_batches = []
    pred_batches = []
    # Evaluation only, so gradient tracking is switched off.
    with torch.no_grad():
        for bx,by in tqdm(data_iter):
            bx = bx.to(device)
            by = by.to(device)
            y_hat = model(bx)
            # Errors are no longer swallowed here: a silently skipped batch
            # would make the reported accuracy cover only part of the data.
            true_batches.append(by.argmax(axis=1).cpu().numpy())
            pred_batches.append(y_hat.argmax(axis=1).cpu().numpy())
    # Join once at the end; fall back to empty arrays when nothing was yielded.
    y_true = np.concatenate(true_batches) if true_batches else np.array([])
    y_pred = np.concatenate(pred_batches) if pred_batches else np.array([])
    cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
    cm.columns = list(map_dict.values())
    acc = accuracy_score(y_pred,y_true)
    return cm,acc

In [12]:
# Target device for evaluation; also read as a global inside plot_confusion_matrix.
device = 'cuda:0'
# Inference mode for the network before scoring.
senet.eval()
# Score the network on the training loader.
cm,acc = plot_confusion_matrix(model=senet.to(device), data_iter=train_iter)
print(acc)
cm

100%|██████████| 27/27 [00:01<00:00, 19.33it/s]

0.9892729439809297





Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Dishes
0,136,1,0,0,0,0
1,1,136,0,0,0,0
2,2,3,140,1,0,0
3,0,0,0,139,0,0
4,1,0,0,0,139,0
5,0,0,0,0,0,140


# 有辦法再提升驗證集的表現嗎?

In [13]:
# Same evaluation, this time on the held-out validation loader.
cm,acc = plot_confusion_matrix(model=senet.to(device), data_iter=vaild_iter)
print(acc)
cm

100%|██████████| 12/12 [00:00<00:00, 14.87it/s]

0.9055555555555556





Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Dishes
0,54,7,5,1,1,1
1,3,48,3,0,0,0
2,1,5,51,0,0,1
3,0,0,0,58,0,0
4,1,0,0,0,57,0
5,1,0,1,1,2,58


# 接 LogisticRegression 修正senet的弱點

In [14]:
# Network outputs serve as input features for a logistic-regression head;
# integer class ids are recovered from the one-hot targets via argmax.
X_train_np = get_X_numpy(X_train)
y_train_np = np.argmax(y_train.detach().numpy(), axis=1)
X_valid_np = get_X_numpy(X_valid)
y_valid_np = np.argmax(y_valid.detach().numpy(), axis=1)

# Fit with default settings on the training split.
lg = LogisticRegression().fit(X_train_np, y_train_np)

# Score on the validation split.
y_true = y_valid_np
y_pred = lg.predict(X_valid_np)
print(accuracy_score(y_pred,y_true))

# Rows: predicted class, columns: true class (argument order is pred, true).
cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
cm.columns = [map_dict[key] for key in map_dict]
cm

100%|██████████| 4/4 [00:00<00:00, 19.60it/s]
100%|██████████| 2/2 [00:00<00:00, 29.18it/s]


0.9083333333333333


Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Dishes
0,56,6,7,2,1,0
1,2,50,4,0,0,2
2,1,4,48,0,0,0
3,0,0,0,58,0,0
4,0,0,0,0,57,0
5,1,0,1,0,2,58


# cma-es優化

In [15]:
import optuna
# Search per-class weights for the logistic-regression head with CMA-ES.
# The sampler is seeded so that a fresh run reproduces the same search.
sampler = optuna.samplers.CmaEsSampler(seed=84)
study = optuna.create_study(sampler=sampler,direction='maximize')
search_iteration = 100
for _ in tqdm(range(search_iteration)):
    trial = study.ask()
    # One weight in [1, 3] per class id. The class id doubles as the parameter
    # name so that study.best_params can be passed directly as class_weight.
    # A separate loop variable is used so the trial counter is not shadowed.
    class_weight = {}
    for label in [0,1,2,3,4,5]:
        class_weight[label] = trial.suggest_float(label,1,3)
    lg = LogisticRegression(class_weight=class_weight)
    lg.fit(X_train_np,y_train_np)
    y_true = y_valid_np
    y_pred = lg.predict(X_valid_np)
    # NOTE(review): the validation split is the search objective, so accuracy
    # measured on that same split with the selected weights is optimistic.
    acc = accuracy_score(y_pred,y_true)
    study.tell(trial,acc)

study.best_params

[32m[I 2021-06-05 03:21:44,018][0m A new study created in memory with name: no-name-87bfdbef-f7a2-4301-95aa-b2ecf003bb65[0m
100%|██████████| 100/100 [00:06<00:00, 15.15it/s]


{0: 1.220757364117128,
 1: 1.5953688743716532,
 2: 2.383829218563252,
 3: 1.64370767533259,
 4: 1.754676803181389,
 5: 2.1762340641708158}

In [16]:
# Recompute the network-output features and integer labels for both splits.
X_train_np = get_X_numpy(X_train)
y_train_np = np.argmax(y_train.detach().numpy(), axis=1)
X_valid_np = get_X_numpy(X_valid)
y_valid_np = np.argmax(y_valid.detach().numpy(), axis=1)

# Refit the head using the best class weights found by the study.
best_class_weight = study.best_params
lg = LogisticRegression(class_weight=best_class_weight).fit(X_train_np, y_train_np)

# Validation accuracy; `acc` is reused later for the submission filename.
y_true = y_valid_np
y_pred = lg.predict(X_valid_np)
acc = accuracy_score(y_pred,y_true)
print(acc)

# Rows: predicted class, columns: true class (argument order is pred, true).
cm = pd.DataFrame(confusion_matrix(y_pred,y_true))
cm.columns = [map_dict[key] for key in map_dict]
cm

100%|██████████| 4/4 [00:00<00:00, 23.72it/s]
100%|██████████| 2/2 [00:00<00:00, 26.61it/s]

0.9138888888888889





Unnamed: 0,Barking,Howling,Crying,COSmoke,GlassBreaking,Dishes
0,56,6,5,2,1,0
1,2,50,4,0,0,2
2,1,4,50,0,0,0
3,0,0,0,58,0,0
4,0,0,0,0,57,0
5,1,0,1,0,2,58


# 載入測試數據

In [17]:
# Run the network over the test tensor to obtain its outputs as a NumPy array.
X_test_np = get_X_numpy(X_test_all)
# Shape check of the extracted features.
X_test_np.shape

100%|██████████| 40/40 [00:01<00:00, 24.81it/s]


(10000, 6)

In [18]:
# Class probabilities for the test features from the fitted logistic regression.
final_prob = lg.predict_proba(X_test_np)

In [19]:
# Sanity check: the probabilities in each row should sum to 1.
final_prob.sum(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [20]:
# Start from the provided template and overwrite the probability columns
# (everything after the first column) for the rows that were scored.
sample_submit = pd.read_csv('sample_submission.csv')
# Use the number of predictions instead of a hard-coded 10000 so the slice
# always matches final_prob; the remaining template rows keep their defaults.
n_scored = len(final_prob)
sample_submit.iloc[:n_scored,1:] = final_prob
sample_submit

Unnamed: 0,Filename,Barking,Howling,Crying,COSmoke,GlassBreaking,Other
0,public_00001,0.000062,6.578312e-08,4.654141e-07,5.560440e-03,0.968279,2.609802e-02
1,public_00002,0.009977,6.590168e-08,1.302552e-05,2.199918e-01,0.011795,7.582234e-01
2,public_00003,0.999898,3.142576e-10,2.052973e-06,4.438501e-07,0.000099,4.853215e-07
3,public_00004,0.000477,8.385771e-11,2.971332e-07,1.862199e-02,0.047771,9.331297e-01
4,public_00005,0.390167,8.566528e-07,5.784037e-01,8.329842e-04,0.000006,3.058952e-02
...,...,...,...,...,...,...,...
29995,private_19996,0.166600,1.666000e-01,1.666000e-01,1.666000e-01,0.166600,1.670000e-01
29996,private_19997,0.166600,1.666000e-01,1.666000e-01,1.666000e-01,0.166600,1.670000e-01
29997,private_19998,0.166600,1.666000e-01,1.666000e-01,1.666000e-01,0.166600,1.670000e-01
29998,private_19999,0.166600,1.666000e-01,1.666000e-01,1.666000e-01,0.166600,1.670000e-01


In [21]:
# Show the current value of `acc`; it is embedded in the submission filename.
acc

0.9138888888888889

In [22]:
# Name the submission after the validation accuracy, then write it without the index.
submission_name = f'submit_valid_acc_{acc}.csv'
sample_submit.to_csv(submission_name, index=False)
print('done')

done
