## Main class from main.ipynb

In [None]:
import time
import os

from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

class CNNClass():
    """Bundle a model with its data loaders, loss, optimizer and checkpoint folder.

    On construction the class builds training and validation loaders from
    ``<path>/train`` and ``<path>/valid`` and makes sure the folder
    ``weights/<name>`` exists. ``test`` restores the checkpoint stored in
    ``weights/<name>/<name>.pth`` and reports accuracy on a held-out folder.
    """

    def __init__(self, model, transform, params, name,
                 path = 'data/full',
                 criterion = None,
                 optimizer = None,
                 device = 'cuda',
                 lr = 0.001):
        """Store the model and create loaders, loss and optimizer.

        Args:
            model: network to wrap; it is used as given and is not moved to
                ``device`` by this class.
            transform: transform handed to every ``ImageFolder`` built here.
            params: dict that must contain ``'BATCH_SIZE'``.
            name: identifier used for the ``weights/<name>`` folder and the
                checkpoint file name.
            path: dataset root containing ``train`` and ``valid`` sub-folders.
            criterion: loss module; a new ``nn.CrossEntropyLoss()`` is created
                when omitted.
            optimizer: optimizer instance; SGD with momentum 0.9 and learning
                rate ``lr`` over ``model.parameters()`` when omitted.
            device: device that ``test`` moves each batch to and maps the
                checkpoint onto.
            lr: learning rate for the default SGD optimizer only.
        """
        self.model = model
        self.transform = transform

        train_data = ImageFolder(f'{path}/train', transform=transform)
        valid_data = ImageFolder(f'{path}/valid', transform=transform)

        self.train_loader = DataLoader(train_data, batch_size=params['BATCH_SIZE'], shuffle=True, num_workers=2)
        self.valid_loader = DataLoader(valid_data, batch_size=params['BATCH_SIZE'], shuffle=True, num_workers=2)
        self.params = params
        self.name = name

        # makedirs also creates the parent 'weights' folder and tolerates an
        # already existing target, unlike a bare os.mkdir.
        os.makedirs(f'weights/{self.name}', exist_ok=True)

        # Create the default loss per instance rather than sharing one module
        # object through the function signature.
        self.criterion = nn.CrossEntropyLoss() if criterion is None else criterion
        if optimizer is None:
            self.optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        else:
            self.optimizer = optimizer
        self.device = device

    def test(self, path='data/full/test', debug=False):
        """Restore the saved checkpoint and return accuracy in percent.

        Args:
            path: folder in ``ImageFolder`` layout to evaluate on.
            debug: accepted for compatibility; not used by this method.

        Returns:
            ``100 * correct / total`` over every sample found under ``path``.
        """
        test_data = ImageFolder(path, transform=self.transform)
        # Accuracy does not depend on sample order, so shuffling is not needed.
        test_loader = DataLoader(test_data, batch_size=self.params['BATCH_SIZE'], shuffle=False, num_workers=2)

        # Read the checkpoint file once and reuse it for both state dicts.
        checkpoint = torch.load(f'weights/{self.name}/{self.name}.pth',
                                map_location=self.device)
        self.model.load_state_dict(checkpoint['model'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])

        correct = 0
        total = 0
        self.model.eval()
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(self.device), labels.to(self.device)
                outputs = self.model(images)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        return accuracy

## Custom class for ImageFolder

In [None]:
import torch
from torchvision import datasets
from torch.utils.data import DataLoader

class ImageFolderWithPaths(datasets.ImageFolder):
    """ImageFolder whose items additionally carry the sample's file path."""

    def __getitem__(self, index):
        """Return ``(image, label, path)`` for the sample at ``index``."""
        image, target = super().__getitem__(index)
        # self.imgs holds one entry per sample; element 0 is the file path.
        file_path = self.imgs[index][0]
        return image, target, file_path
    
# Path-aware datasets for the three splits, each read one sample at a time.
# NOTE(review): transform_sample is first assigned in the "Model 1" cell
# further down, so that cell presumably has to run before this one - confirm.
test_data, train_data, valid_data = (
    ImageFolderWithPaths(root, transform=transform_sample)
    for root in ('data/full/test', 'data/full/train', 'data/full/valid')
)

test_loader, train_loader, valid_loader = (
    DataLoader(split_data, batch_size=1)
    for split_data in (test_data, train_data, valid_data)
)

# Save labels and features from different models

## Model 1

In [None]:
from torchvision.models import alexnet, AlexNet_Weights

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device_sample = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = AlexNet_Weights.DEFAULT
model_sample = alexnet(weights=weights)

# Freeze every pretrained parameter; the replacement layer created below is
# new and therefore still trainable.
for param in model_sample.parameters():
    param.requires_grad = False

# Replace the last classifier layer with a 100-output linear layer.
model_sample.classifier[-1] = nn.Linear(model_sample.classifier[-1].in_features,
                                        100)
model_sample.to(device_sample)

transform_sample = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

model_params = {'BATCH_SIZE': 32,
                'EPOCHS': 50,
                'EARLY_STOP': 5}

# Hand the detected device to CNNClass: its default is 'cuda', which would
# disagree with the CPU fallback chosen above.
model1 = CNNClass(model=model_sample,
                  transform=transform_sample,
                  params=model_params,
                  device=device_sample,
                  name='alexnet_native')

In [None]:
# Restore the saved weights; the checkpoint file is read once and reused for
# both the model and the optimizer state.
checkpoint = torch.load(f'weights/{model1.name}/{model1.name}.pth',
                        map_location='cpu')
model1.model.load_state_dict(checkpoint['model'])
model1.optimizer.load_state_dict(checkpoint['optimizer'])

# Send inputs to the device the model lives on instead of assuming CUDA.
extract_device = next(model1.model.parameters()).device

model1.model.eval()
split_frames = {}
with torch.no_grad():
    # One pass per split, in the order test -> train -> valid.
    for split in ('test', 'train', 'valid'):
        # Build the loader from the model's own transform so preprocessing
        # cannot drift from what this network was configured with.
        split_data = ImageFolderWithPaths(f'data/full/{split}',
                                          transform=model1.transform)
        split_loader = DataLoader(split_data, batch_size=1)

        features = []
        names = []
        true_labels = []
        for images, labels, name in split_loader:
            outputs = model1.model(images.to(extract_device))
            # batch_size is 1: keep the single output row, path and label.
            features.append(outputs[0].cpu().numpy())
            names.append(name[0])
            true_labels.append(labels.item())

        # One row per image, indexed by file path, with the true class last.
        split_frame = pd.DataFrame(features, index=names)
        split_frame['class'] = true_labels
        split_frames[split] = split_frame

model1_data_test = split_frames['test']
model1_data_train = split_frames['train']
model1_data_valid = split_frames['valid']

In [None]:
# Stack the three splits (valid, test, train) and export them for CatBoost.
# Create the target folder first so to_csv cannot fail on a missing directory.
os.makedirs('catboost_csv', exist_ok=True)
full_data = pd.concat([model1_data_valid, model1_data_test, model1_data_train])
full_data.to_csv('catboost_csv/model1_outputs.csv')

## Model 2

In [None]:
from torchvision.models import resnet50, ResNet50_Weights

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device_sample = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = ResNet50_Weights.DEFAULT
model_sample = resnet50(weights=weights)

# Freeze every pretrained parameter; the replacement layer created below is
# new and therefore still trainable.
for param in model_sample.parameters():
    param.requires_grad = False

# Replace the final fully connected layer with a 100-output linear layer.
model_sample.fc = nn.Linear(model_sample.fc.in_features, 100)

model_sample.to(device_sample)

transform_sample = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Hand the detected device to CNNClass: its default is 'cuda', which would
# disagree with the CPU fallback chosen above.
model2 = CNNClass(model=model_sample,
                  transform=transform_sample,
                  params=model_params,
                  optimizer = optim.Adam(model_sample.parameters(), lr=0.0005),
                  device=device_sample,
                  name='resnet50_native')

In [None]:
# Restore the saved weights; the checkpoint file is read once and reused for
# both the model and the optimizer state.
checkpoint = torch.load(f'weights/{model2.name}/{model2.name}.pth',
                        map_location='cpu')
model2.model.load_state_dict(checkpoint['model'])
model2.optimizer.load_state_dict(checkpoint['optimizer'])

# Send inputs to the device the model lives on instead of assuming CUDA.
extract_device = next(model2.model.parameters()).device

model2.model.eval()
split_frames = {}
with torch.no_grad():
    # One pass per split, in the order test -> train -> valid.
    for split in ('test', 'train', 'valid'):
        # Build the loader from the model's own transform so preprocessing
        # cannot drift from what this network was configured with.
        split_data = ImageFolderWithPaths(f'data/full/{split}',
                                          transform=model2.transform)
        split_loader = DataLoader(split_data, batch_size=1)

        features = []
        names = []
        true_labels = []
        for images, labels, name in split_loader:
            outputs = model2.model(images.to(extract_device))
            # batch_size is 1: keep the single output row, path and label.
            features.append(outputs[0].cpu().numpy())
            names.append(name[0])
            true_labels.append(labels.item())

        # One row per image, indexed by file path, with the true class last.
        split_frame = pd.DataFrame(features, index=names)
        split_frame['class'] = true_labels
        split_frames[split] = split_frame

model2_data_test = split_frames['test']
model2_data_train = split_frames['train']
model2_data_valid = split_frames['valid']

# Create the export folder first so to_csv cannot fail on a missing directory.
os.makedirs('catboost_csv', exist_ok=True)
full_data = pd.concat([model2_data_valid, model2_data_test, model2_data_train])
full_data.to_csv('catboost_csv/model2_outputs.csv')

## Model 3

In [None]:
from torchvision.models import inception_v3, Inception_V3_Weights

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device_sample = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = Inception_V3_Weights.DEFAULT
model_sample = inception_v3(weights=weights)

# Freeze every pretrained parameter; the replacement layer created below is
# new and therefore still trainable.
for param in model_sample.parameters():
    param.requires_grad = False

# Replace the final fully connected layer with a 100-output linear layer.
model_sample.fc = nn.Linear(model_sample.fc.in_features,
                      100)
# Turn the auxiliary-logits flag off on the already constructed model.
model_sample.aux_logits=False
model_sample.to(device_sample)

# Unlike the other models in this notebook, inputs are resized and
# centre-cropped to 299 pixels before normalisation.
transform_sample = transforms.Compose([
    transforms.Resize(299),
    transforms.CenterCrop(299),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Hand the detected device to CNNClass: its default is 'cuda', which would
# disagree with the CPU fallback chosen above.
model3 = CNNClass(model=model_sample,
                  transform=transform_sample,
                  params=model_params,
                  device=device_sample,
                  name='inception-v3_native')

In [None]:
# Restore the saved weights; the checkpoint file is read once and reused for
# both the model and the optimizer state.
checkpoint = torch.load(f'weights/{model3.name}/{model3.name}.pth',
                        map_location='cpu')
model3.model.load_state_dict(checkpoint['model'])
model3.optimizer.load_state_dict(checkpoint['optimizer'])

# Send inputs to the device the model lives on instead of assuming CUDA.
extract_device = next(model3.model.parameters()).device

model3.model.eval()
split_frames = {}
with torch.no_grad():
    # One pass per split, in the order test -> train -> valid.
    for split in ('test', 'train', 'valid'):
        # The shared test/train/valid loaders were created from whichever
        # transform_sample existed when their cell ran, so they do not apply
        # this model's Resize/CenterCrop(299) pipeline. Build loaders from
        # model3.transform instead so preprocessing matches the model setup.
        split_data = ImageFolderWithPaths(f'data/full/{split}',
                                          transform=model3.transform)
        split_loader = DataLoader(split_data, batch_size=1)

        features = []
        names = []
        true_labels = []
        for images, labels, name in split_loader:
            outputs = model3.model(images.to(extract_device))
            # batch_size is 1: keep the single output row, path and label.
            features.append(outputs[0].cpu().numpy())
            names.append(name[0])
            true_labels.append(labels.item())

        # One row per image, indexed by file path, with the true class last.
        split_frame = pd.DataFrame(features, index=names)
        split_frame['class'] = true_labels
        split_frames[split] = split_frame

model3_data_test = split_frames['test']
model3_data_train = split_frames['train']
model3_data_valid = split_frames['valid']

# Create the export folder first so to_csv cannot fail on a missing directory.
os.makedirs('catboost_csv', exist_ok=True)
full_data = pd.concat([model3_data_valid, model3_data_test, model3_data_train])
full_data.to_csv('catboost_csv/model3_outputs.csv')

## Model 4

In [None]:
from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device_sample = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = EfficientNet_B3_Weights.DEFAULT
model_sample = efficientnet_b3(weights=weights)

# Freeze every pretrained parameter; the replacement classifier created below
# is new and therefore still trainable.
for param in model_sample.parameters():
    param.requires_grad = False

# Replace the whole classifier with dropout followed by a 100-output layer.
model_sample.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=1536, out_features=100)
)

model_sample.to(device_sample)

transform_sample = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Hand the detected device to CNNClass: its default is 'cuda', which would
# disagree with the CPU fallback chosen above.
model4 = CNNClass(model=model_sample,
                  transform=transform_sample,
                  params=model_params,
                  optimizer = optim.Adam(model_sample.parameters(), lr=0.0005),
                  device=device_sample,
                  name='efficientnet-b3_native')

In [None]:
# Restore the saved weights; the checkpoint file is read once and reused for
# both the model and the optimizer state.
checkpoint = torch.load(f'weights/{model4.name}/{model4.name}.pth',
                        map_location='cpu')
model4.model.load_state_dict(checkpoint['model'])
model4.optimizer.load_state_dict(checkpoint['optimizer'])

# Send inputs to the device the model lives on instead of assuming CUDA.
extract_device = next(model4.model.parameters()).device

model4.model.eval()
split_frames = {}
with torch.no_grad():
    # One pass per split, in the order test -> train -> valid.
    for split in ('test', 'train', 'valid'):
        # Build the loader from the model's own transform so preprocessing
        # cannot drift from what this network was configured with.
        split_data = ImageFolderWithPaths(f'data/full/{split}',
                                          transform=model4.transform)
        split_loader = DataLoader(split_data, batch_size=1)

        features = []
        names = []
        true_labels = []
        for images, labels, name in split_loader:
            outputs = model4.model(images.to(extract_device))
            # batch_size is 1: keep the single output row, path and label.
            features.append(outputs[0].cpu().numpy())
            names.append(name[0])
            true_labels.append(labels.item())

        # One row per image, indexed by file path, with the true class last.
        split_frame = pd.DataFrame(features, index=names)
        split_frame['class'] = true_labels
        split_frames[split] = split_frame

model4_data_test = split_frames['test']
model4_data_train = split_frames['train']
model4_data_valid = split_frames['valid']

# Create the export folder first so to_csv cannot fail on a missing directory.
os.makedirs('catboost_csv', exist_ok=True)
full_data = pd.concat([model4_data_valid, model4_data_test, model4_data_train])
full_data.to_csv('catboost_csv/model4_outputs.csv')

## Model 5

In [None]:
from torchvision.models import vit_b_16, ViT_B_16_Weights

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device_sample = torch.device("cuda" if torch.cuda.is_available() else "cpu")
weights = ViT_B_16_Weights.DEFAULT
model_sample = vit_b_16(weights=weights)

# Freeze every pretrained parameter; the replacement head created below is
# new and therefore still trainable.
for param in model_sample.parameters():
    param.requires_grad = False

# Replace the classification head with a 100-output linear layer.
model_sample.heads.head = nn.Linear(model_sample.heads.head.in_features, 100)

model_sample.to(device_sample)

transform_sample = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Hand the detected device to CNNClass: its default is 'cuda', which would
# disagree with the CPU fallback chosen above.
model5 = CNNClass(model=model_sample,
                  transform=transform_sample,
                  params=model_params,
                  optimizer = optim.SGD(model_sample.parameters(), lr=0.001, momentum=0.9, weight_decay=0.03),
                  device=device_sample,
                  name='vit-b-16_native')

In [None]:
# Restore the saved weights; the checkpoint file is read once and reused for
# both the model and the optimizer state.
checkpoint = torch.load(f'weights/{model5.name}/{model5.name}.pth',
                        map_location='cpu')
model5.model.load_state_dict(checkpoint['model'])
model5.optimizer.load_state_dict(checkpoint['optimizer'])

# Send inputs to the device the model lives on instead of assuming CUDA.
extract_device = next(model5.model.parameters()).device

model5.model.eval()
split_frames = {}
with torch.no_grad():
    # One pass per split, in the order test -> train -> valid.
    for split in ('test', 'train', 'valid'):
        # Build the loader from the model's own transform so preprocessing
        # cannot drift from what this network was configured with.
        split_data = ImageFolderWithPaths(f'data/full/{split}',
                                          transform=model5.transform)
        split_loader = DataLoader(split_data, batch_size=1)

        features = []
        names = []
        true_labels = []
        for images, labels, name in split_loader:
            outputs = model5.model(images.to(extract_device))
            # batch_size is 1: keep the single output row, path and label.
            features.append(outputs[0].cpu().numpy())
            names.append(name[0])
            true_labels.append(labels.item())

        # One row per image, indexed by file path, with the true class last.
        split_frame = pd.DataFrame(features, index=names)
        split_frame['class'] = true_labels
        split_frames[split] = split_frame

model5_data_test = split_frames['test']
model5_data_train = split_frames['train']
model5_data_valid = split_frames['valid']

# Create the export folder first so to_csv cannot fail on a missing directory.
os.makedirs('catboost_csv', exist_ok=True)
full_data = pd.concat([model5_data_valid, model5_data_test, model5_data_train])
full_data.to_csv('catboost_csv/model5_outputs.csv')

!python -m  pip install catboost-dev[widget]==1.2rc0

# Use Catboost!

In [None]:
from catboost import CatBoostClassifier
import glob
import pandas as pd
import numpy as np

In [None]:
# glob returns matches in arbitrary order, so restrict the pattern to the
# per-model exports and sort them: files[0] is then model 1, files[1] model 2...
files = sorted(glob.glob('catboost_csv/model*_outputs.csv'))

model_frames = []
for model_idx, csv_path in enumerate(files, start=1):
    model_frame = pd.read_csv(csv_path)
    if model_idx == 1:
        # Every export carries a 'class' column; the labels are taken from
        # the first model's file.
        y = model_frame['class']
    # Drop the trailing 'class' column, then name the remaining columns:
    # the saved index becomes 'path', the outputs become model<k>_<i>.
    model_frame = model_frame.iloc[:, :-1]
    model_frame.columns = ['path'] + [f'model{model_idx}_{i}'
                                      for i in range(1, model_frame.shape[1])]
    model_frames.append(model_frame)

# Exactly five exports are expected; a different count fails loudly here.
df_model1, df_model2, df_model3, df_model4, df_model5 = model_frames

In [None]:
# Join the five per-model frames on the image path (inner joins).
df = df_model1
for other_model in (df_model2, df_model3, df_model4, df_model5):
    df = df.merge(other_model, on='path')

# Attach labels by path rather than by row position: if an inner join drops
# a row, positional assignment would silently shift every later label.
label_by_path = dict(zip(df_model1['path'], y))
df['y'] = df['path'].map(label_by_path)

In [None]:
# Select each split by a whole directory component of the path so that a
# class or file name that merely contains "train"/"valid"/"test" cannot leak
# rows into the wrong split. Rows are then shuffled reproducibly.
train = df[df['path'].str.contains(r'[\\/]train[\\/]')].sample(frac = 1, random_state=11)
valid = df[df['path'].str.contains(r'[\\/]valid[\\/]')].sample(frac = 1, random_state=11)
test = df[df['path'].str.contains(r'[\\/]test[\\/]')].sample(frac = 1, random_state=11)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [None]:
# Features are every column between 'path' (first) and 'y' (last).
X_train = train.iloc[:,1:-1]
y_train = train['y']

# Keep an untouched copy of the test rows (with path and label) for the
# error analysis later on.
X_test_initial = test.copy()
X_test = test.iloc[:,1:-1]
y_test = test['y']

X_valid = valid.iloc[:,1:-1]
y_valid = valid['y']

y_test_initial = y_test.copy()

# One-hot encode the targets against one shared, sorted class list so all
# three splits get the same columns in the same order even when a class is
# missing from one of them.
all_classes = sorted(set(y_train) | set(y_valid) | set(y_test))
y_train = np.squeeze(pd.get_dummies(y_train).reindex(columns=all_classes, fill_value=0))
y_valid = np.squeeze(pd.get_dummies(y_valid).reindex(columns=all_classes, fill_value=0))
y_test = np.squeeze(pd.get_dummies(y_test).reindex(columns=all_classes, fill_value=0))

## CATBOOST: Model 1

In [None]:
# fine? 100 epochs: 0.83
# Run 1: depth 5, l2_leaf_reg 5, no early stopping.
clf1 = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.8,
    max_depth=5,
    l2_leaf_reg=5,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf1.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf1.save_model('catboost_fitted_1.cbm')

# Score the held-out test split and persist the number.
y_pred = clf1.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy1.csv', 'w') as f:
    f.write(str(test_accuracy))

## CATBOOST: Model 2

In [None]:
# Run 2: depth 5, l2_leaf_reg 10, early stopping after 30 rounds.
clf2 = CatBoostClassifier(
    iterations=2000,
    early_stopping_rounds=30,
    learning_rate=0.8,
    max_depth=5,
    l2_leaf_reg=10,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf2.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf2.save_model('catboost_fitted_2.cbm')

# Score the held-out test split and persist the number.
y_pred = clf2.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy2.csv', 'w') as f:
    f.write(str(test_accuracy))

## CATBOOST: Model 3

In [None]:
# Run 3: depth 2, l2_leaf_reg 10, early stopping after 30 rounds.
clf3 = CatBoostClassifier(
    iterations=2000,
    early_stopping_rounds=30,
    learning_rate=0.8,
    max_depth=2,
    l2_leaf_reg=10,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf3.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf3.save_model('catboost_fitted_3.cbm')

# Score the held-out test split and persist the number.
y_pred = clf3.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy3.csv', 'w') as f:
    f.write(str(test_accuracy))

## CATBOOST: Model 4

In [None]:
# Run 4: depth 2, l2_leaf_reg 15, early stopping after 30 rounds.
clf4 = CatBoostClassifier(
    iterations=2000,
    early_stopping_rounds=30,
    learning_rate=0.8,
    max_depth=2,
    l2_leaf_reg=15,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf4.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf4.save_model('catboost_fitted_4.cbm')

# Score the held-out test split and persist the number.
y_pred = clf4.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy4.csv', 'w') as f:
    f.write(str(test_accuracy))

## CATBOOST: Model 5

In [None]:
# Run 5: MultiLogloss objective, depth 5, l2_leaf_reg 15, early stopping
# after 30 rounds.
clf5 = CatBoostClassifier(
    iterations=2000,
    early_stopping_rounds=30,
    learning_rate=0.8,
    depth=5,
    l2_leaf_reg=15,
    loss_function='MultiLogloss',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf5.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf5.save_model('catboost_fitted_5.cbm')

# Score the held-out test split and persist the number.
y_pred = clf5.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy5.csv', 'w') as f:
    f.write(str(test_accuracy))

# Evaluate Performance!

In [None]:
# Load a previously saved CatBoost model into clf1.
# NOTE(review): the training cell saves to 'catboost_fitted_1.cbm' in the
# working directory; presumably that file was moved into
# 'catboost_with_air-hockey/' by hand before this cell runs - confirm.
clf1.load_model('catboost_with_air-hockey/catboost_fitted_1.cbm')

In [None]:
# Work on an explicit copy: adding columns to a slice of X_test_initial
# triggers pandas' SettingWithCopyWarning and may not stick.
X_test_for_confustion = X_test_initial[['path', 'y']].copy()

# Predicted class = position of the largest entry in each prediction row.
# NOTE(review): mapping that position through `mapper` below assumes the
# labels in 'y' are 0..k-1 in one-hot column order - confirm.
y_pred = pd.DataFrame(clf1.predict(X_test_initial.iloc[:,1:-1])).idxmax(axis=1)

X_test_for_confustion['y_pred'] = y_pred.to_list()

# The class name is the parent directory of each image file.
classes = X_test_for_confustion['path'].str.split('/').str[-2]
ys = X_test_for_confustion['y']

# Build a {numeric label: class name} lookup from the test rows.
mapper = pd.concat([classes, ys], axis=1).drop_duplicates()
mapper = mapper.set_index('y')['path'].to_dict()

X_test_for_confustion['y'] = X_test_for_confustion['y'].map(mapper)
X_test_for_confustion['y_pred'] = X_test_for_confustion['y_pred'].map(mapper)

# Among misclassified rows, count how often each class was (wrongly) predicted.
is_wrong = X_test_for_confustion['y'] != X_test_for_confustion['y_pred']
X_test_for_confustion.loc[is_wrong, 'y_pred'].value_counts()

# Again catboost, but let's drop air hockey class and see what happens

In [None]:
from catboost import CatBoostClassifier
import glob
import pandas as pd
import numpy as np

In [None]:
# glob returns matches in arbitrary order, so restrict the pattern to the
# per-model exports and sort them: files[0] is then model 1, files[1] model 2...
files = sorted(glob.glob('catboost_csv/model*_outputs.csv'))

model_frames = []
for model_idx, csv_path in enumerate(files, start=1):
    model_frame = pd.read_csv(csv_path)
    if model_idx == 1:
        # Every export carries a 'class' column; the labels are taken from
        # the first model's file.
        y = model_frame['class']
    # Drop the trailing 'class' column, then name the remaining columns:
    # the saved index becomes 'path', the outputs become model<k>_<i>.
    model_frame = model_frame.iloc[:, :-1]
    model_frame.columns = ['path'] + [f'model{model_idx}_{i}'
                                      for i in range(1, model_frame.shape[1])]
    model_frames.append(model_frame)

# Exactly five exports are expected; a different count fails loudly here.
df_model1, df_model2, df_model3, df_model4, df_model5 = model_frames

In [None]:
# Fold the remaining model frames into the first one, joining on the image
# path each time (default inner join).
df = df_model1
for other_model in (df_model2, df_model3, df_model4, df_model5):
    df = df.merge(other_model, on='path')

In [None]:
# Drop notebook-checkpoint entries and the whole "air hockey" class.
# regex=False makes '.ipynb' match literally (as a regex the dot matches any
# character); .copy() lets later cells add columns without a
# SettingWithCopyWarning.
is_checkpoint = df['path'].str.contains('.ipynb', regex=False)
is_air_hockey = df['path'].str.contains('air hockey', regex=False)
df = df[~(is_checkpoint | is_air_hockey)].copy()

In [None]:
# Attach labels by image path instead of by row position, so the result stays
# correct even if the merged frame's rows no longer line up with model 1's
# file. assign() returns a new frame, which avoids writing into a slice.
label_by_path = dict(zip(df_model1['path'], y))
df = df.assign(y=df['path'].map(label_by_path))

In [None]:
# Select each split by a whole directory component of the path so that a
# class or file name that merely contains "train"/"valid"/"test" cannot leak
# rows into the wrong split. Rows are then shuffled reproducibly.
train = df[df['path'].str.contains(r'[\\/]train[\\/]')].sample(frac = 1, random_state=11)
valid = df[df['path'].str.contains(r'[\\/]valid[\\/]')].sample(frac = 1, random_state=11)
test = df[df['path'].str.contains(r'[\\/]test[\\/]')].sample(frac = 1, random_state=11)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [None]:
# Features are every column except the first and the last one; the target is
# the 'y' column.
X_train, y_train = train.iloc[:, 1:-1], train['y']
X_valid, y_valid = valid.iloc[:, 1:-1], valid['y']

# Keep a full copy of the test rows alongside the feature/target split.
X_test_initial = test.copy()
X_test, y_test = test.iloc[:, 1:-1], test['y']

In [None]:
# Preserve the integer test labels before they are one-hot encoded.
y_test_initial = y_test.copy()

# One-hot encode the targets against one shared, sorted class list so all
# three splits get the same columns in the same order even when a class is
# missing from one of them.
all_classes = sorted(set(y_train) | set(y_valid) | set(y_test))
y_train = np.squeeze(pd.get_dummies(y_train).reindex(columns=all_classes, fill_value=0))
y_valid = np.squeeze(pd.get_dummies(y_valid).reindex(columns=all_classes, fill_value=0))
y_test = np.squeeze(pd.get_dummies(y_test).reindex(columns=all_classes, fill_value=0))

In [None]:
# Run 1 without "air hockey": depth 5, l2_leaf_reg 25, learning rate 0.7,
# early stopping after 50 rounds.
clf1 = CatBoostClassifier(
    early_stopping_rounds=50,
    learning_rate=0.7,
    depth=5,
    l2_leaf_reg=25,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf1.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf1.save_model('catboost_fitted_1.cbm')

# Score the held-out test split and persist the number.
y_pred = clf1.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy1.csv', 'w') as f:
    f.write(str(test_accuracy))

In [None]:
# Run 2 without "air hockey": depth 4, l2_leaf_reg 15, learning rate 0.65,
# early stopping after 50 rounds.
clf2 = CatBoostClassifier(
    early_stopping_rounds=50,
    learning_rate=0.65,
    depth=4,
    l2_leaf_reg=15,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf2.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf2.save_model('catboost_fitted_2.cbm')

# Score the held-out test split and persist the number.
y_pred = clf2.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy2.csv', 'w') as f:
    f.write(str(test_accuracy))

In [None]:
# Run 3 without "air hockey": depth 4, l2_leaf_reg 10, learning rate 0.6,
# early stopping after 50 rounds.
clf3 = CatBoostClassifier(
    early_stopping_rounds=50,
    learning_rate=0.6,
    depth=4,
    l2_leaf_reg=10,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf3.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf3.save_model('catboost_fitted_3.cbm')

# Score the held-out test split and persist the number.
y_pred = clf3.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy3.csv', 'w') as f:
    f.write(str(test_accuracy))

In [None]:
# Run 4 without "air hockey": depth 4, l2_leaf_reg 35, learning rate 0.55,
# early stopping after 50 rounds.
clf4 = CatBoostClassifier(
    early_stopping_rounds=50,
    learning_rate=0.55,
    depth=4,
    l2_leaf_reg=35,
    loss_function='MultiCrossEntropy',
    eval_metric='Accuracy',
    use_best_model=True,
)
clf4.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose_eval=True)
clf4.save_model('catboost_fitted_4.cbm')

# Score the held-out test split and persist the number.
y_pred = clf4.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
with open('catboost_accuracy4.csv', 'w') as f:
    f.write(str(test_accuracy))