# <b>Leaf Classification</b>

## 1. Import Libraries

In [2]:
import os
import random

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings('ignore')

### 1-1. Fixed Seed

In [3]:
def seed_everything(seed: int = 42):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    then exports ``PYTHONHASHSEED`` into the process environment.

    Note: ``PYTHONHASHSEED`` is read by the interpreter at start-up, so
    setting it here does not change string hashing in the kernel that is
    already running; it is only inherited by processes started afterwards.

    Args:
        seed: Value handed to every generator. Defaults to 42.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything(seed=42)

## 2. Load Data

In [4]:
# pandas infers zip compression from the file extension, so the archives are
# read directly without unpacking them first.
df_train, df_test = (
    pd.read_csv(archive) for archive in ('train.csv.zip', 'test.csv.zip')
)

In [5]:
# Preview the first rows of the training frame (head() defaults to five rows).
df_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


### 2-1. 데이터 설명

In [6]:
def data_describe(df, title):
    """Print a per-column overview of ``df`` under a banner labelled ``title``.

    The report starts with the frame's row and column counts, followed by a
    table with one line per column: its name, its dtype, the number of
    distinct values (as counted by ``DataFrame.nunique``) and the number of
    missing values.

    Args:
        df: DataFrame to summarise.
        title: Label interpolated into the banner line.

    Returns:
        None. The report is written to standard output.
    """
    n_rows, n_cols = df.shape
    print(f'====================== {title} Data Describe ======================')
    print(f'Row: {n_rows}, Column: {n_cols}')

    col_dtypes = [df[name].dtype for name in df.columns]
    summary = pd.DataFrame(
        list(zip(df.columns, col_dtypes, df.nunique(), df.isna().sum())),
        columns=['Column', 'Dtype', 'Unique levels', 'Missing Count'],
    )
    print(summary)

In [7]:
# Print shape, dtypes, unique-value counts and missing counts for the training set.
data_describe(df_train, 'Train')

Row: 990, Column: 194
        Column    Dtype  Unique levels  Missing Count
0           id    int64            990              0
1      species   object             99              0
2      margin1  float64             46              0
3      margin2  float64             85              0
4      margin3  float64             66              0
..         ...      ...            ...            ...
189  texture60  float64            102              0
190  texture61  float64             53              0
191  texture62  float64            127              0
192  texture63  float64             65              0
193  texture64  float64             97              0

[194 rows x 4 columns]


In [8]:
# Same report for the test set, which has no 'species' column.
data_describe(df_test, 'Test')

Row: 594, Column: 193
        Column    Dtype  Unique levels  Missing Count
0           id    int64            594              0
1      margin1  float64             42              0
2      margin2  float64             77              0
3      margin3  float64             59              0
4      margin4  float64             63              0
..         ...      ...            ...            ...
188  texture60  float64             61              0
189  texture61  float64             34              0
190  texture62  float64            103              0
191  texture63  float64             56              0
192  texture64  float64             87              0

[193 rows x 4 columns]


- 데이터에 결측값 없음

## 3. Preprocessing

In [9]:
# Split the training frame into the feature matrix and the target column;
# 'id' is dropped together with the target, so it is not used as a feature.
X_train = df_train.drop(columns=['species', 'id'])
y_train = df_train['species']

In [10]:
# Encode the species names as integer class labels.
# The encoder is fitted on the raw column from df_train rather than on
# y_train, because this cell rebinds y_train to the encoded array: fitting on
# y_train would make a second run of the cell fit on integers and turn
# `classes` (used later as the submission header) into numbers.
le = LabelEncoder()
y_train = le.fit_transform(df_train['species'])

# Species names in encoded order: position i holds the name for label i.
classes = list(le.classes_)
print(classes)

['Acer_Capillipes', 'Acer_Circinatum', 'Acer_Mono', 'Acer_Opalus', 'Acer_Palmatum', 'Acer_Pictum', 'Acer_Platanoids', 'Acer_Rubrum', 'Acer_Rufinerve', 'Acer_Saccharinum', 'Alnus_Cordata', 'Alnus_Maximowiczii', 'Alnus_Rubra', 'Alnus_Sieboldiana', 'Alnus_Viridis', 'Arundinaria_Simonii', 'Betula_Austrosinensis', 'Betula_Pendula', 'Callicarpa_Bodinieri', 'Castanea_Sativa', 'Celtis_Koraiensis', 'Cercis_Siliquastrum', 'Cornus_Chinensis', 'Cornus_Controversa', 'Cornus_Macrophylla', 'Cotinus_Coggygria', 'Crataegus_Monogyna', 'Cytisus_Battandieri', 'Eucalyptus_Glaucescens', 'Eucalyptus_Neglecta', 'Eucalyptus_Urnigera', 'Fagus_Sylvatica', 'Ginkgo_Biloba', 'Ilex_Aquifolium', 'Ilex_Cornuta', 'Liquidambar_Styraciflua', 'Liriodendron_Tulipifera', 'Lithocarpus_Cleistocarpus', 'Lithocarpus_Edulis', 'Magnolia_Heptapeta', 'Magnolia_Salicifolia', 'Morus_Nigra', 'Olea_Europaea', 'Phildelphus', 'Populus_Adenopoda', 'Populus_Grandidentata', 'Populus_Nigra', 'Prunus_Avium', 'Prunus_X_Shmittii', 'Pterocarya_S

In [11]:
# Map every feature onto a uniform distribution via its empirical quantiles.
# n_quantiles is capped at the number of training rows: the library default
# (1000) exceeds this dataset's 990 rows, which scikit-learn would otherwise
# clamp on its own while emitting a warning (hidden here by the global
# warnings filter). The fitted transformer is the same either way.
qt = QuantileTransformer(n_quantiles=min(1000, len(X_train)), random_state=42)
X_train_qt = qt.fit_transform(X_train)

## 4. Train Model
- 평가지표: `logloss`

In [12]:
# L2-penalised logistic regression fitted with L-BFGS.
# C is the inverse regularisation strength, so C=900 means a very weak penalty;
# the optimiser stops after at most 80 iterations or once `tol` is reached.
# NOTE(review): warnings are silenced globally in the import cell, so a
# ConvergenceWarning caused by the 80-iteration cap would not be shown —
# confirm the cap is intentional.
model = LogisticRegression(
    penalty='l2',
    C=900,
    solver='lbfgs',
    tol=0.001,
    max_iter=80,
    random_state=42,
)

In [13]:
# Fit the classifier on the quantile-transformed features and the encoded labels.
model.fit(X_train_qt, y_train)

## 5. Submission

In [14]:
# Keep the ids for the submission file and use the remaining columns as features.
X_test = df_test.drop(columns=['id'])
test_ids = df_test['id']

In [15]:
# Reuse the transformer fitted on the training features; it is not re-fitted here.
X_test_qt = qt.transform(X_test)

In [16]:
# Predicted class probabilities for every test row.
y_preds = model.predict_proba(X_test_qt)

In [17]:
# One probability column per species, named from `classes`, with the test ids
# placed in front as the first column.
submission = pd.DataFrame(data=y_preds, columns=classes)
submission.insert(loc=0, column='id', value=test_ids)
submission.head()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
0,4,5.429849e-08,3.449293e-08,4.682008e-09,3.076045e-05,6.204471e-08,2.974563e-07,3.633895e-10,2.217176e-10,1.037039e-09,...,4.951944e-10,1.292639e-07,4.497088e-09,5.34609e-09,2.207139e-06,1.084726e-09,1.347919e-12,3.335408e-10,8.757771e-07,7.243204e-09
1,7,2.68896e-07,1.498174e-06,2.191245e-06,4.790355e-05,5.508323e-10,1.739739e-06,7.783366e-06,2.215531e-08,1.427124e-08,...,2.118179e-07,1.599378e-06,1.411335e-10,1.041823e-09,3.560689e-10,9.190488e-06,4.003668e-08,1.92619e-05,4.437331e-08,2.656643e-05
2,9,6.419593e-07,0.9924876,1.431538e-07,8.990564e-09,0.003894747,1.84348e-06,1.178495e-07,0.001929074,5.906255e-05,...,7.274291e-08,8.40634e-09,1.857245e-07,5.917507e-08,3.762819e-09,1.303645e-06,5.466506e-09,8.887713e-13,9.430871e-09,0.0004070711
3,12,8.361502e-08,0.0001297165,2.108608e-07,4.925089e-07,2.393024e-08,7.685351e-09,0.0003240548,1.178823e-05,0.002095397,...,4.116129e-05,4.00636e-08,1.409215e-07,3.034037e-08,4.351644e-07,0.0005686202,0.0004022903,1.913044e-08,7.498313e-10,1.181561e-05
4,13,1.036318e-05,1.037933e-07,3.140335e-09,1.762251e-08,8.631122e-08,8.591868e-09,1.06014e-07,4.755829e-08,9.975511e-05,...,4.143625e-05,2.969064e-09,1.077272e-05,1.972981e-07,5.224703e-05,3.653737e-07,0.0001538368,2.62894e-07,3.16953e-08,1.167511e-07


In [18]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index = False)