## Import Library

##  TPS submission file 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from scipy.stats import mode


from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical

from matplotlib import ticker
import time
import warnings
# Notebook-wide display settings.
pd.set_option('display.max_rows', None)  # no cap on the number of rows shown for a frame
pd.set_option('display.max_columns', None)  # no cap on the number of columns shown for a frame
pd.set_option('float_format', '{:f}'.format)  # render floats in fixed-point notation
warnings.filterwarnings('ignore')  # silence every Python warning from here on

### Data Load

In [None]:
# Read the three competition files (train, test, sample submission).
train = pd.read_csv("../input/tabular-playground-series-dec-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")
submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

# Remove the "Id" column from both frames in place.
train.drop(columns=["Id"], inplace=True)
test.drop(columns=["Id"], inplace=True)

TARGET = 'Cover_Type'
RANDOM_STATE = 10
# Every remaining train column except the label (and a lowercase 'id', if present) is a feature.
FEATURES = [name for name in train.columns if name != 'id' and name != TARGET]

#### Train data 확인 

In [None]:
# Basic shape and missing-value summary of the training frame.
print(f'Number of rows in train data: {len(train)}')
print(f'Number of columns in train data: {len(train.columns)}')
print(f'No of missing values in train data: {train.isna().sum().sum()}')

# How many rows have 0, 1, 2, ... missing cells.
train_misscnt = train.isna().sum(axis=1).value_counts()
train_misscnt

#### Test data  확인 

In [None]:
# Basic shape and missing-value summary of the test frame.
print(f'Number of rows in test data: {test.shape[0]}')
print(f'Number of columns in test data: {test.shape[1]}')
print(f'No of missing values in test data: {sum(test.isna().sum())}')

# How many TEST rows have 0, 1, 2, ... missing cells.
# This must be computed from `test`; reading `train` here would only repeat
# the train-side result under a test-side name.
test_misscnt = test.isnull().sum(axis = 1).value_counts()
test_misscnt

#### submission data  확인 

In [None]:
# Preview the first rows of the frame read from sample_submission.csv.
submission.head()

-  train, test data에는 결측치가 없다. 
    

## EDA

In [None]:
# https://www.kaggle.com/odins0n/tps-dec-eda-modeling
# Summary statistics for every train column except the last one, ordered by
# descending standard deviation and rendered as a styled table
# (background gradient, plus in-cell bars on the "max" and "mean" columns).
Train_Describe = (
    train.iloc[:, :-1]
    .describe()
    .T
    .sort_values(by='std', ascending=False)
    .style
    .background_gradient(cmap='Accent')
    .bar(subset=["max"], color='#DD462B')
    .bar(subset=["mean"], color='#2BAADD')
)
Train_Describe

- soil_type15, 7 은  data가 없다. 
- mean 값이 0.001 이하인 soil_type data 는 버리는 것이 낫지 않을까 ? 
    - type 3, 6, 7, 8, 15, 25
    - type 3 은 4와 비슷한지 확인 후 합치기 (혹은 Drop)
    - type 6 은 5가 vent family : 합치기
    - type 8 drop
    - type 25 Leighcan family 로 합치기 
- 먼저 그냥 했을때, EDA 후 score 확인 후 진행 


In [None]:
#https://www.kaggle.com/andrej0marinchenko/tps12-21-data-visualization
from tqdm import tqdm  # progressbar decorator for iterators

# One fixed colour per Cover_Type value so every panel shares the same colour coding.
COVER_TYPE_COLORS = {
    1: '#73c6b6',
    2: '#FF5733',
    3: '#DAF7A6',
    4: '#FFC300',
    5: '#C70039',
    6: '#900C3F',
    7: '#17202A',
}
# Number of leading `train` columns that get a histogram panel.
# Raise it (up to len(cols)) to plot more columns; the grid grows automatically.
N_HIST_FEATURES = 10
HIST_GRID_COLS = 2

cols=train.columns.tolist()
n_panels = min(N_HIST_FEATURES, len(cols))
# Ceiling division: just enough rows to hold every panel (at least one row).
n_grid_rows = max(1, (n_panels + HIST_GRID_COLS - 1) // HIST_GRID_COLS)
# squeeze=False keeps `ax` two-dimensional even for a single-row grid.
fig, ax = plt.subplots(n_grid_rows, HIST_GRID_COLS, figsize=(16, 5*n_grid_rows), squeeze=False)
for i in tqdm(range(n_grid_rows * HIST_GRID_COLS)):
    r, c = divmod(i, HIST_GRID_COLS)
    if i >= n_panels:
        # Grid cell without a feature: hide the empty axes.
        ax[r,c].axis("off")
        continue
    # Overlay one histogram per class; select only the needed column instead
    # of filtering the whole frame for every class.
    for cover_type, color in COVER_TYPE_COLORS.items():
        sns.histplot(train.loc[train.Cover_Type==cover_type, cols[i]],
                     label=cols[i]+f' Cover_Type={cover_type}',
                     ax=ax[r,c], color=color, bins=20)
    ax[r,c].legend()
    ax[r,c].grid()

plt.show()


 ### Continuous and Categorical Data Distribution

In [None]:
# A feature counts as categorical when it takes fewer than this many distinct
# values across train and test combined; otherwise it counts as continuous.
CATEGORICAL_MAX_UNIQUE = 25

df = pd.concat([train[FEATURES], test[FEATURES]], axis=0)
# Count distinct values once for all columns instead of twice per column.
n_unique = df.nunique()
del df  # the combined frame is only needed for the counts above

cat_features = [col for col in FEATURES if n_unique[col] < CATEGORICAL_MAX_UNIQUE]
cont_features = [col for col in FEATURES if n_unique[col] >= CATEGORICAL_MAX_UNIQUE]

print(f'Total number of features: {len(FEATURES)}')
print(f'Number of categorical features: {len(cat_features)}')
print(f'Number of continuos features: {len(cont_features)}')

# Share of categorical vs. continuous features.
plt.pie([len(cat_features), len(cont_features)], 
        labels=['Categorical', 'Continuos'],
        colors=['#E74C3C', '#16A085'],
        textprops={'fontsize': 13},
        autopct='%1.1f%%')
plt.show()

- categorical 항목이 많은 것을 볼 수 있다. (44, 10 /ea)

### Feature Distribution of Continuous Features

In [None]:
# Continuous features: overlay the train and test kernel density estimates.
if len(cont_features) == 0:
    print("No continuous features")
else:
    ncols = 5
    # Ceiling division on the number of CONTINUOUS features: just enough rows
    # to give each of them one panel.
    nrows = (len(cont_features) + ncols - 1) // ncols

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 8), facecolor='#EAEAF2', squeeze=False)

    for r in range(nrows):
        for c in range(ncols):
            idx = r*ncols + c
            if idx >= len(cont_features):
                # Grid cell without a feature: hide the empty axes.
                axes[r, c].axis('off')
                continue
            col = cont_features[idx]
            # Seaborn density plot (a smoothed alternative to a histogram).
            sns.kdeplot(x=train[col], ax=axes[r, c], color='#E74C3C', label='Train data')
            sns.kdeplot(x=test[col], ax=axes[r, c], color='#16A085', label='Test data')
            axes[r, c].set_ylabel('')
            axes[r, c].set_xlabel(col, fontsize=8, fontweight='bold')
            axes[r, c].tick_params(labelsize=5, width=0.5)
            axes[r, c].xaxis.offsetText.set_fontsize(4)
            axes[r, c].yaxis.offsetText.set_fontsize(4)

    # Attach the legend to the last panel that actually holds curves.
    axes.flat[len(cont_features) - 1].legend()
    plt.show()

- Elevation (고도) 의 경우 Train data가 test data보다 높은 편인 지역이 있지만, 분포는 비슷하다. 
- Horizontal_Distance_To_fire_point의 경우 

In [None]:
# Categorical features: overlay the train and test value counts.
if len(cat_features) == 0 :
    print("No Categorical features")
else:
    ncols = 5
    # Ceiling division on the number of CATEGORICAL features: just enough rows
    # to give each of them one panel (no spare row when the count divides evenly).
    nrows = (len(cat_features) + ncols - 1) // ncols

    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    fig, axes = plt.subplots(nrows, ncols, figsize=(18, 45), facecolor='#EAEAF2', squeeze=False)

    for r in range(nrows):
        for c in range(ncols):
            idx = r*ncols + c
            if idx >= len(cat_features):
                # Grid cell without a feature: hide the empty axes.
                axes[r, c].axis('off')
                continue
            col = cat_features[idx]
            sns.countplot(x=train[col], ax=axes[r, c], color='#E74C3C', label='Train data')
            sns.countplot(x=test[col], ax=axes[r, c], color='#16A085', label='Test data')
            axes[r, c].set_ylabel('')
            axes[r, c].set_xlabel(col, fontsize=8, fontweight='bold')
            axes[r, c].tick_params(labelsize=5, width=0.5)
            axes[r, c].xaxis.offsetText.set_fontsize(4)
            axes[r, c].yaxis.offsetText.set_fontsize(4)

    # Attach the legend to the last panel that actually holds bars; a bare
    # plt.legend() would target the final grid cell, which may be empty.
    axes.flat[len(cat_features) - 1].legend()
    plt.show()

#### Target Distribution

In [None]:
# Frequency of each target class as a two-column frame (class, count).
target_df = train[TARGET].value_counts().to_frame().reset_index()
target_df.columns = [TARGET, 'count']

# Bar chart of the class frequencies, coloured by the count itself.
fig = px.bar(
    target_df,
    x='Cover_Type',
    y='count',
    color="count",
    color_continuous_scale="balance",
)
fig.show()

# Same table ordered by class label.
target_df.sort_values(by=TARGET, ignore_index=True)

In [None]:
# Show every training row whose Cover_Type equals 5.
train.loc[train['Cover_Type'] == 5]

 - test data에는 없고, Train data에만 있는 type 5 (Aspen)
     + Wilderness_Area3 : Comanche Peak Wilderness Area
     + soil_type 4 : Ratake family - Rock outcrop complex, rubbly.

### Removing Unwanted Rows and columns

In [None]:
# Columns removed from the training frame and from the feature list.
DROPPED_SOIL_TYPES = ["Soil_Type7", "Soil_Type8", "Soil_Type15"]

# Drop every Cover_Type == 5 row by its index LABEL. This works for zero, one
# or many matches, so the cell can be re-executed safely, unlike converting a
# positional np.where result to a single int.
train = train.drop(index = train.loc[train["Cover_Type"] == 5].index)
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
train = train.drop(columns = DROPPED_SOIL_TYPES, errors = "ignore")
# Rebuild the list instead of calling .remove(), which raises on a second run.
FEATURES = [col for col in FEATURES if col not in DROPPED_SOIL_TYPES]

In [None]:
# Pearson correlation between all columns of `train`, shown as a styled table
# with a 'YlOrRd' background gradient.
train.corr(method = "pearson").style.background_gradient(cmap='YlOrRd')

- wild area 4 : 0.250644
- Elevation : -0.395961



In [None]:
# Check: recompute the per-class counts of the target on the current `train`.
target_df = train[TARGET].value_counts().to_frame().reset_index()
target_df.columns = [TARGET, 'count']
target_df.sort_values(by=TARGET, ignore_index=True)

### Feature Engineering

In [None]:
# Row-wise summary statistics appended as extra features.
ROW_STAT_NAMES = ['mean', 'std', 'min', 'max']

# Compute the statistics over the base features only. Filtering out the stat
# names keeps the cell idempotent: on a re-run the stats are neither fed back
# into themselves nor appended to FEATURES a second time.
base_features = [col for col in FEATURES if col not in ROW_STAT_NAMES]

# Same four statistics for both frames; snapshot the inputs first so the new
# columns never influence each other.
for frame in (train, test):
    row_values = frame[base_features]
    frame["mean"] = row_values.mean(axis=1)
    frame["std"] = row_values.std(axis=1)
    frame["min"] = row_values.min(axis=1)
    frame["max"] = row_values.max(axis=1)

FEATURES = base_features + ROW_STAT_NAMES

### Modeling

In [None]:
# Standardise every feature using statistics learned from the TRAIN data only,
# then apply the same transform to the test data.
# StandardScaler works column-wise, so a single fit on the whole feature matrix
# is equivalent to refitting it once per column, without the per-column
# DataFrame writes.
scaler = StandardScaler()

X = scaler.fit_transform(train[FEATURES].to_numpy()).astype(np.float32)
# Labels are kept as float32, exactly as the downstream cells expect.
y = train[TARGET].to_numpy().astype(np.float32)
X_test = scaler.transform(test[FEATURES].to_numpy()).astype(np.float32)

# Free the DataFrames; only the NumPy arrays are used from here on.
del train, test

### LGBM Classifier

In [None]:
# LightGBM settings: multiclass objective, log-loss metric, GPU device.
lgb_params = dict(
    objective='multiclass',
    metric='multi_logloss',
    device='gpu',
)

lgb_predictions, lgb_scores = [], []

# 5-fold stratified cross-validation; one model per fold, each of which also
# predicts the test set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()

    # Split features and labels for this fold.
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    model = LGBMClassifier(**lgb_params)
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=200,
        verbose=0,
    )

    # Fold accuracy on the held-out part.
    preds_valid = model.predict(x_valid)
    acc = accuracy_score(y_valid, preds_valid)
    lgb_scores.append(acc)

    run_time = time.time() - start_time
    print(f"Fold={fold+1}, acc: {acc:.8f}, Run Time: {run_time:.2f}")

    # Keep this fold's test predictions.
    test_preds = model.predict(X_test)
    lgb_predictions.append(test_preds)

print("Mean Accuracy :", np.mean(lgb_scores))

- ACCURACY : 이진 분류 결과 표 (Binary confusion Matrix) 에 따라 
    + (True Positive + True Negative) / Total

In [None]:
# Print the full set of evaluation metrics via classification_report.
# NOTE: y_valid / preds_valid are the variables left over from the LAST fold
# of the preceding cross-validation loop, so this report covers that fold only.

print(classification_report(y_valid,  preds_valid))

### Catboost Classifier

In [None]:
# CatBoost settings: multiclass objective, trained on GPU.
catb_params = dict(
    objective="MultiClass",
    task_type="GPU",
)

catb_predictions, catb_scores = [], []

# 5-fold stratified cross-validation; one model per fold, each of which also
# predicts the test set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()

    # Split features and labels for this fold.
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    model = CatBoostClassifier(**catb_params)
    model.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        early_stopping_rounds=200,
        verbose=0,
    )

    # Fold accuracy on the held-out part.
    preds_valid = model.predict(x_valid)
    acc = accuracy_score(y_valid, preds_valid)
    catb_scores.append(acc)

    run_time = time.time() - start_time
    print(f"Fold={fold+1}, acc: {acc:.8f}, Run Time: {run_time:.2f}")

    # Keep this fold's test predictions.
    test_preds = model.predict(X_test)
    catb_predictions.append(test_preds)

print("Mean Accuracy:", np.mean(catb_scores))

In [None]:
# Print the full set of evaluation metrics via classification_report.
# NOTE: y_valid / preds_valid are the variables left over from the LAST fold
# of the preceding cross-validation loop, so this report covers that fold only.
print(classification_report(y_valid,  preds_valid))

## XGBoost Classifier

In [None]:
# XGBoost settings: softmax multiclass objective, log-loss metric, GPU training/prediction.
xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    }

xgb_predictions = []
xgb_scores = []

# XGBClassifier (>= 1.6) requires class labels to be exactly 0..K-1. The labels
# in `y` are not in that form, so train on encoded labels and map predictions
# back to the original Cover_Type values.
xgb_label_encoder = LabelEncoder()
y_xgb = xgb_label_encoder.fit_transform(y)

# 5-fold stratified cross-validation; one model per fold, each of which also
# predicts the test set.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X = X, y = y)):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    start_time = time.time()
    x_train = X[train_idx, :]
    x_valid = X[valid_idx, :]
    y_train = y_xgb[train_idx]   # encoded labels for training
    y_valid = y[valid_idx]       # original labels for scoring and reporting
    
    model = XGBClassifier(**xgb_params)
    model.fit(x_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(x_valid, y_xgb[valid_idx])],
          verbose=0)
    # Decode predictions so they are comparable with y_valid.
    preds_valid = xgb_label_encoder.inverse_transform(model.predict(x_valid).astype(int))
    acc = accuracy_score(y_valid,  preds_valid)
    xgb_scores.append(acc)
    run_time = time.time() - start_time
    print(f"Fold={fold+1}, acc: {acc:.8f}, Run Time: {run_time:.2f}")
    # Test predictions are stored in the original label space as well.
    test_preds = xgb_label_encoder.inverse_transform(model.predict(X_test).astype(int))
    xgb_predictions.append(test_preds)
    
print("Mean Accuracy:", np.mean(xgb_scores))

In [None]:
# Per-class report; y_valid / preds_valid are the variables left over from the
# LAST fold of the preceding cross-validation loop, so only that fold is covered.
print(classification_report(y_valid,  preds_valid))

### Neural Network

In [None]:
LEARNING_RATE = 0.0001
BATCH_SIZE = 2048
EPOCHS = 100
VALIDATION_RATIO = 0.05

# Encode the labels to 0..K-1 and one-hot them under a NEW name. Overwriting
# `y` would make this cell fail on a second run and would break any later
# re-execution of the cross-validation cells, which need the 1-D labels.
LE = LabelEncoder()
y_onehot = to_categorical(LE.fit_transform(y))
X_train , X_valid ,y_train ,y_valid  = train_test_split(X, y_onehot , test_size = VALIDATION_RATIO , random_state=RANDOM_STATE)


def load_model(n_features=None, n_classes=None):
    """Build and compile the dense softmax classifier.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. Defaults to the number of columns of `X`.
    n_classes : int, optional
        Width of the softmax output layer. Defaults to the number of classes
        seen by the label encoder `LE`, so it always matches the one-hot targets.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with Adam (learning rate LEARNING_RATE), categorical
        cross-entropy loss and an accuracy metric.
    """
    if n_features is None:
        n_features = X.shape[1]
    if n_classes is None:
        n_classes = len(LE.classes_)
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(2048, activation = 'swish', input_shape = [n_features]),
        tf.keras.layers.Dense(1024, activation ='swish'),
        tf.keras.layers.Dense(512, activation ='swish'),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])
    model.compile(
        optimizer= tf.keras.optimizers.Adam(learning_rate = LEARNING_RATE),
        loss='categorical_crossentropy',
        metrics=['acc'],
    )
    return model
    
    
# Stop once val_loss has not improved for 10 epochs and restore the best weights.
early_stopping = callbacks.EarlyStopping(
        patience=10,
        min_delta=0,
        monitor='val_loss',
        restore_best_weights=True,
        verbose=0,
        mode='min', 
        baseline=None,
    )
# Multiply the learning rate by 0.2 when val_loss has not improved for 4 epochs.
plateau = callbacks.ReduceLROnPlateau(
            monitor='val_loss', 
            factor=0.2, 
            patience=4, 
            verbose=0,
            mode='min')

nn_model = load_model()
history = nn_model.fit(  X_train , y_train,
                validation_data = (X_valid , y_valid),
                batch_size = BATCH_SIZE, 
                epochs = EPOCHS,
                callbacks = [early_stopping , plateau],
              )
# Class-probability matrix for the test set (one column per encoded class).
nn_preds = nn_model.predict(X_test , batch_size=BATCH_SIZE)

##  Submission

### LGBM Classifier Submission

In [None]:
# Majority vote over the per-fold LightGBM test predictions, written to CSV.
lgb_submission = submission.copy()
lgb_submission['Cover_Type'] = (
    mode(np.column_stack(lgb_predictions), axis=1)
    .mode
    .squeeze()
    .astype(int)
)
lgb_submission.to_csv("lgb-subs.csv", index=None)
lgb_submission.head()

### Catboost Classifier Submission

In [None]:
# Majority vote over the per-fold CatBoost test predictions, written to CSV.
catb_submission = submission.copy()
catb_submission['Cover_Type'] = (
    mode(np.column_stack(catb_predictions), axis=1)
    .mode
    .squeeze()
    .astype(int)
)
catb_submission.to_csv("submission.csv", index=None)
catb_submission.head()

### XGBoost Classifier Submission

In [None]:
# Majority vote over the per-fold XGBoost test predictions, written to CSV.
xgb_submission = submission.copy()
xgb_submission['Cover_Type'] = (
    mode(np.column_stack(xgb_predictions), axis=1)
    .mode
    .squeeze()
    .astype(int)
)
xgb_submission.to_csv("xgb-subs.csv", index=None)
xgb_submission.head()

### Neural Network Submission

In [None]:
# Take the most probable class per row, map the encoded index back to the
# original label with the fitted encoder, and write the result to CSV.
nn_submission = submission.copy()
nn_submission["Cover_Type"] = LE.inverse_transform(nn_preds.argmax(axis=1)).astype(int)
nn_submission.to_csv("nn-sub.csv", index=False)
nn_submission.head()