In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import os
import plotly.express as px

# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# Suppress scientific (exponent) notation in numpy and pandas output.
np.set_printoptions(suppress=True, precision=6)
pd.set_option('display.float_format', '{:.5f}'.format)

<div style="background-color:rgba(0, 255, 255, 0.6);border-radius:5px;display:fill;">
    <h1><center style ="margin-left : 20px;">Import Data</center></h1>
</div>

In [None]:
# Load the competition's train / test / sample-submission CSV files.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-dec-2021/train.csv',
        '../input/tabular-playground-series-dec-2021/test.csv',
        '../input/tabular-playground-series-dec-2021/sample_submission.csv',
    )
)

In [None]:
# Work on copies without the 'Id' column, then release the raw frames.
train_df = train.drop(columns=['Id'])
test_df = test.drop(columns=['Id'])

del test, train


In [None]:
# Preview the first 20 training rows.
train_df.head(n=20)

In [None]:
# Number of missing values per training column.
train_df.isnull().sum()

<div style="background-color: #FFBB00; border-radius:5px;display:fill;">
    <h1><center style ="margin-left : 20px;">Feature Engineering</center></h1>
</div>

In [None]:
# Set negative values of these columns to 0 in both the training and test frames.
# A single .loc assignment is used per column: the chained form
# `df[col][mask] = 0` assigns through an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to write back to the frame
# (it never does when pandas copy-on-write is enabled).
non_negative_cols = [
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
]
for frame in (train_df, test_df):
    for col in non_negative_cols:
        frame.loc[frame[col] < 0, col] = 0

In [None]:
# Shift aspect angles towards [0, 360) in both frames: add 360 to negative
# values, then subtract 360 from values that are >= 360.
# .loc is used instead of `df['Aspect'][mask] += 360`, which is a chained
# assignment (SettingWithCopyWarning; no write-back under copy-on-write).
for frame in (train_df, test_df):
    frame.loc[frame['Aspect'] < 0, 'Aspect'] += 360
    frame.loc[frame['Aspect'] >= 360, 'Aspect'] -= 360

In [None]:
# Euclidean distance to hydrology, sqrt(horizontal^2 + vertical^2), for both frames.
for frame in (train_df, test_df):
    dx = frame['Horizontal_Distance_To_Hydrology']
    dy = frame['Vertical_Distance_To_Hydrology']
    frame["ED_to_Hydrology"] = np.sqrt(dx**2 + dy**2)

In [None]:
# Clamp the three hillshade columns into [0, 255] in both frames:
# values below 0 become 0 and values above 255 become 255.
hillshade_cols = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']
for frame in (train_df, test_df):
    frame[hillshade_cols] = frame[hillshade_cols].clip(lower=0, upper=255)

In [None]:
# Shorter names for the four distance columns (applied to both frames).
new_names = dict(
    Horizontal_Distance_To_Hydrology="x_dist_hydrlgy",
    Vertical_Distance_To_Hydrology="y_dist_hydrlgy",
    Horizontal_Distance_To_Roadways="x_dist_rdwys",
    Horizontal_Distance_To_Fire_Points="x_dist_firepts",
)

train_df = train_df.rename(columns=new_names)
test_df = test_df.rename(columns=new_names)

In [None]:
# Manhattan (|x| + |y|) and Euclidean ((x^2 + y^2)^0.5) distance to hydrology,
# added to both frames in that column order.
for frame in (train_df, test_df):
    dx = frame["x_dist_hydrlgy"]
    dy = frame["y_dist_hydrlgy"]
    frame["mnhttn_dist_hydrlgy"] = np.abs(dx) + np.abs(dy)
    frame["ecldn_dist_hydrlgy"] = (dx**2 + dy**2)**0.5

#### function definition

In [None]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Column groups consumed by add_Statistic_by_section.
# They are built directly from the column names: selecting the columns from
# train_df only to read `.columns` back copies the data for no benefit.
# The unused `feature` frame (a full copy of train_df without the target) was
# dropped because nothing in the notebook reads it.
Hillshade = pd.Index(['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'])
Wilderness_Area = pd.Index(['Wilderness_Area{}'.format(i) for i in range(1, 5)])
Horizontal = pd.Index(['x_dist_firepts', 'x_dist_rdwys', 'x_dist_hydrlgy', 'y_dist_hydrlgy'])
# Soil_Type1..Soil_Type40 without Soil_Type7 and Soil_Type15 (those two columns
# are dropped from both frames further down in the notebook).
Soil_Type = pd.Index(['Soil_Type{}'.format(i) for i in range(1, 41) if i not in (7, 15)])
# Wilderness-area and soil-type column names as one flat list.
num_cols = list(Wilderness_Area) + list(Soil_Type)



def classifier_eval(y_valid , y_pred) :
  """Print the accuracy of ``y_pred`` against ``y_valid`` and return it.

  Parameters:
    y_valid: true labels, passed to ``accuracy_score`` as ``y_true``.
    y_pred: predicted labels, passed to ``accuracy_score`` as ``y_pred``.

  Returns:
    The value computed by ``accuracy_score``. Earlier versions returned
    ``None``, so a caller that printed the result showed "None".
  """
  accuracy = accuracy_score(y_valid, y_pred)
  print('정확도(accuracy_score) : ', accuracy)
  return accuracy

def add_statistics(df) :
  """Add row-wise 'mean', 'min' and 'max' columns to ``df`` in place.

  All three statistics are computed from the same snapshot of the frame,
  excluding any existing 'mean' / 'min' / 'max' columns. Previously each
  statistic was computed over a frame that already contained the statistic
  columns added before it, so calling the function a second time folded the
  old statistics into the new mean.

  Parameters:
    df: DataFrame whose columns are aggregated along axis=1.

  Returns:
    None; ``df`` is modified in place.
  """
  stat_cols = ['mean', 'min', 'max']
  # Snapshot of the input columns only, so the result does not depend on
  # earlier calls or on the order in which the statistics are added.
  base = df.drop(columns=stat_cols, errors='ignore')
  df['mean'] = base.mean(axis=1)
  df['min'] = base.min(axis=1)
  df['max'] = base.max(axis=1)

def add_Statistic_by_section(df) :
  """Add per-group row statistics to ``df`` in place.

  For each of the module-level column groups ``Hillshade``,
  ``Wilderness_Area``, ``Horizontal`` and ``Soil_Type`` this adds
  ``<prefix>_mean``, ``_min``, ``_max``, ``_var`` and ``_sum`` columns
  (prefixes: HillShade, Wilderness_Area, Horizontal, Soil_Type), followed by
  ``Wilderness_Area_kurt`` and ``Soil_Type_kurt``. Columns are created
  statistic by statistic, group by group.

  Parameters:
    df: DataFrame containing every column named in the four groups.

  Returns:
    None; ``df`` is modified in place.
  """
  sections = {
      'HillShade': Hillshade,
      'Wilderness_Area': Wilderness_Area,
      'Horizontal': Horizontal,
      'Soil_Type': Soil_Type,
  }

  # mean / min / max / var / sum for every group, in that order.
  for stat in ('mean', 'min', 'max', 'var', 'sum'):
    for prefix, cols in sections.items():
      df[prefix + '_' + stat] = getattr(df[cols], stat)(axis=1)

  # Kurtosis is only computed for the two indicator-column groups.
  for prefix in ('Wilderness_Area', 'Soil_Type'):
    df[prefix + '_kurt'] = df[sections[prefix]].kurt(axis=1)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected. Integer columns are cast to the first of int8, int16,
    int32, int64 whose limits strictly contain the column's min and max.
    Other listed columns are cast to float32 when min and max lie strictly
    inside the float32 limits, otherwise to float64.

    Parameters:
        df: DataFrame to shrink (modified in place).
        verbose: when true, print the final memory usage and the reduction.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    mem_before = df.memory_usage().sum() / 1024**2
    for name in df.columns:
        dtype = df[name].dtypes
        if dtype not in numerics:
            continue

        lowest = df[name].min()
        highest = df[name].max()
        if str(dtype).startswith('int'):
            # Smallest integer type whose open interval holds the column.
            target = next(
                (candidate for candidate in int_candidates
                 if lowest > np.iinfo(candidate).min and highest < np.iinfo(candidate).max),
                None,
            )
            if target is None:
                # No candidate fits: leave the column as it is.
                continue
        elif lowest > float32_limits.min and highest < float32_limits.max:
            target = np.float32
        else:
            target = np.float64
        df[name] = df[name].astype(target)

    mem_after = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(mem_after, 100 * (mem_before - mem_after) / mem_before))
    return df

# Downcast both frames (training first, then test).
train_df, test_df = (reduce_mem_usage(frame) for frame in (train_df, test_df))


In [None]:
# Both frames were already downcast by the reduce_mem_usage calls at the end of
# the previous cell. Calling it again only rescans every column and reports a
# 0% reduction, so summarise the resulting dtypes instead.
train_df.dtypes.value_counts()

In [None]:
# Drop the two soil-type columns flagged as containing only zeros.
all_zero_soil_cols = ['Soil_Type15', 'Soil_Type7']

train_df = train_df.drop(columns=all_zero_soil_cols)
test_df = test_df.drop(columns=all_zero_soil_cols)

In [None]:
# Number of training rows per Cover_Type class.
sns.countplot(data=train_df, x=train_df['Cover_Type'])

In [None]:
# Box plots of the continuous training features before outlier clipping,
# laid out on a 2 x 5 grid in row-major order.
boxplot_columns = [
    'Elevation', 'Slope', 'x_dist_hydrlgy', 'y_dist_hydrlgy', 'x_dist_rdwys',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'x_dist_firepts', 'Aspect',
]
green_diamond = dict(markerfacecolor='g', marker='D')  # style of the outlier markers

fig, axes = plt.subplots(2, 5, figsize=(20, 10))
for ax, column in zip(axes.flat, boxplot_columns):
    ax.boxplot(train_df[column], flierprops=green_diamond)
    ax.set_title(column)

fig.suptitle('before outier remove boxplot', fontsize=20)

plt.show()


### Outlier columns: 'Elevation', 'Slope', 'x_dist_hydrlgy', 'y_dist_hydrlgy', 'x_dist_rdwys', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'x_dist_firepts'

# Clipping

#### 1st Elimination Outliers

In [None]:
# Columns whose extreme values are clipped to the 1% / 99% quantiles below.
Outier_col = [
    'Elevation',
    'Slope',
    'x_dist_hydrlgy',
    'y_dist_hydrlgy',
    'x_dist_rdwys',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'x_dist_firepts',
]

In [None]:
# 1% and 99% quantiles of the training data, per column; these are the
# clipping bounds used in the next cell.
p01 = train_df[Outier_col].quantile(0.01)
p99 = train_df[Outier_col].quantile(0.99)

# Show both bounds side by side. A bare `p01` that is not the last expression
# of the cell is never displayed, so previously only p99 was visible.
pd.DataFrame({'p01': p01, 'p99': p99})

In [None]:
# Clip every listed column to [p01, p99]: values below the 1% point are raised
# to it and values above the 99% point are lowered to it. The bounds computed
# from the training data are applied to the test frame as well.
for frame in (train_df, test_df):
    frame[Outier_col] = frame[Outier_col].clip(lower=p01, upper=p99, axis=1)

del p01, p99

In [None]:
# Box plots of the same training features after outlier clipping,
# laid out on a 2 x 5 grid in row-major order.
boxplot_columns = [
    'Elevation', 'Slope', 'x_dist_hydrlgy', 'y_dist_hydrlgy', 'x_dist_rdwys',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'x_dist_firepts', 'Aspect',
]
green_diamond = dict(markerfacecolor='g', marker='D')  # style of the outlier markers

fig, axes = plt.subplots(2, 5, figsize=(20, 10))
for ax, column in zip(axes.flat, boxplot_columns):
    ax.boxplot(train_df[column], flierprops=green_diamond)
    ax.set_title(column)

fig.suptitle('after outier remove boxplot', fontsize=20)

plt.show()

**Visualize data distribution**

Normalization was not applied because the model performed better on the unnormalized data than on the normalized data.

In [None]:
TARGET = 'Cover_Type'

# Two-column table: target class and its row count in the training data.
target_df = (
    train_df[TARGET]
    .value_counts()
    .rename_axis(TARGET)
    .reset_index(name='count')
)

# Interactive bar chart of the class counts, coloured by count.
fig = px.bar(
    data_frame=target_df,
    x='Cover_Type',
    y='count',
    color="count",
    color_continuous_scale="Emrld",
)
fig.show()

# Same table ordered by class label.
target_df.sort_values(by=TARGET, ignore_index=True)

<div style="background-color:#F261AA;border-radius:5px;display:fill;">
    <h1><center style ="margin-left : 20px;">Data Segmentation</center></h1>
</div>

In [None]:
# Separate the target from the features.
y = train_df['Cover_Type']
X = train_df.drop(columns='Cover_Type')

# Hold out 15% of the rows for validation (85:15 train/valid split).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

del train_df

**function application**

In [None]:
# Add the per-group statistic columns to the train, validation and test features.
for frame in (X_train, X_valid, test_df):
    add_Statistic_by_section(frame)

<div style="background-color:#47C832;border-radius:5px;display:fill;">
    <h1><center style ="margin-left : 20px;">Modeling</center></h1>
</div>

In [None]:
!pip install catboost
from catboost import CatBoostClassifier

In [None]:
# Keyword arguments forwarded to CatBoostClassifier (GPU training on device 0).
cat_params = dict(
    iterations=15000,
    learning_rate=0.03,
    od_wait=1000,
    depth=7,
    task_type='GPU',
    devices='0',
    verbose=1000,
)

cat = CatBoostClassifier(**cat_params)
# Fit on the training split, using the validation split as the eval set.
cat.fit(X_train, y_train, eval_set=(X_valid, y_valid))


In [None]:
from sklearn.metrics import classification_report,confusion_matrix

# Score the fitted model on the validation split.
y_pred = cat.predict(X_valid)
print(classification_report(y_valid, y_pred))
# classifier_eval prints the accuracy itself; wrapping the call in print()
# also echoed its return value (a stray "None"). The trailing semicolon keeps
# the notebook from displaying the return value.
classifier_eval(y_valid, y_pred);

#### Submission

In [None]:
# Build the submission frame from the test-set predictions.
sample_submission = pd.read_csv('../input/tabular-playground-series-dec-2021/sample_submission.csv')
# Only the Id column is needed here, so avoid parsing the whole test file again.
test = pd.read_csv('../input/tabular-playground-series-dec-2021/test.csv', usecols=['Id'])
CatBoost_prediction = cat.predict(test_df)
sample_submission['Id'] = test['Id']
# np.ravel leaves 1-D output unchanged and flattens an (n, 1) column vector,
# so the column assignment works for either prediction shape.
sample_submission['Cover_Type'] = np.ravel(CatBoost_prediction)

In [None]:
# Write the submission file without the DataFrame index.
sample_submission.to_csv(path_or_buf='./Cat_Boost.csv', index=False)