In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set; the 'Id' column becomes the DataFrame index.
train=pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/train.csv',index_col='Id')

In [None]:
import warnings

# The warning class has moved between pandas releases: it is importable from
# pandas.errors on pandas >= 1.5 and from pandas.core.common on older versions.
# Try both locations so this cell does not fail with ImportError, and fall
# back to "nothing to silence" if neither location provides it.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Silence pandas' chained-assignment warning for the rest of the session.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

FEATURE ENGINEERING

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their values.

    Integer columns (int16/int32/int64) are converted to the narrowest of
    int8/int16/int32 whose range contains the column's minimum and maximum,
    bounds included. float64 columns are converted to float32 when their
    values lie inside the float32 range (this keeps the range but reduces
    precision). Columns that are already float16/float32, and all
    non-numeric columns, are left untouched so the function never increases
    memory usage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = str(df[col].dtype)

        if col_type in ('int16', 'int32', 'int64'):
            c_min = df[col].min()
            c_max = df[col].max()
            # Candidates are ordered narrowest first; stop at the first that fits.
            # An empty column yields NaN bounds, every comparison is False and
            # the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # All-NaN or infinite columns fail this test and stay float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df



In [None]:
# Downcast the numeric columns of the training frame; the helper also prints the resulting size.
train=reduce_mem_usage(train)

In [None]:
def _widen_for_arithmetic(series):
    """Return ``series`` as int64 (NumPy integer input) or float64 (anything else)."""
    if isinstance(series.dtype, np.dtype) and series.dtype.kind in 'iu':
        return series.astype('int64')
    return series.astype('float64')


def add_features(df):
    """Return a copy of ``df`` extended with engineered terrain features.

    Added columns, in order: ``distance_to_idrology`` (Euclidean distance to
    hydrology), ``sum_terrains`` (row sum of the ``Soil_Type*`` columns),
    ``min_hillshade`` / ``max_hillshade`` / ``mean_hillshade`` (row statistics
    of the ``Hillshade*`` columns), ``wilderness_sum`` (row sum of the
    ``Wilderness*`` columns), ``above_water`` (vertical distance to hydrology
    is negative), ``idrology_height`` (elevation minus vertical distance to
    hydrology), and the sum / absolute difference of each pair of horizontal
    distances (``hidro_fire_*``, ``hydro_road_*``, ``Fire_Road_*``).

    The input frame is not modified. The result is passed through
    ``reduce_mem_usage`` before being returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns referenced above.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the engineered ones.
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger one)
    # is never written to.
    df = df.copy()

    # The source columns may have been downcast to narrow integer dtypes.
    # Squaring or adding them in that dtype wraps around silently (for example
    # 300**2 does not fit in int16), so do the arithmetic on widened copies.
    hydro_h = _widen_for_arithmetic(df['Horizontal_Distance_To_Hydrology'])
    hydro_v = _widen_for_arithmetic(df['Vertical_Distance_To_Hydrology'])
    fire_h = _widen_for_arithmetic(df['Horizontal_Distance_To_Fire_Points'])
    road_h = _widen_for_arithmetic(df['Horizontal_Distance_To_Roadways'])
    elevation = _widen_for_arithmetic(df['Elevation'])

    # Resolve the column groups once, before any engineered column is added.
    soil_cols = [c for c in df.columns if c.startswith('Soil_Type')]
    hillshade_cols = [c for c in df.columns if c.startswith('Hillshade')]
    wilderness_cols = [c for c in df.columns if c.startswith('Wilderness')]

    df['distance_to_idrology'] = (hydro_h**2 + hydro_v**2) ** 0.5
    df['sum_terrains'] = df[soil_cols].sum(axis=1)
    df['min_hillshade'] = df[hillshade_cols].min(axis=1)
    df['max_hillshade'] = df[hillshade_cols].max(axis=1)
    df['mean_hillshade'] = df[hillshade_cols].mean(axis=1)
    df['wilderness_sum'] = df[wilderness_cols].sum(axis=1)
    df['above_water'] = df['Vertical_Distance_To_Hydrology'] < 0
    df['idrology_height'] = elevation - hydro_v
    df['hidro_fire_1'] = hydro_h + fire_h
    df['hidro_fire_2'] = abs(hydro_h - fire_h)
    df['hydro_road_1'] = hydro_h + road_h
    df['hydro_road_2'] = abs(hydro_h - road_h)
    df['Fire_Road_1'] = abs(fire_h + road_h)
    # The original assigned Fire_Road_1 twice; following the *_1 / *_2 pattern
    # of the other pairs, the second feature is the absolute difference.
    df['Fire_Road_2'] = abs(fire_h - road_h)

    return reduce_mem_usage(df)

In [None]:
from sklearn.base import BaseEstimator,TransformerMixin
class DegreesValuesFixer(BaseEstimator,TransformerMixin):
    """Stateless transformer that replaces angles given in degrees by their sine.

    Sine is periodic, so angles that are a full turn apart (0 and 360, for
    instance) map to the same value; no explicit wrapping of the input into
    [0, 360) is needed beforehand.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return the element-wise sine of ``X``, reading ``X`` as degrees."""
        angles_in_radians = np.radians(X)
        return np.sin(angles_in_radians)

class SubValueTrimmer(BaseEstimator,TransformerMixin):
    """Stateless transformer that clamps every value into ``[min_val, max_val]``.

    Parameters
    ----------
    min_val : scalar
        Values below this bound are replaced by it.
    max_val : scalar
        Values above this bound are replaced by it.
    """

    def __init__(self,min_val,max_val):
        self.min_val=min_val
        self.max_val=max_val

    def fit(self,X,y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self,X):
        """Return a clamped copy of ``X``; the input itself is left untouched.

        Accepts a DataFrame/Series or a NumPy array (anything else is first
        converted with ``np.asarray``). Missing values are preserved.
        """
        # Boolean-mask assignment would overwrite the caller's data (and makes
        # pandas emit SettingWithCopyWarning on sliced frames); clip() returns
        # a new object instead.
        if not hasattr(X, 'clip'):
            X = np.asarray(X)
        return X.clip(self.min_val, self.max_val)


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler,FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,RandomizedSearchCV


In [None]:
# Class balance of the target: number of training rows per Cover_Type value.
train.Cover_Type.value_counts()

In [None]:
# Drop every row labelled Cover_Type 5 before modelling.
# NOTE(review): presumably this class is too rare to be learnable — confirm against the class counts.
# This cell rebinds `train`, so the unfiltered frame is only recoverable by reloading the CSV.
train=train[train['Cover_Type']!=5]
#train=train[train['Cover_Type']!=4]

In [None]:
# Separate the predictors (every column except the target) from the target column.
X_train=train.drop('Cover_Type',axis=1)
y_train=train['Cover_Type']

In [None]:
# Wrap add_features in a transformer so it can be used as a pipeline step.
feat_engineering=FunctionTransformer(func=add_features)


# Column-wise preprocessing, meant to run on the output of add_features:
#   - 'degrees': replaces Aspect and Slope by the sine of their value in degrees;
#   - 'trimmer': clamps the raw and derived hillshade columns to [0, 255];
#   - every other column is passed through unchanged.
# min_hillshade / max_hillshade / mean_hillshade are created by add_features,
# so this transformer cannot be applied to the raw frame directly.
preprocessing=ColumnTransformer([
    ('degrees',DegreesValuesFixer(),['Aspect','Slope']),
    ('trimmer',SubValueTrimmer(0,255),['Hillshade_Noon','Hillshade_3pm','Hillshade_9am','min_hillshade','max_hillshade','mean_hillshade'])]
    ,remainder='passthrough')

In [None]:
# Baseline XGBoost configuration (GPU histogram tree builder, leaf-wise growth).
params = {
    'num_class': len(np.unique(y_train)),
    'objective': 'multi:softprob',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
    'eval_metric': ['merror', 'mlogloss'],
    'learning_rate': .1,
    'max_depth': 150,
    'subsample': .2,
    'sampling_method': 'gradient_based',
    'seed': 64,
    'grow_policy': 'lossguide',
    'max_leaves': 255,
    # Use the estimator's own `reg_lambda` parameter rather than the raw
    # booster alias 'lambda'. The hyper-parameter search tunes
    # `xgbclassifier__reg_lambda`; with 'lambda' supplied as an extra keyword
    # both aliases would be set at the same time with different values.
    'reg_lambda': 40,
}

xgb=XGBClassifier(**params)

# 'xgbclassifier__sampling_method': 'gradient_based',
#  'xgbclassifier__reg_lambda': 40,
#  'xgbclassifier__max_depth': 150,
#  'xgbclassifier__learning_rate': 0.1}
#XGBClassifier(predictor='gpu_predictor',tree_method='gpu_hist',subsample=0.10,grow_policy='lossguide',sampling_method='gradient_based',max_leaves=100)

In [None]:
# Full model: feature engineering -> column preprocessing -> standardisation -> XGBoost.
# make_pipeline names each step after its lowercased class name, which is where
# the 'xgbclassifier__' prefix of the search-grid keys comes from.
pipeline_XGB=make_pipeline(feat_engineering,preprocessing,StandardScaler(),xgb)

In [None]:
#pipeline_XGB.fit(X_train,y_train)

In [None]:
#accuracy_score(y_train,pipeline_XGB.predict(X_train))#0.9630679907669977

In [None]:
# Candidate hyper-parameter values for the search. Keys use the
# '<step name>__<parameter>' form so each value is routed to the XGBoost step
# of the pipeline.
param_grid_ensemble = {
     "xgbclassifier__learning_rate":[0.05,0.08,0.1,0.12,0.15],
     "xgbclassifier__max_depth":[100,150,200],
     "xgbclassifier__sampling_method":['gradient_based'],
     "xgbclassifier__reg_lambda":[0.5,1,2,5,10,20,40,80],
}

## {'xgbclassifier__sampling_method': 'gradient_based',
#  'xgbclassifier__reg_lambda': 2,
#  'xgbclassifier__max_depth': 100,
#  'xgbclassifier__learning_rate': 0.1}
# {'xgbclassifier__sampling_method': 'gradient_based',
#  'xgbclassifier__reg_lambda': 40,
#  'xgbclassifier__max_depth': 150,
#  'xgbclassifier__learning_rate': 0.1}

In [None]:
# Randomly sample 20 parameter combinations from the grid and cross-validate each.
# random_state pins which combinations are drawn, so a fresh "Run All"
# evaluates the same candidates (64 matches the seed given to XGBoost).
grid_search_ensemble = RandomizedSearchCV(pipeline_XGB, param_grid_ensemble,n_iter=20,random_state=64)
grid_search_ensemble.fit(X_train,y_train)

In [None]:
# Parameter combination that obtained the best cross-validated score in the search.
grid_search_ensemble.best_params_

In [None]:
# Accuracy of the best pipeline on the same rows it was fitted on: an in-sample
# figure, not a held-out estimate. The trailing number appears to be the value
# recorded from an earlier run.
accuracy_score(y_train,grid_search_ensemble.best_estimator_.predict(X_train))#0.9651914912978729

In [None]:
# Score the competition test set with the best pipeline and write the submission file.
test=pd.read_csv('/kaggle/input/tabular-playground-series-dec-2021/test.csv',index_col='Id')
predictions_test=grid_search_ensemble.best_estimator_.predict(test)
# Attach the predictions as a column so they are written out together with the Id index.
test['Cover_Type']=predictions_test
test['Cover_Type'].to_csv('submission.csv')
# Drop the reference to the test frame so its memory can be reclaimed.
del test