In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# StratifiedKFold cross-validation keeps the class proportions of the target the same in every fold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.feature_selection import SelectKBest
from yellowbrick.target import FeatureCorrelation
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
plt.figure(figsize = (20, 18))
import xgboost as xgb
from xgboost import XGBClassifier
from bayes_opt import BayesianOptimization
# hyperopt is hyperparameter optimization by defining an objective function and declaring a search space
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

In [None]:
# Load train.csv from the Tabular Playground Series (Dec 2021) input folder
# and show its (rows, columns) shape.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv")
train_df.shape

In [None]:
# Preview the first rows.
train_df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint.
train_df.info()

In [None]:
# Full list of column names.
train_df.columns

In [None]:
# Drops ID column as it is not required.
# NOTE: the drop is in place, so re-running this cell on the same frame raises
# KeyError because "Id" is already gone.
train_df.drop(["Id"], axis=1, inplace=True)

In [None]:
# Check for missing values: per-column NaN counts summed into a single total.
sum(train_df.isna().sum())

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Rules applied per column:

    * ``object`` columns are converted to ``category``.
    * Signed integer columns are cast to the narrowest of int8/16/32/64 whose
      range contains the column's min and max (bounds are inclusive).
    * Unsigned integer columns get the same treatment with uint8/16/32/64,
      so they are never routed through a lossy float conversion.
    * Float columns wider than 32 bits are cast to float32 when their values
      lie strictly inside the float32 range; note this can lose precision.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left unchanged.

    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ladder = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ladder = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; min()/max() are
        # not computed for anything else (they can fail or be meaningless).
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            f32 = np.finfo(np.float32)
            # itemsize guard: never widen a float16/float32 column.
            if col_type.itemsize > 4 and c_min > f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            continue

        ladder = signed_ladder if col_type.kind == 'i' else unsigned_ladder
        for candidate in ladder:
            bounds = np.iinfo(candidate)
            # An empty column yields NaN here, every comparison is False and
            # the column keeps its dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# Compress the data: downcast column dtypes with reduce_mem_usage (defined above).
train_df = reduce_mem_usage(train_df)

In [None]:
# Checks distribution of categorical target variable (row count per Cover_Type).
train_df.groupby(['Cover_Type']).size()

In [None]:
# Remove, in place, every row whose Cover_Type is 5.
# NOTE(review): presumably this leaves a gap in the label values; confirm the
# installed XGBoost version accepts non-contiguous class labels.
train_df.drop(train_df[train_df['Cover_Type'] == 5].index, inplace = True) # this has one observation

In [None]:
# Checks distribution of categorical target variable again, after the removal.
train_df.groupby(['Cover_Type']).size()

# Feature Selection

In [None]:
# Feature group 1: five terrain/hydrology columns plus the target.
df = train_df[['Elevation', 'Aspect', 'Slope','Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Univariate selection: keep the 2 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=2).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Bare expression in the middle of a cell: it is not the last expression, so
# nothing is rendered for it.
features_mask

selected_columns = features.columns[features_mask]

# Same as above: not rendered because it is not the cell's last expression.
selected_columns

selected_features = features[selected_columns]

# Only this final expression is displayed by the cell.
selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# New 20x18 inch figure at 80 dpi for the visualizer to draw on.
figure(figsize=(20,18), dpi=80)
# Yellowbrick visualizer configured for Pearson correlation, fitted on
# (features, target).
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')

visualizer.fit(features, target)

# NOTE(review): `poof()` is the older Yellowbrick name for `show()`; confirm it
# is still available in the installed version.
visualizer.poof()

In [None]:
# Summary statistics, one row per column, rounded to 2 decimals.
df.describe().transpose().round(2)


In [None]:
# Pairwise Pearson correlation matrix of the group (target included).
pearson_corr = df.corr(method='pearson')

pearson_corr

#### Elevation and Cover_Type have a Pearson correlation coefficient of about -0.4

In [None]:
sns.histplot(data=df, x="Elevation", bins=10, kde=True)

In [None]:
sns.kdeplot(df['Elevation'],shade=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Elevation"],data=df)

In [None]:
sns.kdeplot(df['Aspect'],shade=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Aspect"],data=df)

In [None]:
sns.kdeplot(df['Slope'],shade=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Slope"],data=df)

In [None]:
sns.kdeplot(df['Horizontal_Distance_To_Hydrology'],shade=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Horizontal_Distance_To_Hydrology"],data=df)

In [None]:
sns.kdeplot(df['Vertical_Distance_To_Hydrology'],shade=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Vertical_Distance_To_Hydrology"],data=df)

In [None]:
# Feature group 2: roadway/fire-point distances and the three hillshade
# columns, plus the target.
df = train_df[[
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 2 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=2).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# Summary statistics, one row per column, rounded to 2 decimals.
df.describe().transpose().round(2)

In [None]:
# Pairwise Pearson correlation matrix of the group (target included).
pearson_corr = df.corr(method='pearson')

pearson_corr

In [None]:
# For each feature below: a 10-bin histogram with KDE overlay, followed by a
# violin plot of the feature split by Cover_Type.
sns.histplot(data=df, x="Horizontal_Distance_To_Roadways", bins=10, kde=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Horizontal_Distance_To_Roadways"],data=df)

In [None]:
sns.histplot(data=df, x="Hillshade_9am", bins=10, kde=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Hillshade_9am"],data=df)

In [None]:
sns.histplot(data=df, x="Hillshade_Noon", bins=10, kde=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Hillshade_Noon"],data=df)

In [None]:
sns.histplot(data=df, x="Hillshade_3pm", bins=10, kde=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Hillshade_3pm"],data=df)

In [None]:
sns.histplot(data=df, x="Horizontal_Distance_To_Fire_Points", bins=10, kde=True)

In [None]:
sns.violinplot(x=df["Cover_Type"],y=df["Horizontal_Distance_To_Fire_Points"],data=df)

In [None]:
# Feature group 3: the four Wilderness_Area columns plus the target.
df = train_df[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 2 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=2).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# Summary statistics, one row per column, rounded to 2 decimals.
df.describe().transpose().round(2)

In [None]:
# Pairwise Pearson correlation matrix of the group (target included).
pearson_corr = df.corr(method='pearson')

pearson_corr

In [None]:
df['Wilderness_Area1'].value_counts()

In [None]:
sns.countplot(df['Wilderness_Area1'])

In [None]:
df['Wilderness_Area2'].value_counts()

In [None]:
sns.countplot(df['Wilderness_Area2'])

In [None]:
df['Wilderness_Area3'].value_counts()

In [None]:
sns.countplot(df['Wilderness_Area3'])

In [None]:
df['Wilderness_Area4'].value_counts()

In [None]:
sns.countplot(df['Wilderness_Area4'])

In [None]:
sns.boxplot(x ='Cover_Type', y ='Horizontal_Distance_To_Fire_Points', data = train_df, hue ='Wilderness_Area1')

In [None]:
sns.boxplot(x ='Cover_Type', y ='Horizontal_Distance_To_Fire_Points', data = train_df, hue ='Wilderness_Area4')

In [None]:
# Feature group 4: Soil_Type1..Soil_Type10 plus the target.
df = train_df[['Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 4 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# Summary statistics, one row per column, rounded to 2 decimals.
df.describe().transpose().round(2)

In [None]:
# Pairwise Pearson correlation matrix of the group (target included).
pearson_corr = df.corr(method='pearson')

pearson_corr

In [None]:
# One cell per column from here on: frequency of each distinct value.
df['Soil_Type1'].value_counts()

In [None]:
df['Soil_Type2'].value_counts()

In [None]:
df['Soil_Type3'].value_counts()

In [None]:
df['Soil_Type4'].value_counts()

In [None]:
df['Soil_Type5'].value_counts()

In [None]:
df['Soil_Type6'].value_counts()

In [None]:
df['Soil_Type7'].value_counts() # only one constant value

In [None]:
df['Soil_Type8'].value_counts()

In [None]:
df['Soil_Type9'].value_counts()

In [None]:
df['Soil_Type10'].value_counts()

In [None]:
# Feature group 5: Soil_Type11..Soil_Type20 plus the target.
df = train_df[['Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 4 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# One cell per column from here on: frequency of each distinct value.
df['Soil_Type11'].value_counts()

In [None]:
df['Soil_Type12'].value_counts()

In [None]:
df['Soil_Type13'].value_counts()

In [None]:
df['Soil_Type14'].value_counts()

In [None]:
df['Soil_Type15'].value_counts() # only one value 0

In [None]:
df['Soil_Type16'].value_counts()

In [None]:
df['Soil_Type17'].value_counts()

In [None]:
df['Soil_Type18'].value_counts()

In [None]:
df['Soil_Type19'].value_counts()

In [None]:
df['Soil_Type20'].value_counts()

In [None]:
# Feature group 6: Soil_Type21..Soil_Type30 plus the target.
df = train_df[['Soil_Type21', 'Soil_Type22',
       'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26',
       'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 4 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# One cell per column from here on: frequency of each distinct value.
df['Soil_Type21'].value_counts()

In [None]:

df['Soil_Type22'].value_counts()

In [None]:
df['Soil_Type23'].value_counts()

In [None]:
df['Soil_Type24'].value_counts()

In [None]:
df['Soil_Type25'].value_counts()

In [None]:




df['Soil_Type26'].value_counts()

In [None]:
df['Soil_Type27'].value_counts()

In [None]:
df['Soil_Type28'].value_counts()

In [None]:
df['Soil_Type29'].value_counts()

In [None]:
df['Soil_Type30'].value_counts()

In [None]:
# Feature group 7: Soil_Type31..Soil_Type40 plus the target.
df = train_df[['Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40','Cover_Type']]

target = df['Cover_Type']
features = df.drop('Cover_Type', axis=1)

# Keep the 4 features with the highest f_classif score.
select_univariate = SelectKBest(f_classif, k=4).fit(features, target)

# Boolean mask over `features` columns; True marks a selected feature.
features_mask = select_univariate.get_support()

# Not rendered: only the last expression of a cell is displayed.
features_mask

selected_columns = features.columns[features_mask]

# Not rendered either.
selected_columns

selected_features = features[selected_columns]

selected_features.head()

In [None]:
# Column names used as bar labels by the correlation visualizer below.
feature_names = list(features.columns)

In [None]:
# Pearson correlation of each feature with the target, drawn on a new
# 20x18 inch, 80 dpi figure.
figure(figsize=(20,18), dpi=80)
visualizer = FeatureCorrelation(labels = feature_names, method='pearson')
visualizer.fit(features, target)
visualizer.poof()

In [None]:
# One cell per column from here on: frequency of each distinct value.
df['Soil_Type31'].value_counts()

In [None]:
df['Soil_Type32'].value_counts()

In [None]:
df['Soil_Type33'].value_counts()

In [None]:
df['Soil_Type34'].value_counts()

In [None]:
df['Soil_Type35'].value_counts()

In [None]:
df['Soil_Type36'].value_counts()

In [None]:
df['Soil_Type37'].value_counts()

In [None]:
df['Soil_Type38'].value_counts()

In [None]:
df['Soil_Type39'].value_counts()

In [None]:
df['Soil_Type40'].value_counts()

In [None]:
# Scatter of Elevation against fire-point distance over the full training
# frame; colour encodes Cover_Type and marker style encodes Soil_Type40.
# The trailing ';' suppresses the returned object's repr.
sns.relplot(x="Elevation", y="Horizontal_Distance_To_Fire_Points", hue="Cover_Type",style="Soil_Type40",data=train_df);

# StratifiedKFold Cross Validation

In [None]:
# Drop the last EDA subset; it is not used past this point.
del df

In [None]:
# Target vector and the 21-column feature matrix used for modelling.
y = train_df.Cover_Type
X = train_df[['Elevation',"Vertical_Distance_To_Hydrology","Horizontal_Distance_To_Roadways","Horizontal_Distance_To_Fire_Points","Wilderness_Area1",
              "Wilderness_Area4","Soil_Type2","Soil_Type3","Soil_Type6","Soil_Type10","Soil_Type11","Soil_Type12","Soil_Type13","Soil_Type17",
             "Soil_Type22","Soil_Type23","Soil_Type29","Soil_Type30","Soil_Type38","Soil_Type39","Soil_Type40"]]

# NOTE: after this, every earlier cell that reads `train_df` fails until the
# data is reloaded.
del train_df

In [None]:
# 5-fold stratified splitter; shuffling with a fixed seed makes the folds
# repeatable across calls.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state= 40)

In [None]:
# Performs cross validation on XGB Classifier

model = XGBClassifier(tree_method='gpu_hist')
model_score = cross_val_score(model, X, y, scoring='accuracy', cv=skf.split(X, y), n_jobs=-1, verbose=10)

In [None]:
print(model_score.mean())

In [None]:
del model_score, model

In [None]:
# Search bounds handed to BayesianOptimization: (lower, upper) per hyperparameter.
parameter_space = {
    'learning_rate': (0.01, 1.0),
    'n_estimators': (100, 1000),
    'max_depth': (2,10),
    'subsample': (0.4, 1.0),
    'colsample_bytree' :(0.4, 1.0),
    'gamma': (0, 5)}

def xgboost_hyper_param(learning_rate,
                        n_estimators,
                        max_depth,
                        subsample,
                        colsample_bytree,
                        gamma):
    """Objective for the Bayesian search: mean 5-fold CV accuracy on (X, y).

    Every argument is a float sampled from ``parameter_space``. ``max_depth``
    and ``n_estimators`` are truncated with ``int()`` before use because the
    optimiser proposes continuous values.

    Reads the notebook-level ``X`` and ``y``.

    Returns
    -------
    float
        Mean accuracy across the 5 cross-validation folds.
    """
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    clf = XGBClassifier(
        tree_method='gpu_hist',
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        # These two were sampled by the optimiser but never reached the model,
        # so the search explored two dimensions that had no effect.
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma)
    return np.mean(cross_val_score(clf, X, y, cv=5, scoring='accuracy'))

# Optimiser that maximises the objective inside the bounds above; the fixed
# random_state seeds its random sampling.
optimizer = BayesianOptimization(
    f=xgboost_hyper_param,
    pbounds=parameter_space,
    random_state=100,
)

In [None]:
# Run the search: 2 initial points followed by 5 optimisation iterations.
# NOTE(review): `acq` and `xi` are passed to maximize(); presumably newer
# bayes_opt releases expect the acquisition function to be configured
# elsewhere. Confirm against the installed version.
optimizer.maximize(init_points=2, n_iter=5, acq='ei', xi=0.0)

In [None]:
# Every evaluated point with its target value.
optimizer.res

In [None]:
# Best parameter set found by the search, copied so the optimiser's own record
# is not modified.
params_gbm = dict(optimizer.max['params'])
# Truncate with int(), exactly as the objective does, so the integers reported
# here are the ones that were actually evaluated (round() could report a
# max_depth or n_estimators one higher than the value that produced the score).
params_gbm['max_depth'] = int(params_gbm['max_depth'])
params_gbm['n_estimators'] = int(params_gbm['n_estimators'])
params_gbm

In [None]:
# 80/20 hold-out split with a fixed seed. `stratify=y` keeps the class
# proportions equal in both parts, matching the stratified folds used for
# cross-validation elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123, stratify=y)

In [None]:
# Hard-coded XGBoost settings: six tuned hyperparameters followed by the fixed
# training options (multi-class softmax objective, 6 classes, GPU histogram
# tree method, multi-class log-loss as the evaluation metric).
params = dict(
    colsample_bytree=0.8547061059383534,
    gamma=4.353355108658212,
    learning_rate=0.2502986173540949,
    max_depth=8,
    n_estimators=940,
    subsample=0.740057344466777,
    objective='multi:softmax',
    num_class=6,
    tree_method="gpu_hist",
    eval_metric='mlogloss',
)

In [None]:
# Train on the hold-out split with the hard-coded `params`.
# NOTE: this rebinds `xgb`, which the import cell bound to the xgboost module;
# from here on `xgb` is the fitted classifier.
# NOTE(review): (X_test, y_test) drives early stopping here and is also the
# set scored in the cells below, so those scores are presumably optimistic.
xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(X_test,y_test)],
          verbose=True)

In [None]:
# Class predictions for the hold-out rows.
y_pred=xgb.predict(X_test)

In [None]:
# Fraction of hold-out rows predicted correctly.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1.
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix for the hold-out predictions.
print(confusion_matrix(y_test,y_pred))

In [None]:
# Loads test data set.
# NOTE: this path is relative ("../input/..."), while the training file was
# read from the absolute "/kaggle/input/..." path.
test = pd.read_csv("../input/tabular-playground-series-dec-2021/test.csv")

# Removes ID column as it is not required for prediction.
# The drop is in place, so re-running this cell without reloading raises KeyError.
test.drop(["Id"], axis=1, inplace=True)

In [None]:
# Loads submission data set that acts just as a template for submission
submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")

In [None]:
# Keep exactly the columns the models were trained on, in the same order.
# The list is taken from the training matrix `X` rather than typed out a
# second time, so the two can never drift apart.
test = test[list(X.columns)]

In [None]:
# Class predictions for the test rows from the classifier bound to `xgb`.
predictions = xgb.predict(test)

In [None]:
# Write the predictions into the submission template's target column.
submission["Cover_Type"] = predictions

In [None]:
# Checks the submission file before saving
submission

In [None]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False) #0.92102 

In [None]:
# Walk the stratified folds and report each fold number.
for fold_no, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print('Fold = ',fold_no)

# Only the split of the final fold is kept for the tuning cells that follow.
# Slicing once, after the loop, gives the same result as slicing on every
# iteration but avoids four throw-away copies of the data.
X_train = X.iloc[train_index]
y_train = y.iloc[train_index]
X_test = X.iloc[test_index]
y_test = y.iloc[test_index]
# Alias kept because later cells refer to the validation labels as `y_val`.
y_val = y_test

In [None]:
# hyperopt search space. `hp.uniform` samples a float from [low, high];
# `hp.quniform(..., q=1)` samples on a grid of step 1 but still yields floats,
# which is why `max_depth` is cast to int inside the objective.
hyperparameter_space = { 
                        'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
                        'max_depth': hp.quniform("max_depth", 2, 6, 1),
                        'min_child_weight' : hp.quniform('min_child_weight', 1, 8, 1),
                        'reg_alpha' : hp.uniform('reg_alpha', 1e-8, 100),
                        'reg_lambda' : hp.uniform('reg_lambda', 1e-8, 100),
                        'gamma': hp.uniform ('gamma', 0.0, 1.0),
                        'subsample': hp.uniform("subsample", 0.1, 1.0),
                        'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1.0)
                       }

In [None]:
def optimize_hyppara(hyperparameter_space):
    """hyperopt objective: fit one XGBoost model and return its negated accuracy.

    Parameters
    ----------
    hyperparameter_space : dict
        One sampled point of the search space (parameter name -> value).
        It is copied, not modified.

    Returns
    -------
    dict
        ``{"loss": -accuracy, "status": STATUS_OK}``. hyperopt minimises the
        loss, so accuracy is negated.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The same (X_test, y_test) pair is used for early stopping and
    for the accuracy that is reported.
    """
    # Work on a copy so the dict handed in by hyperopt is left untouched.
    model_params = dict(hyperparameter_space)
    # Converts parameter value to int as required by XGBoost
    model_params["max_depth"] = int(model_params["max_depth"])
    model_params["objective"] = "multi:softmax"
    model_params["eval_metric"] = "mlogloss"
    model_params["tree_method"] = "gpu_hist"
    model_params['num_class'] = 6

    clf = XGBClassifier(**model_params)
    clf.fit(X_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(X_test, y_test)],
          verbose=False)

    predictions = clf.predict(X_test)

    # Score against the labels that belong to X_test, so predictions and
    # labels always come from the same split.
    acc = accuracy_score(y_test, predictions)

    return {"loss": -acc, "status": STATUS_OK}

In [None]:
# Starts hyperparameter tuning: 50 evaluations of `optimize_hyppara` using the
# TPE suggestion algorithm, with every trial recorded in `trials`.
# NOTE(review): no random state is passed to fmin, so repeated runs can
# presumably pick different parameters; confirm and seed it if reproducibility
# matters.
trials = Trials()
best_model_params = fmin(fn=optimize_hyppara,space=hyperparameter_space, max_evals=50,algo=tpe.suggest,trials=trials)

In [None]:
# Best sampled value for each hyperparameter.
best_model_params

In [None]:
# Hard-coded hyperparameters; `max_depth` is stored as a float in the literal
# and converted to int here.
params = {
    'colsample_bytree': 0.7485494093640639,
    'gamma': 0.5491765861222405,
    'learning_rate': 0.27567612516134643,
    'max_depth': int(6.0),
    'min_child_weight': 7.0,
    'reg_alpha': 7.851967963410157,
    'reg_lambda': 37.68751615993716,
    'subsample': 0.7655471206521518,
}

# Fixed training options: multi-class softmax objective, the number of classes
# that exist in this dataset, GPU histogram tree method and multi-class
# log-loss as the evaluation metric.
params.update(
    objective='multi:softmax',
    num_class=6,
    tree_method="gpu_hist",
    eval_metric='mlogloss',
)
    
# Train with the hard-coded `params` on whatever X_train/y_train currently
# hold, using (X_test, y_test) for early stopping.
# NOTE: `best_model_params` from the hyperopt search is not used here.
xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train,
          early_stopping_rounds=200,
          eval_set=[(X_test,y_test)],
          verbose=False)

In [None]:
# Completes the hyperopt result with the fixed training options: integer tree
# depth, multi-class softmax objective, the number of classes that exist in
# this dataset, GPU histogram tree method and multi-class log-loss.
best_model_params.update(
    max_depth=int(best_model_params["max_depth"]),
    objective='multi:softmax',
    num_class=6,
    tree_method="gpu_hist",
    eval_metric='mlogloss',
)

In [None]:
# Class predictions for the test rows from the classifier bound to `xgb`.
predictions = xgb.predict(test)

In [None]:
# Write the predictions into the submission template's target column.
submission["Cover_Type"] = predictions

# Checks the submission file before saving
submission

In [None]:
# Saves test predictions (overwrites the previously written submission.csv)
submission.to_csv("./submission.csv", index=False) # 0.92128

In [None]:
# Release the prediction array.
del predictions

In [None]:
# Trains one model per cross-validation fold and stores each model's
# predictions for the test set.

test_predictions = []

for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
    print("fold", fold)

    # Slice this fold's rows. Without this, every iteration would refit on the
    # same notebook-level X_train/X_test and yield five copies of one model.
    X_fold_train, y_fold_train = X.iloc[train_index], y.iloc[train_index]
    X_fold_val, y_fold_val = X.iloc[val_index], y.iloc[val_index]

    fold_model = XGBClassifier(**best_model_params)
    fold_model.fit(X_fold_train, y_fold_train,
          early_stopping_rounds=200,
          eval_set=[(X_fold_val, y_fold_val)],
          verbose=False)

    predictions = fold_model.predict(test)

    test_predictions.append(predictions)

    # Free the per-fold objects before the next iteration.
    del predictions, fold_model, X_fold_train, y_fold_train, X_fold_val, y_fold_val

In [None]:
# Show the collected prediction arrays (one array per fold).
test_predictions

In [None]:
#Predictions stored against each cross validation iteration finally gets aeveraged
# and target column is set with that averaged predictions
submission["Cover_Type"] = np.mean(np.column_stack(test_predictions), axis=1)
submission["Cover_Type"] = submission["Cover_Type"].astype("int32")
# Checks for sumbission file before saving
submission

In [None]:
# Saves test predictions
submission.to_csv("./submission.csv", index=False) #0.92122