In [None]:
import os
import gc
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import cudf 

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.cluster import MiniBatchKMeans
from sklearn.ensemble import IsolationForest
from scipy import stats

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

# Load Data

In [None]:
%%time
train = cudf.read_csv('../input/tabular-playground-series-dec-2021/train.csv').set_index("Id")
test = cudf.read_csv('../input/tabular-playground-series-dec-2021/test.csv').set_index("Id")
pseudo = cudf.read_csv('../input/tps12-pseudolabels/tps12-pseudolabels_v2.csv').set_index("Id")

sample_submission = pd.read_csv("../input/tabular-playground-series-dec-2021/sample_submission.csv")
feature_cols = test.columns.tolist()

In [None]:
# Bar chart of how many training rows fall into each Cover_Type, annotated with the counts.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=train["Cover_Type"].to_pandas(), ax=ax)
ax.bar_label(ax.containers[0])
plt.show()

In [None]:
# Columns that are neither Soil_Type* nor Wilderness_Area* are handled as continuous features.
_indicator_prefixes = ("Soil_Type", "Wilderness_Area")
cnt_cols = [col for col in feature_cols if not col.startswith(_indicator_prefixes)]
train[cnt_cols].describe().T

# Reduce Memory Usage

In [None]:
# Downcast every input column in all three frames:
# continuous columns become float32, the remaining indicator columns become bool.
for frame in (train, pseudo, test):
    for col in feature_cols:
        target_dtype = "float32" if col in cnt_cols else "bool"
        frame[col] = frame[col].astype(target_dtype)

# The target only needs a small integer type.
train["Cover_Type"] = train["Cover_Type"].astype("int8")

In [None]:
# Convert the cuDF frames to pandas; the steps below are written against pandas frames.
train, test, pseudo = (frame.to_pandas() for frame in (train, test, pseudo))

# Stack training and pseudo-label rows; `ds` records the origin (0 = train, 1 = pseudo)
# so the two sets can be separated again before modelling.
all_df = pd.concat([train.assign(ds=0), pseudo.assign(ds=1)])

# Isolation Forest (Outlier Detection)
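The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature. Its predictions are `-1` for outliers and `1` for inliers; that encoding is stored in the `outlier_isf` column below.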

In [None]:
%%time
isf = IsolationForest(random_state=42)
all_df["outlier_isf"] = isf.fit_predict(all_df[feature_cols])
test["outlier_isf"] = isf.predict(test[feature_cols])

print(all_df["outlier_isf"].value_counts())
print(test["outlier_isf"].value_counts())

In [None]:
# Store the outlier flag compactly as int8 in both frames.
for frame in (all_df, test):
    frame["outlier_isf"] = frame["outlier_isf"].astype("int8")

In [None]:
# Cover_Type distribution of the training rows flagged as outliers.
# The mask comes from all_df and is applied to train via index alignment.
outlier_targets = train.loc[all_df.outlier_isf==-1,"Cover_Type"]

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=outlier_targets, ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title("Outliers Count Isolation Forest")
plt.show()

In [None]:
# Both frames have been scored, so drop the fitted forest and run a garbage-collection pass.
del isf
_ = gc.collect()

In [None]:
# Register the outlier flag as a model input.
# NOTE: this mutates the list in place, so re-running this cell on its own adds the name twice.
feature_cols.append("outlier_isf")

# Scale Data

In [None]:
# Standardise the continuous columns. The scaler statistics come from train + pseudo rows only
# and are re-used for the test rows. `x` / `t` are scaled working copies; all_df / test stay unscaled.
sc = StandardScaler().fit(all_df[cnt_cols])

x = all_df.copy()
x[cnt_cols] = sc.transform(all_df[cnt_cols])

t = test.copy()
t[cnt_cols] = sc.transform(test[cnt_cols])

# MiniBatch KMeans

The MiniBatchKMeans is a variant of the KMeans algorithm which uses mini-batches to reduce the computation time, while still attempting to optimise the same objective function. Mini-batches are subsets of the input data, randomly sampled in each training iteration. These mini-batches drastically reduce the amount of computation required to converge to a local solution. In contrast to other algorithms that reduce the convergence time of k-means, mini-batch k-means produces results that are generally only slightly worse than the standard algorithm.

In [None]:
%%time
n_clusters = 14
cd_feature = False # cluster distance instead of cluster number  

kmeans = MiniBatchKMeans(n_clusters=n_clusters, max_iter=300, batch_size=256*5, random_state=42)

if cd_feature:
    cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]
    
    X_cd = kmeans.fit_transform(x[feature_cols])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=x.index)
    all_df = all_df.join(X_cd)
    
    X_cd = kmeans.transform(t[feature_cols])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=t.index)
    test = test.join(X_cd)

else:
    cluster_cols = ["cluster"]  
    all_df["cluster"] = kmeans.fit_predict(x[feature_cols])
    test["cluster"] = kmeans.predict(t[feature_cols])
    

feature_cols += cluster_cols

train.head()

In [None]:
# Rows per cluster, split by Cover_Type.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x="cluster", data=all_df, hue="Cover_Type", ax=ax)
ax.set_xlabel("Clusters")
plt.show()

# PCA

In [None]:
# The cluster column(s) were written to all_df / test only; mirror them into the scaled
# working copies so that x[feature_cols] / t[feature_cols] resolve in the PCA step.
x[cluster_cols] = all_df[cluster_cols].copy()
t[cluster_cols] = test[cluster_cols].copy()

In [None]:
# Project the scaled feature matrix onto its first two principal components.
# The projection is fitted on train + pseudo rows and applied unchanged to the test rows.
pca = PCA(n_components=2, random_state=42)
X_pca = pd.DataFrame(pca.fit_transform(x[feature_cols]), index=x.index)
T_pca = pd.DataFrame(pca.transform(t[feature_cols]), index=t.index)

# Name the components PC1, PC2, ...
pca_cols = [f"PC{k}" for k in range(1, X_pca.shape[1] + 1)]
X_pca.columns = pca_cols
T_pca.columns = pca_cols

# Attach the components column-wise to the unscaled modelling frames.
all_df = pd.concat([all_df, X_pca], axis=1)
test = pd.concat([test, T_pca], axis=1)
all_df.head()

In [None]:
# The scaled copies and the raw component frames are no longer needed; `pca` itself is kept
# because the loadings cell below reads pca.components_.
del x, t, X_pca, T_pca
_ = gc.collect()

In [None]:
# One row per principal component, one column per input feature; bars are drawn on a fixed [-1, 1] scale.
# Must run before PC1 / PC2 are added to feature_cols, otherwise the column count no longer matches.
loadings = pd.DataFrame(pca.components_, index=pca_cols, columns=feature_cols)
loadings.style.bar(align='mid', color=['#d65f5f', '#5fba7d'], vmin=-1.0, vmax=1.0)

In [None]:
# Rows in the PC1 / PC2 plane, coloured by Cover_Type.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=all_df, x="PC1", y="PC2", hue="Cover_Type", alpha=0.8, palette="deep", ax=ax)
plt.show()

In [None]:
# Register both principal components as model inputs.
# NOTE: extends the list in place, so re-running this cell on its own adds the names twice.
feature_cols += ["PC1", "PC2"]

In [None]:
def _add_pc2_band_flags(frame):
    """Add four boolean columns that mark which fixed PC2 interval each row falls into."""
    pc2 = frame["PC2"]
    frame["likely_type3"] = pc2 < -2.2
    frame["likely_type2"] = (pc2 < 0) & (pc2 > -2.2)
    frame["likely_type7"] = pc2 > 3.9
    frame["likely_type1"] = (pc2 > 1) & (pc2 < 3.9)


# Same thresholds for the modelling frame and the test frame.
for frame in (all_df, test):
    _add_pc2_band_flags(frame)

In [None]:
# Register the four PC2-interval flags as model inputs.
# NOTE: extends the list in place, so re-running this cell on its own adds the names twice.
feature_cols += ["likely_type3", "likely_type2", "likely_type7", "likely_type1"]

## Add Other Features

Thanks to Luca for [this discussion](https://www.kaggle.com/c/tabular-playground-series-dec-2021/discussion/291839).

In [None]:
def r(x):
    """Shift an angle by 180: subtract 180 when x + 180 would exceed 360, otherwise add 180."""
    return x - 180 if x + 180 > 360 else x + 180

for frame in (all_df, test):
    # Aspect2 is derived from the raw Aspect values, i.e. before the wrap-around below.
    frame['Aspect2'] = frame.Aspect.map(r)

    # Wrap Aspect once in each direction: negatives move up by 360, values above 359 move down by 360.
    frame.loc[frame["Aspect"] < 0, "Aspect"] += 360
    frame.loc[frame["Aspect"] > 359, "Aspect"] -= 360

In [None]:
# Flag name -> source column; each flag is True where the source value is negative.
negative_flags = {
    'Highwater': 'Vertical_Distance_To_Hydrology',
    'DistHydro': 'Horizontal_Distance_To_Hydrology',
    'DistRoad': 'Horizontal_Distance_To_Roadways',
    'DistFire': 'Horizontal_Distance_To_Fire_Points',
}

for frame in (all_df, test):
    for flag, source in negative_flags.items():
        frame[flag] = frame[source] < 0
    # Separate indicator for an exact zero in Hillshade_3pm.
    frame['Hillshade_3pm_is_zero'] = frame['Hillshade_3pm'] == 0

In [None]:
# Interaction terms: Elevation multiplied by the roadway distance and by the vertical hydrology distance.
for frame in (all_df, test):
    elevation = frame['Elevation']
    frame['EHiElv'] = frame['Horizontal_Distance_To_Roadways'] * elevation
    frame['EViElv'] = frame['Vertical_Distance_To_Hydrology'] * elevation

In [None]:
# Elevation minus the vertical hydrology distance, and minus 0.2 x the horizontal hydrology distance.
for frame in (all_df, test):
    frame['EVDtH'] = frame.Elevation - frame.Vertical_Distance_To_Hydrology
    frame['EHDtH'] = frame.Elevation - frame.Horizontal_Distance_To_Hydrology*0.2

In [None]:
for frame in (all_df, test):
    hydro = frame['Horizontal_Distance_To_Hydrology']
    fire = frame['Horizontal_Distance_To_Fire_Points']
    road = frame['Horizontal_Distance_To_Roadways']

    # Straight-line distance to hydrology from its horizontal and vertical components.
    frame['Distanse_to_Hydrolody'] = (hydro**2 + frame['Vertical_Distance_To_Hydrology']**2)**0.5

    # Pairwise sums and absolute differences of the three horizontal distances.
    # Hydro_Fire_1 is the only sum stored without taking the absolute value.
    frame['Hydro_Fire_1'] = hydro + fire
    frame['Hydro_Fire_2'] = (hydro - fire).abs()
    frame['Hydro_Road_1'] = (hydro + road).abs()
    frame['Hydro_Road_2'] = (hydro - road).abs()
    frame['Fire_Road_1'] = (fire + road).abs()
    frame['Fire_Road_2'] = (fire - road).abs()

In [None]:
for frame in (all_df, test):
    # new_f1: Elevation plus the roadway and fire-point distances.
    frame["new_f1"] = frame["Elevation"] + frame["Horizontal_Distance_To_Roadways"] + frame["Horizontal_Distance_To_Fire_Points"]
    # new_f2: noon + 3pm hillshade minus 9am hillshade (computed before the hillshade clamp further down).
    frame["new_f2"] = (frame["Hillshade_Noon"] + frame["Hillshade_3pm"]) - frame["Hillshade_9am"]

In [None]:
# Clamp the three hillshade columns to [0, 255] in both frames.
for frame in (all_df, test):
    for col in ("Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"):
        frame[col] = frame[col].clip(lower=0, upper=255)

In [None]:
# Register the engineered columns as model inputs. DistHydro, DistRoad and DistFire are
# created above but are not part of this list.
# NOTE: extends the list in place, so re-running this cell on its own adds the names twice.
feature_cols.extend([
    "new_f1", "new_f2", "Aspect2", "Highwater", "EVDtH", "EHDtH", 'EHiElv', 'EViElv',
    'Hillshade_3pm_is_zero', "Distanse_to_Hydrolody",
    "Hydro_Fire_1", "Hydro_Fire_2", "Hydro_Road_1", "Hydro_Road_2", "Fire_Road_1", "Fire_Road_2",
])

# Mutual Information

In [None]:
%%time
x = all_df.iloc[:5000,:][feature_cols].copy()
y = train.iloc[:5000,:]['Cover_Type'].copy()
mi_scores = mutual_info_regression(x, y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# Horizontal bars for the `top` highest mutual-information scores (mi_scores is already sorted descending).
top = 20
strongest = mi_scores.head(top)

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=strongest.values, y=strongest.index, palette="summer", ax=ax)
ax.set_title(f"Top {top} Strong Relationships Between Feature Columns and Target Column")
ax.set_xlabel("Relationship with Target")
ax.set_ylabel("Feature Columns")
plt.show()

# XGBoost

In [None]:
# Append nine extra copies of every Cover_Type == 5 row and renumber the index from 0.
# Presumably this gives the stratified 5-fold split enough members of that class — confirm.
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
minority_rows = all_df[all_df["Cover_Type"] == 5]
all_df = pd.concat([all_df] + [minority_rows] * 9, ignore_index=True)

In [None]:
# Separate the rows again by origin (ds: 0 = train, 1 = pseudo) and drop the marker column.
# Only train gets a fresh 0..n-1 index; the fold assignment below writes to it by label.
train = all_df[all_df["ds"].eq(0)].drop(columns="ds").reset_index(drop=True)
pseudo = all_df[all_df["ds"].eq(1)].drop(columns="ds")

In [None]:
# train and pseudo now hold all the rows, so the combined frame can be released.
del all_df

In [None]:
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# -1 means "belongs to no validation fold"; pseudo rows keep it and are never validated on.
train["kfold"] = -1
pseudo["kfold"] = -1

# Record, for each training row, the fold in which it serves as validation data.
# The positions from split() are valid .loc labels because train has a fresh 0..n-1 index.
for fold, (_, valid_rows) in enumerate(kf.split(train, train["Cover_Type"])):
    train.loc[valid_rows, "kfold"] = fold

In [None]:
# Shift the labels down by one for training; the prediction cell adds the one back.
# NOTE: not idempotent — re-running this cell shifts the labels again.
for frame in (train, pseudo):
    frame["Cover_Type"] = frame["Cover_Type"] - 1

In [None]:
%%time
final_test_predictions = []
scores = []

for fold in range(folds):
    x_train = train[train.kfold != fold].copy()
    x_valid = train[train.kfold == fold].copy()
    x_test  = test[feature_cols].copy()
    
    x_train = pd.concat([x_train, pseudo],axis=0)
    
    y_train = x_train['Cover_Type']
    y_valid = x_valid['Cover_Type']

    x_train = x_train[feature_cols]
    x_valid = x_valid[feature_cols]

    xgb_params = {
        'objective': 'multi:softmax',
        'tree_method': 'gpu_hist', 
        'use_label_encoder':False,
        'seed': 42, 
        'eval_metric': ['mlogloss', 'merror'],
        'predictor': 'gpu_predictor',
        'learning_rate': .09,
        'max_depth': 0,
        'subsample': .15,
        'sampling_method': 'gradient_based',
        'seed': 42,
        'grow_policy': 'lossguide',
        'max_leaves': 255,
        'lambda': 100,
    }


    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    preds_train = xgb_model.predict(x_train)
    preds_valid = xgb_model.predict(x_valid)
    acc_train = accuracy_score(y_train, preds_train)
    acc = accuracy_score(y_valid, preds_valid)
    print(f"Fold {fold}, train: {acc_train:.6f}, valid: {acc:.6f}")
    scores.append(acc)

    preds_test = xgb_model.predict(x_test)
    final_test_predictions.append(preds_test)

print("AVG ACC:",np.mean(scores))

# Feature Importance

In [None]:
# Feature importances of `xgb_model`, i.e. the model left over from the last fold of the loop above.
# Built directly from the two sequences so the importances stay numeric instead of being
# round-tripped through a string array and parsed back with pd.to_numeric.
d = pd.DataFrame({
    'feature': feature_cols,
    'importance': xgb_model.feature_importances_,
}).sort_values('importance', ascending=False)

In [None]:
# Ten most important features (d is already sorted in descending order).
fig, ax = plt.subplots(figsize=(20, 8))
sns.barplot(y="feature", x="importance", data=d.head(10), ax=ax)
plt.show()

# Plot Prediction

In [None]:
# Stack the per-fold predictions (one row per fold) and take the most frequent class per test row.
fold_votes = np.array(final_test_predictions)
majority = stats.mode(fold_votes)[0]

# Undo the "- 1" applied to the labels before training.
preds_test = majority.squeeze() + 1

In [None]:
# Distribution of the final predicted classes, annotated with the counts.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=preds_test, ax=ax)
ax.set_title("Predictions")
ax.set_xlabel("Cover Type")
ax.bar_label(ax.containers[0])
plt.show()

# Submission

In [None]:
# Put the voted labels into the submission template and write it out.
# Assumes the template rows are in the same order as `test` — TODO confirm.
sample_submission = sample_submission.assign(Cover_Type=preds_test)
sample_submission.to_csv("submission.csv", index=False)
sample_submission.head()