In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV, StratifiedKFold
from sklearn.decomposition import PCA
from mlxtend.classifier import StackingCVClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer

In [2]:
def offline_evaluation(pred):
    X_train = pd.read_csv("data/train.csv")
    X_test = pd.read_csv("data/test.csv")
    original_data = pd.read_csv("original_data/covtype.csv")
    X_train.drop("Id", axis=1, inplace=True)
    test_ID = X_test["Id"]
    X_test.drop("Id", axis=1, inplace=True)
    y_train = np.array(X_train['Cover_Type'])
    X_train.drop('Cover_Type', axis=1, inplace=True)
    num_train = X_train.shape[0]
    all_data = pd.concat([X_train, X_test])
    all_data = pd.merge(all_data, original_data, how="left")
    original_y_train = all_data["Cover_Type"][:num_train]
    original_y_test = all_data["Cover_Type"][num_train:]
    assert np.all(original_y_train == y_train)
    print(accuracy_score(original_y_test, pred))

In [3]:
# obtain test set class distribution through probing the leaderboard 
class_weight = {1: 0.370530,
                2: 0.496810,
                3: 0.059365,
                4: 0.001037,
                5: 0.012958,
                6: 0.026873,
                7: 0.032427}

In [4]:
def balanced_accuracy_score(y_true, y_pred):
    return accuracy_score(y_true, y_pred, sample_weight=[class_weight[label] for label in y_true])
balanced_accuracy_scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)
my_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [5]:
X_train = pd.read_csv("data/train.csv")
X_test = pd.read_csv("data/test.csv")

In [6]:
X_train.drop("Id", axis=1, inplace=True)
test_ID = X_test["Id"]
X_test.drop("Id", axis=1, inplace=True)

In [7]:
y_train = np.array(X_train['Cover_Type'])
X_train.drop('Cover_Type', axis=1, inplace=True)

In [8]:
assert np.all(X_train.loc[:, "Wilderness_Area1": "Wilderness_Area4"].sum(axis=1) == 1)
assert np.all(X_train.loc[:, "Soil_Type1": "Soil_Type40"].sum(axis=1) == 1)

In [9]:
num_train = X_train.shape[0]
all_data = pd.concat([X_train, X_test])

In [10]:
pca = PCA(n_components=0.95).fit(all_data)
pca_trans = pca.transform(all_data)
pca_trans.shape

(581012, 2)

In [11]:
for i in range(pca_trans.shape[1]):
    all_data["pca" + str(i)] = pca_trans[:, i]

In [12]:
all_data["Degree_To_Hydrology"] = (np.arctan((all_data["Vertical_Distance_To_Hydrology"] + np.finfo("float64").eps) /
                                             (all_data["Horizontal_Distance_To_Hydrology"] + np.finfo("float64").eps)))
all_data["Distance_to_Hydrology"] = (np.square(all_data["Vertical_Distance_To_Hydrology"]) +
                                               np.square(all_data["Vertical_Distance_To_Hydrology"]))

In [13]:
hillshade_cols = ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]
all_data["Hillshade_mean"] = all_data[hillshade_cols].mean(axis=1)
all_data["Hillshade_std"] = all_data[hillshade_cols].std(axis=1)

In [14]:
cols = ["Horizontal_Distance_To_Hydrology",  "Horizontal_Distance_To_Roadways", "Horizontal_Distance_To_Fire_Points"]
names = ["H", "R", "F"]
for i in range(len(cols)):
    for j in range(i + 1, len(cols)):
        all_data["Horizontal_Distance_combination_" + names[i] + names[j] + "_1"] = all_data[cols[i]] + all_data[cols[j]]
        all_data["Horizontal_Distance_combination_" + names[i] + names[j] + "_2"] = (all_data[cols[i]] + all_data[cols[j]]) / 2
        all_data["Horizontal_Distance_combination_" + names[i] + names[j] + "_3"] = all_data[cols[i]] - all_data[cols[j]]
        all_data["Horizontal_Distance_combination_" + names[i] + names[j] + "_4"] = np.abs(all_data[cols[i]] - all_data[cols[j]])
all_data["Horizontal_Distance_mean"] = all_data[cols].mean(axis=1)

In [15]:
all_data["Elevation_Hydrology_1"] = all_data["Elevation"] + all_data["Vertical_Distance_To_Hydrology"]
all_data["Elevation_Hydrology_2"] = all_data["Elevation"] - all_data["Vertical_Distance_To_Hydrology"]

In [16]:
X_train = all_data[:num_train]
X_test = all_data[num_train:]

In [17]:
clf = ExtraTreesClassifier(n_estimators=250, random_state=0, n_jobs=-1)
scores = cross_validate(clf, X_train, y_train, cv=my_cv,
                        fit_params={"sample_weight":[class_weight[label] for label in y_train]},
                        scoring=balanced_accuracy_scorer, return_train_score=True)
print(np.mean(scores["train_score"]), np.std(scores["train_score"]))
print(np.mean(scores["test_score"]), np.std(scores["test_score"]))

1.0 0.0
0.8021838712962964 0.011155906736295742


In [18]:
clf.fit(X_train, y_train, sample_weight=[class_weight[label] for label in y_train])
pred = clf.predict(X_test)
offline_evaluation(pred)

0.8131145165508613


In [19]:
# lgtm_class_weight = {i: w for i, w in enumerate(class_weight.values())}
# clf = lgb.LGBMClassifier(n_estimators=600, random_state=0, n_jobs=-1, class_weight=lgtm_class_weight)
# scores = cross_validate(clf, X_train, y_train, cv=my_cv,
#                         scoring=balanced_accuracy_scorer, return_train_score=True)
# print(np.mean(scores["train_score"]), np.std(scores["train_score"]))
# print(np.mean(scores["test_score"]), np.std(scores["test_score"]))

In [20]:
# clf.fit(X_train, y_train)
# pred = clf.predict(X_test)
# offline_evaluation(pred)

In [21]:
clf = lgb.LGBMClassifier(n_estimators=600, random_state=0, n_jobs=-1)
scores = cross_validate(clf, X_train, y_train, cv=my_cv,
                        fit_params={"sample_weight":[class_weight[label] for label in y_train]},
                        scoring=balanced_accuracy_scorer, return_train_score=True)
print(np.mean(scores["train_score"]), np.std(scores["train_score"]))
print(np.mean(scores["test_score"]), np.std(scores["test_score"]))

1.0 0.0
0.7931799476851852 0.007347016069991456


In [22]:
clf.fit(X_train, y_train, sample_weight=[class_weight[label] for label in y_train])
pred = clf.predict(X_test)
offline_evaluation(pred)

0.8122645310412587


In [23]:
clf1 = ExtraTreesClassifier(n_estimators=250, random_state=0, n_jobs=-1)
clf2 = lgb.LGBMClassifier(n_estimators=600, random_state=0, n_jobs=-1)
clf = StackingCVClassifier(classifiers=[clf1, clf2],
                           meta_classifier=xgb.XGBClassifier(n_estimators=50, random_state=0, n_jobs=-1),
                           cv=my_cv, random_state=0, use_probas=True, use_features_in_secondary=True)
clf.fit(X_train, y_train, sample_weight=[class_weight[label] for label in y_train])
pred = clf.predict(X_test)
submission = pd.DataFrame({'Id':test_ID, 'Cover_Type':pred},
                          columns=['Id', 'Cover_Type'])
submission.to_csv("submit/v1.csv", index=False)

In [24]:
offline_evaluation(pred)

0.8621574434697787
