In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import RobustScaler
from scipy.stats import boxcox
from category_encoders import MEstimateEncoder
from sklearn.metrics import accuracy_score


%matplotlib inline

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Globals: notebook-wide switches.

# When True, the cells guarded by `if SHOW_PLOTS:` draw their exploratory figures.
SHOW_PLOTS = False
# When True, the baseline-model cell fits and scores a classifier on the raw training columns.
BASELINE_MODEL = False
# When True, the progress-check cell re-scores a random forest on the engineered features.
TEST_PROGRESS = False

# Starts as an empty set; not referenced again in the cells shown here.
exclude_features = set()  # will add to this on the go

In [None]:
# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Works on a copy of ``X``: object and categorical columns are replaced by
    their ``factorize`` integer codes, and every integer-typed column is then
    flagged as a discrete feature for ``mutual_info_regression``.

    Returns a Series named "MI Scores", indexed by column name and sorted from
    the highest score to the lowest.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for col in categorical_cols:
        codes, _uniques = encoded[col].factorize()
        encoded[col] = codes
    # After encoding, an integer dtype is what marks a feature as discrete.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)


def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` on the current matplotlib axes.

    The scores are sorted ascending before plotting and the index labels are
    used as the y tick labels.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


def score_model_cv(X, y, clf):
    """Print the 5-fold cross-validated accuracy of ``clf`` on ``(X, y)``.

    Output is the classifier's class name, the per-fold scores, their mean and
    a blank separator line.
    """
    model_name = clf.__class__.__name__
    print(model_name)
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
    for line in (
        f'Cross validation scores: {fold_scores}',
        f'Cross validation avg score: {fold_scores.mean()}',
        '',
    ):
        print(line)
    

def score_model_acc(X, y, clf):
    """Print the accuracy of ``clf``'s predictions for ``X`` against ``y``.

    ``clf`` is only asked to predict here; it is not fitted by this function.
    Output is the classifier's class name, the accuracy and a blank line.
    """
    model_name = clf.__class__.__name__
    print(model_name)
    accuracy = accuracy_score(y, clf.predict(X))
    print(accuracy)
    print('')


def target_encoding(df, encoder=None):
    """M-estimate target-encode the ``Soil_Type`` column of ``df``.

    With ``encoder=None`` the ``LABEL`` column is popped from a copy of ``df``
    and a new ``MEstimateEncoder`` (m=3.0) is fitted on it. Otherwise the given
    encoder is only applied. ``df`` itself is left untouched.

    Returns ``(encoded Soil_Type column, encoder)``.
    """
    features = df.copy()
    if encoder is not None:
        # Reuse the supplied encoder (presumably fitted by an earlier call).
        return encoder.transform(features)["Soil_Type"], encoder

    target = features.pop(LABEL)
    encoder = MEstimateEncoder(cols=["Soil_Type"], m=3.0)
    encoded_soil = encoder.fit_transform(features, target)["Soil_Type"]
    return encoded_soil, encoder
    
def feature_engineering(df, aux_df=None, encoder=None, train=True):
    """Return a copy of ``df`` with the engineered feature columns added.

    Added columns:
      * ``Distance_To_Hydrology``: Euclidean distance built from the horizontal
        and vertical distance-to-hydrology columns.
      * ``Wilderness_Aspect``: mean ``Aspect`` per ``Wilderness_Area``. Computed
        from ``df`` itself when ``train`` is True, otherwise looked up in
        ``aux_df``.
      * ``Soil_Type_new``: target-encoded ``Soil_Type`` (see ``target_encoding``).

    Parameters
    ----------
    df : DataFrame
        Frame to transform. It is not modified; the result is a new frame with
        the same index and row order. When ``encoder`` is None it must still
        contain the ``LABEL`` column, because ``target_encoding`` fits on it.
    aux_df : DataFrame, optional
        Frame that already holds ``Wilderness_Area`` and ``Wilderness_Aspect``.
        Required when ``train`` is False.
    encoder : optional
        Encoder passed through to ``target_encoding``; None fits a new one.
    train : bool
        True to compute the group means from ``df``, False to reuse ``aux_df``.

    Returns
    -------
    (DataFrame, encoder)

    Raises
    ------
    ValueError
        If ``train`` is False and ``aux_df`` is None.
    """
    # Work on a copy so the caller's frame is never mutated in place.
    df = df.copy()

    # Distance
    df['Distance_To_Hydrology'] = np.sqrt(
        df['Horizontal_Distance_To_Hydrology'] ** 2 + df['Vertical_Distance_To_Hydrology'] ** 2
    )

    # Grouping
    feature_to_transform_on = 'Aspect'
    new_col = f'Wilderness_{feature_to_transform_on}'
    if train:
        df[new_col] = df.groupby('Wilderness_Area')[feature_to_transform_on].transform('mean')
    else:  # validation / test: reuse the group means stored in aux_df
        if aux_df is None:
            raise ValueError("aux_df is required when train=False")
        if new_col in df.columns:
            df = df.drop(columns=[new_col])
        # One row per wilderness area, so the left merge cannot duplicate rows.
        group_means = aux_df[['Wilderness_Area', new_col]].drop_duplicates(subset='Wilderness_Area')
        original_index = df.index
        df = df.merge(group_means, on='Wilderness_Area', how='left')
        # merge() hands back a fresh RangeIndex. Restore the caller's index so
        # that index-aligned operations on the result (e.g. re-attaching a
        # label Series) still pair each row with its own data.
        df.index = original_index

    # target encoding
    df['Soil_Type_new'], encoder = target_encoding(df, encoder)

    return df, encoder

In [None]:
# Read the training set and shuffle its rows (fixed seed, so the order is reproducible).
training = (
    pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv')
    .sample(frac=1.0, random_state=42)
)
print(f'Training size: {len(training)}')
training.head()

In [None]:
# Read the test set; its row order is left as stored in the file.
test_csv_path = '/kaggle/input/forest-cover-type-prediction/test.csv'
test = pd.read_csv(test_csv_path)
print(f'Test size: {len(test)}')
test.head()

In [None]:
# Globals - cont: column names (defined after loading because CAT_FEATURES reads training.columns)

# Name of the target column; COVER_TYPE is a second name bound to the same string.
LABEL = COVER_TYPE = 'Cover_Type'
# Name of the row-identifier column.
ID = 'Id'
# Columns handled as numeric features by the exploration cells.
NUMERIC_FEATURES = ['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
                    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                    'Horizontal_Distance_To_Fire_Points',]

# Every other training column (not numeric, not the label, not the id) is treated as categorical.
CAT_FEATURES = [ftr for ftr in training.columns.tolist() if ftr not in (NUMERIC_FEATURES + [LABEL, ID])]

In [None]:
# Column dtypes and non-null counts of the training set.
training.info()

In [None]:
# Summary statistics of the training set's numeric columns.
training.describe()

In [None]:
# Column dtypes and non-null counts of the test set.
test.info()

In [None]:
# Summary statistics of the test set's numeric columns.
test.describe()

In [None]:
# Investigate NaNs: list the training columns that contain missing values

print('Nans:')
nan_counts = training.isnull().sum()
nan_counts.loc[nan_counts.gt(0)]

# => no NaNs

In [None]:
# baseline model: score untuned classifiers on the raw training columns

if BASELINE_MODEL:
    EVAL = 'ACC'  # ACC / CV

    rf = RandomForestClassifier(random_state=42)
    xgb = XGBClassifier(random_state=42)
    clfs = [rf]

    # NOTE(review): `training` still contains the ID column at this point, so it
    # is used as a feature here - confirm that is intended.
    baseline_X = training.drop(columns=[LABEL])
    baseline_y = training[LABEL]

    # Accuracy obtained by always predicting the most frequent class.
    print(f'Chance: {baseline_y.value_counts(normalize=True).max()}')

    if EVAL == 'CV':
        for clf in clfs:
            score_model_cv(baseline_X, baseline_y, clf)
    else:  # ACC
        # Split features and labels once, before the loop. Popping the label
        # from the split frames inside the loop raised KeyError as soon as
        # `clfs` held a second classifier. Dedicated names also keep this cell
        # from overwriting the train_df / valid_df used by later cells.
        fit_X, holdout_X, fit_y, holdout_y = train_test_split(
            baseline_X, baseline_y, test_size=0.20, random_state=42
        )
        for clf in clfs:
            clf.fit(fit_X, fit_y)
            score_model_acc(holdout_X, holdout_y, clf)

# => RF: 86.89% (assuming the df is shuffled; otherwise it's around 60%)
# => RF (Official public / private score): 72.48%

In [None]:
# explore numeric features: one histogram per numeric column

if SHOW_PLOTS:
    fig = plt.figure(figsize=(18, 16))
    for position, feature in enumerate(sorted(NUMERIC_FEATURES), start=1):
        ax = fig.add_subplot(5, 5, position)
        sns.histplot(training[feature].dropna(), ax=ax)
    fig.tight_layout(pad=1.0)

In [None]:
# Observations:
"""
Hillshade_3pm: Ok
Hillshade_9am: skewed to the right, I don't think it's problematic for RF
Hillshade_Noon: skewed to the right, I don't think it's problematic for RF
Horizontal_Distance_To_Fire_Points: skewed to the left, I don't think it's problematic for RF
Horizontal_Distance_To_Hydrology: skewed to the left, I don't think it's problematic for RF
Horizontal_Distance_To_Roadways: skewed to the left, I don't think it's problematic for RF
Slope: Ok, except a strange gap around x=25.
Vertical_Distance_To_Hydrology: High frequency around 0. Maybe take log? (after dropping negatives)
"""

In [None]:
# detect outliers: one box plot per numeric column

if SHOW_PLOTS:
    fig = plt.figure(figsize=(18, 16))
    for position, feature in enumerate(sorted(NUMERIC_FEATURES), start=1):
        ax = fig.add_subplot(5, 5, position)
        sns.boxplot(y=training[feature], ax=ax)
    fig.tight_layout(pad=1.0)

In [None]:
# Remove outliers

print(f'Training size before cleaning: {training.shape[0]}')

# A row is discarded as soon as one of these conditions holds.
low_hillshade_9am = training['Hillshade_9am'] < 50
vertical_too_large = training['Vertical_Distance_To_Hydrology'] > 500
vertical_negative = training['Vertical_Distance_To_Hydrology'] < 0  # distance is positive by definition

# Thresholds that were tried and are currently disabled:
#   Hillshade_3pm < 50, Hillshade_Noon < 180,
#   Horizontal_Distance_To_Fire_Points > 2500, Horizontal_Distance_To_Hydrology > 500,
#   Horizontal_Distance_To_Roadways > 3000, Slope > 30
rows_to_drop = low_hillshade_9am | vertical_too_large | vertical_negative
training = training[~rows_to_drop]

print(f'Training size after cleaning: {training.shape[0]}')

In [None]:
# Numerical features against label: bar plot of each numeric column per cover type

if SHOW_PLOTS:
    fig = plt.figure(figsize=(18, 10))
    for position, feature in enumerate(sorted(NUMERIC_FEATURES), start=1):
        ax = fig.add_subplot(3, 4, position)
        sns.barplot(x=LABEL, y=feature, data=training, ax=ax)
    fig.tight_layout(pad=1.0)

In [None]:
# Numerical features against label: violin plot of each numeric column per cover type

if SHOW_PLOTS:
    fig = plt.figure(figsize=(18, 10))
    for position, feature in enumerate(sorted(NUMERIC_FEATURES), start=1):
        ax = fig.add_subplot(3, 4, position)
        sns.violinplot(x=LABEL, y=feature, data=training, ax=ax)
    fig.tight_layout(pad=1.0)

In [None]:
# Correlation matrix (numerical features plus the label)

corr_columns = [*NUMERIC_FEATURES, LABEL]
temp_df = training[corr_columns]
plt.figure(figsize=(25, 10))
g = sns.heatmap(
    temp_df.corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    vmin=-1,
)

In [None]:
# Observations:
"""
Nothing is highly correlated with the label.

Highly correlated features:
Elevation: Horizontal_Distance_To_Roadways
Aspect: Hillshade_9am, Hillshade_3pm
Slope: Hillshade_Noon
Horizontal_Distance_To_Hydrology: Vertical_Distance_To_Hydrology
Horizontal_Distance_To_Roadways: Horizontal_Distance_To_Fire_Points
Hillshade_9am: Hillshade_3pm
Hillshade_Noon: Hillshade_3pm
"""

In [None]:
# Explore cat features

In [None]:
# Combine cat features (reverse get_dummies)
def _collapse_one_hot(frame, prefix):
    """Collapse the CAT_FEATURES columns starting with ``prefix`` into one categorical Series.

    Each row gets the name of the column holding its largest value; the Series
    is named ``prefix``.
    """
    one_hot_cols = [ftr for ftr in CAT_FEATURES if ftr.startswith(prefix)]
    return pd.Series(frame[one_hot_cols].idxmax(axis=1), name=prefix).astype('category')


comb_train_df = pd.concat(
    [
        training[NUMERIC_FEATURES],
        _collapse_one_hot(training, 'Wilderness_Area'),
        _collapse_one_hot(training, 'Soil_Type'),
        training[LABEL],
    ],
    axis=1,
)

# These two names stay bound to the TEST-set columns; the cells below read them.
wilderness_column = _collapse_one_hot(test, 'Wilderness_Area')
soil_column = _collapse_one_hot(test, 'Soil_Type')
comb_test_df = pd.concat([test[NUMERIC_FEATURES], wilderness_column, soil_column], axis=1)

In [None]:
# Wilderness area counts, sorted by index.
# (wilderness_column was last assigned from `test`, so these are test-set counts.)
wilderness_counts = wilderness_column.value_counts()
wilderness_counts.sort_index()

In [None]:
# Soil type counts.
# soil_order lists the names Soil_Type1 .. Soil_Type40 in numeric order; a later
# plotting cell uses it as its category order.
soil_order = ['Soil_Type' + str(num) for num in range(1, 41)]
# The counts are shown as value_counts() returns them; an alternative that
# re-indexes them by soil_order was tried and is left disabled.
soil_column.value_counts()

In [None]:
# TODO: Try to ditch soil types <= 20 with high std against label

In [None]:
# Cat features against label

if SHOW_PLOTS:
    # Wilderness: violin plot of the label for each of the four areas
    wilderness_order = ['Wilderness_Area' + str(num) for num in range(1, 5)]
    plt.figure(dpi=100, figsize=(7, 4))
    g = sns.violinplot(
        x='Wilderness_Area',
        y=LABEL,
        data=comb_train_df,
        order=wilderness_order,
    )

    # Soil: bar plot of the label per soil type, in soil_order
    plt.figure(dpi=100, figsize=(16, 8))
    g = sns.barplot(
        x='Soil_Type',
        y=LABEL,
        data=comb_train_df,
        order=soil_order,
    )
    plt.xticks(rotation=90)

In [None]:
# Mutual information (on both numerical AND cat, against label)

X = comb_train_df.drop(columns=[LABEL])
y = comb_train_df[LABEL].copy()

mi_scores = make_mi_scores(X, y)

# Show only the 20 highest-scoring features.
top_mi_scores = mi_scores.iloc[:20]
plt.figure(dpi=100, figsize=(8, 5))
plot_mi_scores(top_mi_scores)

In [None]:
# Observations:
"""
Elevation and Soil_Type have high MI with the label.
"""

## Feature Engineering

In [None]:
# split to train and validation (80 / 20); each part is copied so it can be modified freely

train_df, valid_df = (
    part.copy()
    for part in train_test_split(comb_train_df, test_size=0.20, random_state=42)
)

In [None]:
# Engineer the training-split features (Distance_To_Hydrology, Wilderness_Aspect, Soil_Type_new).
# The returned encoder is kept so the validation and test sets can reuse it.
train_df, target_enc = feature_engineering(train_df, train=True)

# Distribution of the new distance feature per cover type.
g = sns.violinplot(data=train_df, x=LABEL, y='Distance_To_Hydrology')

In [None]:
# Engineer the validation split with the group means and encoder learned on the training split.
valid_df_y = valid_df.pop(LABEL)
valid_df, _ = feature_engineering(valid_df, aux_df=train_df, encoder=target_enc, train=False)

# Re-attach the labels by position, not by index. feature_engineering's
# train=False branch goes through DataFrame.merge, which can hand back a new
# RangeIndex; an index-aligned assignment would then pair rows with the wrong
# label (or NaN). The left merge keeps the row order, so position is safe.
assert len(valid_df) == len(valid_df_y), "feature_engineering changed the number of validation rows"
valid_df[LABEL] = valid_df_y.values

print(valid_df[['Wilderness_Area', 'Aspect', 'Wilderness_Aspect']].sample(10, random_state=42))

g = sns.barplot(x=LABEL, y='Distance_To_Hydrology', data=valid_df)

In [None]:
# compare the encoded values to the target

fig, ax = plt.subplots(dpi=90)
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True).
# The label column is read from comb_train_df directly instead of relying on
# the `y` variable left behind by the mutual-information cell.
sns.histplot(comb_train_df[LABEL], stat='density', label='Cover_Type', ax=ax)
sns.kdeplot(valid_df['Soil_Type_new'], color='r', label='Soil_Type_new', ax=ax)
ax.set_xlabel('Cover_Type')
# Each artist carries its own label, so the legend cannot get them swapped.
ax.legend();

In [None]:
# test after adding 2 new features

if TEST_PROGRESS:
    X_train = pd.get_dummies(train_df)
    X_valid = pd.get_dummies(valid_df)

    y_train = X_train.pop(LABEL)
    y_valid = X_valid.pop(LABEL)

    # Give the validation matrix exactly the training columns, in the same
    # order. A dummy column missing from the validation split is added as all
    # zeros, and one unknown to the model is dropped (same idea as the
    # prediction cell's handling of the test set).
    X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)

    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    score_model_acc(X_valid, y_valid, rf)

# => 85.30% (so it actually dropped)

In [None]:
# Engineer the test set with the training-split group means and the fitted encoder.
# A copy is passed so comb_test_df keeps its original columns and this cell can be re-run safely.
test_df, _ = feature_engineering(comb_test_df.copy(), aux_df=train_df, encoder=target_enc, train=False)

In [None]:
# Prediction: fit the final random forest on the engineered training split
X_train = pd.get_dummies(train_df)
y_train = X_train.pop(LABEL)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# handling unseen data: one-hot encode the test set and force it onto the
# training columns; columns the test set lacks (and any other missing value) become 0
test_df = (
    pd.get_dummies(test_df)
    .reindex(columns=list(X_train.columns))
    .fillna(0)
)

# Write the submission file: one predicted cover type per test Id.
predictions = rf.predict(test_df)
output = pd.DataFrame({ID: test[ID], LABEL: predictions})
output.to_csv('submission.csv', index=False)
print('Submission saved')