

Credit goes to the two notebooks this one is based on:
- [Feature engineering](https://www.kaggle.com/gulshanmishra/tps-dec-21-tensorflow-nn-feature-engineering)
- [Model training](https://www.kaggle.com/hamzaghanmi/tps-dec-step-by-step)

# Importing libraries

In [None]:
!pip install scikit-learn-intelex -q --progress-bar off

In [None]:
# Enable the scikit-learn-intelex patch for this session.
# NOTE(review): the sklearnex documentation asks for this call to happen before
# scikit-learn is imported, so keep this cell above the main import cell — confirm.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import gc
import optuna
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats

# Loading the data

In [None]:
# Read the three competition CSVs from the shared input directory.
input_dir = '../input/tabular-playground-series-dec-2021'
df_train, df_test, sample_submission = (
    pd.read_csv(f'{input_dir}/{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns (int16/int32/int64) are cast to the first of
    int8/int16/int32/int64 whose limits contain the column's min and max.
    Float columns are cast to float16 if their range fits, otherwise float32,
    otherwise float64. Columns of any other dtype are left untouched.

    Only the value *range* is checked; float downcasting can therefore lose
    precision even when the range fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, ordered from narrowest to widest.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_memory = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for int_type in int_types:
                limits = np.iinfo(int_type)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            for float_type in float_types:
                limits = np.finfo(float_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(float_type)
                    break
            else:
                # Range fits neither narrower float type (or is NaN/inf).
                df[col] = df[col].astype(np.float64)

    end_memory = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage of dataframe after reduction {end_memory} MB')
        print(f'Reduced by {100 * (start_memory - end_memory) / start_memory} % ')
    return df

In [None]:
# Shrink both raw frames right after loading (train first, then test).
df_train, df_test = reduce_mem_usage(df_train), reduce_mem_usage(df_test)

# Preprocessing

In [None]:
# Remove every training row labelled Cover_Type 5.
# NOTE(review): presumably the class is too rare for the stratified CV used
# later in the notebook — confirm.
df_train = df_train.drop(df_train[df_train['Cover_Type'] == 5].index, axis=0)

# Columns removed from both frames before any feature engineering.
columns = ['Id', 'Soil_Type7', 'Soil_Type15']
df_train = df_train.drop(columns, axis=1)
df_test = df_test.drop(columns, axis=1)

# Shorter names for the distance columns.
new_names = {
    'Horizontal_Distance_To_Hydrology': 'x_dist_hydrlgy',
    'Vertical_Distance_To_Hydrology': 'y_dist_hydrlgy',
    'Horizontal_Distance_To_Roadways': 'x_dist_rdwys',
    'Horizontal_Distance_To_Fire_Points': 'x_dist_firepts'
}

# Column groups are taken from the training frame and reused for the test
# frame so both get exactly the same aggregates.
soil_features = [c for c in df_train.columns if c.startswith('Soil_Type')]
wilderness_features = [c for c in df_train.columns if c.startswith('Wilderness_Area')]
hillshade_features = [c for c in df_train.columns if c.startswith('Hillshade')]


def _engineer_features(df, renames, soil_cols, wilderness_cols, hillshade_cols):
    """Return a renamed copy of ``df`` with cleaned and derived feature columns.

    Steps: rename the distance columns, wrap ``Aspect`` by +/-360, add the
    Manhattan distance to hydrology, add soil/wilderness indicator counts,
    clamp the three hillshade columns to [0, 255], and add their row-wise
    sum and mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Train or test frame; the caller's frame is not modified.
    renames : dict
        Old-name -> new-name mapping passed to ``DataFrame.rename``.
    soil_cols, wilderness_cols, hillshade_cols : list of str
        Column groups to aggregate row-wise.

    Returns
    -------
    pandas.DataFrame
        New frame with the added columns.
    """
    df = df.rename(renames, axis=1)

    # Wrap Aspect by one turn. `.loc` is required here: chained indexing
    # (df['Aspect'][mask] += 360) may write to a temporary copy, which raises
    # SettingWithCopyWarning and is a silent no-op under Copy-on-Write.
    df.loc[df['Aspect'] < 0, 'Aspect'] += 360
    df.loc[df['Aspect'] > 359, 'Aspect'] -= 360

    # Manhattan distance to Hydrology
    df['Manhattan_Distance_To_Hydrology'] = np.abs(df['x_dist_hydrlgy']) + np.abs(df['y_dist_hydrlgy'])

    # Euclidean distance to Hydrology (currently disabled)
    # df['Euclidean_Distance_To_Hydrology'] = (df['x_dist_hydrlgy']**2 + df['y_dist_hydrlgy']**2)**0.5

    df['soil_type_count'] = df[soil_cols].sum(axis=1)
    df['wilderness_area_count'] = df[wilderness_cols].sum(axis=1)

    # Clamp hillshade readings to the [0, 255] range.
    for shade_col in ('Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm'):
        df[shade_col] = df[shade_col].clip(lower=0, upper=255)

    df['hillshade_sum'] = df[hillshade_cols].sum(axis=1)
    df['hillshade_mean'] = df[hillshade_cols].mean(axis=1)
    return df


df_train = _engineer_features(df_train, new_names, soil_features, wilderness_features, hillshade_features)
df_test = _engineer_features(df_test, new_names, soil_features, wilderness_features, hillshade_features)

# Every test column is a model input candidate (the target exists only in train).
features = [c for c in df_test]

# Subset of features actually fed to the model; commented entries are excluded.
cols = [
    "Elevation",
    "Aspect",
    "Manhattan_Distance_To_Hydrology",
    "soil_type_count",
#     "wilderness_area_count",
#     "Slope",
    "x_dist_hydrlgy",
    "y_dist_hydrlgy",
    "x_dist_rdwys",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "x_dist_firepts",
    "hillshade_sum"]
#     "hillshade_mean"]

# Fit the scaler on train only, then apply the same transform to test.
scaler = preprocessing.StandardScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

In [None]:
# Downcast again now that the feature columns have been rewritten by the scaler.
# NOTE(review): reduce_mem_usage checks only the value range, so scaled float
# columns that fit will be stored as float16 — confirm the precision loss is acceptable.
df_train, df_test = (reduce_mem_usage(frame) for frame in (df_train, df_test))

# Model training

In [None]:
%%time

# Column selections are loop-invariant, so slice them once.
X_all = df_train[cols]
y_all = df_train['Cover_Type']
X_test = df_test[cols]

preds = []  # one array of test-set predictions per fold
acc = []    # validation accuracy per fold
kf = model_selection.StratifiedKFold(n_splits=20, random_state=42, shuffle=True)

for n, (trn_idx, test_idx) in enumerate(kf.split(X_all, y_all)):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[test_idx]
    y_tr, y_val = y_all.iloc[trn_idx], y_all.iloc[test_idx]

    model = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    model.fit(X_tr, y_tr)

    preds.append(model.predict(X_test))
    acc.append(accuracy_score(y_val, model.predict(X_val)))
    print(n, acc[n])

    # Free the fold's slices before the next iteration.
    del X_tr, X_val, y_tr, y_val
    gc.collect()

print(np.mean(acc))

# Majority vote across folds for each test row.
# `stats.mode(...)[0]` has shape (1, n_rows) on older SciPy and (n_rows,) on
# newer releases; indexing it with a further `[0]` therefore yields a single
# scalar on newer SciPy. `np.ravel` gives shape (n_rows,) on both.
fold_votes = np.vstack(preds)
prediction = np.ravel(stats.mode(fold_votes, axis=0)[0])

# Reuse the sample submission loaded earlier instead of re-reading it from disk.
sub = sample_submission.copy()
assert len(prediction) == len(sub), "one prediction is required per submission row"
sub['Cover_Type'] = prediction
sub.to_csv('submission.csv', index=False)