In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every column and every row when a DataFrame is displayed.
# The fully qualified 'display.*' keys are required: the short forms 'max_columns' /
# 'max_rows' also match 'styler.render.max_columns' / 'styler.render.max_rows' on
# newer pandas, and set_option raises OptionError when a pattern matches several keys.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

#Warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load both data sets and use the 'Id' column as the row index.
train = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/train.csv").set_index('Id')
test = pd.read_csv("/kaggle/input/tabular-playground-series-dec-2021/test.csv").set_index('Id')

# Report (rows, columns) of the training frame, then of the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# General information about the training frame (pandas' DataFrame.info report)
train.info()

In [None]:
# Summary statistics of the training columns
train.describe()

In [None]:
# Summary statistics of the test columns, for comparison with the training set
test.describe()

In [None]:
# Check for missing values: total number of NaN cells in train, then in test
for frame in (train, test):
    print(frame.isna().sum().sum())


In [None]:
# There are no missing values in both datasets.

# However, some values look suspicious and are possibly wrong:
# 1) Some 'Aspect' (azimuth) values are outside the range [0, 360)
# 2) Some 'Slope' values are negative. They should be in range [0, 90]
# 3) Data, related to distance features also contains negative values
# 4) Hillshade data is out of range [0, 255]
# 5) Categorical features 'Soil_Type7' and 'Soil_Type15' should be removed,
#     because all their values are zero in both data sets

# So we need some preprocessing to fix these inconsistencies.

In [None]:
def print_wrong_distance(train, test, feature):
    """Print the percentage of negative `feature` values in both data sets.

    Args:
        train: Training DataFrame containing the column `feature`.
        test: Test DataFrame containing the column `feature`.
        feature: Name of the distance column to check.

    Prints a header line, one 'Train:' and one 'Test:' percentage line
    (two decimals) and a trailing blank line. Returns None.
    """
    print(f'Wrong "{feature}" data')
    # Both labels carry a colon so the report lines up with print_wrong_hillshade's output.
    for label, frame in (('Train', train), ('Test', test)):
        negative_rows = int((frame[feature] < 0).sum())
        print('%s:\t%.2f%%' % (label, negative_rows / frame.shape[0] * 100))
    print()

    
def print_wrong_hillshade(train, test, feature):
    """Print the percentage of `feature` values outside [0, 255] in both data sets.

    Args:
        train: Training DataFrame containing the column `feature`.
        test: Test DataFrame containing the column `feature`.
        feature: Name of the hillshade column to check.

    Prints a header line, the train and test percentages (two decimals)
    and a trailing blank line. Returns None.
    """
    print(f'Wrong "{feature}" data')
    for line_format, frame in (('Train:\t%.2f%%', train), ('Test:\t%.2f%%', test)):
        values = frame[feature]
        out_of_range = values.lt(0) | values.gt(255)
        share = int(out_of_range.sum()) / frame.shape[0] * 100
        print(line_format % share)
    print()

In [None]:
# Share of out-of-range ("wrong") values, in percent, for both data sets

# 'Aspect' is expected in [0, 360) and 'Slope' in [0, 90]; each entry maps a
# feature to a function that flags the values violating its range.
range_violations = {
    'Aspect': lambda values: (values < 0) | (values >= 360),
    'Slope': lambda values: (values < 0) | (values > 90),
}
for feature, is_wrong in range_violations.items():
    print(f'Wrong "{feature}" data')
    for line_format, frame in (('Train:\t%.2f%%', train), ('Test:\t%.2f%%', test)):
        wrong_rows = int(is_wrong(frame[feature]).sum())
        print(line_format % (wrong_rows / frame.shape[0] * 100))
    print()

# Column groups selected by name; they are reused by later cells.
columns = list(train.columns)
dist_cols = [name for name in columns if 'Distance' in name]
hshade_cols = [name for name in columns if 'Hillshade' in name]

# Distance columns are reported first, then the hillshade columns.
for name in dist_cols:
    print_wrong_distance(train, test, name)
for name in hshade_cols:
    print_wrong_hillshade(train, test, name)



In [None]:
# Nearly 15% of the "Vertical_Distance_To_Hydrology" values are flagged as wrong!

In [None]:
# Target 'Cover_Type' distribution
# NOTE(review): the target is taken to be the last column of `train` — this assumes
# 'Cover_Type' is the final column of train.csv; confirm against the data.
target = train.columns[-1]
print('Absolute values:')
print(train.value_counts(subset=target))
print()
print('Values in percent:')
# normalize=True yields fractions, so scale by 100 to get percentages
print((train.value_counts(subset=target, normalize=True) * 100))
# Histogram of the target values
train[target].hist(bins=20)

In [None]:
# There is only one row with Cover_Type == 5! Show it.
train[train[target] == 5].head()

In [None]:
# Non-categorical features: the three terrain columns plus every distance and
# hillshade column collected earlier
noncat = ['Elevation', 'Aspect', 'Slope'] + dist_cols + hshade_cols
# Description by target for non-categorical features
train[[target] + noncat].groupby(target).describe()

In [None]:
# Data distribution for non-categorical features (one histogram per column)
train[noncat].hist(bins=90, figsize=(20,15))

In [None]:
# Correlation for non-categorical features
# The target column is included so its correlation with every feature is shown too.
train_noncat = train[[target] + noncat]
corr = train_noncat.corr()
# Render as a color-coded table with 4 decimal places
corr.style.background_gradient(cmap='coolwarm').format(precision = 4)

In [None]:
# Correlation for the different Cover_Type values separately (one-vs-rest):
# for each class the target is replaced by a 0/1 indicator of that class and the
# correlation matrix is computed against the non-categorical features.
# corrs[i] holds the matrix for Cover_Type == i + 1.
max_ct = train_noncat[target].max()
corrs = []
for ct in range(1, max_ct + 1):
    is_ct = (train_noncat[target] == ct).astype(int)
    # assign() returns a new frame with the target replaced in its original
    # position, so train_noncat itself is left untouched.
    corrs.append(train_noncat.assign(**{target: is_ct}).corr())


In [None]:
# Show the one-vs-rest correlation table of every Cover_Type in turn.
# Looping over `corrs` instead of indexing corrs[0] ... corrs[6] by hand keeps the
# output in sync with however many matrices were actually computed.
# `display` is the rich-output function IPython makes available in notebook cells.
for ct, corr_ct in enumerate(corrs, start=1):
    print(f'Correlation for "Cover_Type" equal to {ct}')
    display(corr_ct.style.background_gradient(cmap='coolwarm').format(precision=4))

In [None]:
# Observations on the correlations of the non-categorical features:
# 1) Our target 'Cover_Type' is highly correlated with the 'Elevation' feature
# 2) There is a relatively high correlation between 'Elevation' and 'Horizontal_Distance_To_Roadways'
# 3) The correlation is very weak for 'Cover_Type' equal to 4 and 5.

In [None]:
# Collect categorical features, split into wilderness-area and soil-type columns.
# 'Wilderness' is tested first, so a column name can land in only one of the groups.
cat_area = [name for name in train.columns if 'Wilderness' in name]
cat_soil = [name for name in train.columns if 'Wilderness' not in name and 'Soil' in name]

# Drop 'Soil_Type7' and 'Soil_Type15': noted earlier as all-zero in both data sets.
for all_zero_soil in ('Soil_Type7', 'Soil_Type15'):
    cat_soil.remove(all_zero_soil)

cat_cols = cat_area + cat_soil
print(cat_area)
print(cat_soil)



In [None]:
# Correlation between the target and the categorical (wilderness / soil) columns,
# rendered as a color-coded table with 4 decimal places
corr_cat = train[[target] + cat_cols].corr()
corr_cat.style.background_gradient(cmap='coolwarm').format(precision = 4)

In [None]:
# Notable correlations involving the categorical features:
# 1) 'Wilderness_Area3' is highly correlated with the other areas
# 2) 'Soil_Type3' and 'Wilderness_Area4'
# 3) 'Soil_Type6' and 'Wilderness_Area4'
# 4) 'Soil_Type10' and areas 1, 3, 4
# 5) 'Soil_Type29' and areas 1, 3

In [None]:
import gc

def reduce_memory(df):
    """Downcast the numeric columns of `df` to smaller dtypes.

    float64/float32 columns are passed to pd.to_numeric with downcast='float',
    int64/int32/int16 columns with downcast='integer'; every other column is
    left as it is.

    Args:
        df: DataFrame to shrink. Its columns are replaced on the frame itself.

    Returns:
        The same `df` object, after the downcast.
    """
    float_dtypes = ('float64', 'float32')
    int_dtypes = ('int64', 'int32', 'int16')

    for name in df.columns:
        dtype = df[name].dtype
        if dtype in float_dtypes:
            downcast_to = 'float'
        elif dtype in int_dtypes:
            downcast_to = 'integer'
        else:
            # Not a dtype this helper downcasts
            continue
        df[name] = pd.to_numeric(df[name], downcast=downcast_to)

    # Run a garbage-collection pass once all columns have been replaced
    gc.collect()

    return df

In [None]:
# Downcast the numeric columns of both frames to cut memory usage
train = reduce_memory(train)
test = reduce_memory(test)

# Save original files in binary format
# reset_index() turns the index back into a regular column before writing
train.reset_index().to_feather('train.feather')
test.reset_index().to_feather('test.feather')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
train.info(verbose=False, memory_usage='deep')
print()
test.info(verbose=False, memory_usage='deep')