# Major observations:
- Three fields, 'Elevation', 'Horizontal_Distance_To_Roadways' and 'Horizontal_Distance_To_Fire_Points', exhibit different distributions
  when filtered by 'Wilderness_Area4', 'Soil_Type3' and 'Cover_Type'
- Considering the distribution of the class label ('Cover_Type'), filtering by 'Soil_Type6' also results in some distinct distributions
- However, each of those filtering boolean variables is highly skewed, and I am not sure about statistical significance 
- The possible combinations of 'Wilderness_Area' flags can be enumerated, whereas doing the same for the 'Soil_Type' flags seems intractable
- Seemingly no chunking like November TPS

# Preparation

In [None]:
import pandas as pd
import numpy as np

def load_dataset(train_or_test='train'):
    """Read one split of the competition data and run `modify_features` on it.

    Args:
        train_or_test: name of the split to read, either 'train' or 'test'.
            It selects the CSV file name inside the competition input folder.

    Returns:
        The DataFrame returned by `modify_features` for the selected split.

    Raises:
        ValueError: if `train_or_test` is neither 'train' nor 'test'.
    """
    # The argument used to be ignored and train.csv was always read; reject
    # unknown names early instead of silently loading the wrong file.
    if train_or_test not in ('train', 'test'):
        raise ValueError(
            f"train_or_test must be 'train' or 'test', got {train_or_test!r}"
        )
    df = pd.read_csv(
        f'../input/tabular-playground-series-dec-2021/{train_or_test}.csv'
    )
    return modify_features(df)
    
def modify_features(df):
    """Return a cleaned, re-ordered copy of the raw competition frame.

    Steps, in order:
      * use 'Id' as the index;
      * drop rows whose 'Cover_Type' is 5 (skipped when the frame has no
        'Cover_Type' column);
      * cast the first ten columns to float64;
      * replace 'Aspect' (degrees) with 'Aspect_cos' and 'Aspect_sin';
      * drop 'Soil_Type7' and 'Soil_Type15';
      * move the two new aspect columns right after the first column.

    Args:
        df: raw frame containing an 'Id' column, 'Aspect' among its first ten
            remaining columns, and 'Soil_Type7' / 'Soil_Type15'.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Non-inplace set_index: the caller's frame must not lose its 'Id' column.
    df = df.set_index('Id')
    if 'Cover_Type' in df.columns:
        df = df[df['Cover_Type'] != 5]  # Only 1 sample for type 5. What can I do with it?!

    # astype returns a fresh frame, so the column assignments below do not
    # write into a filtered view of the input.
    leading_fields = list(df)[:10]
    df = df.astype({fieldname: 'float64' for fieldname in leading_fields})

    aspect_in_radian = df['Aspect'] * np.pi / 180.
    df['Aspect_cos'] = np.cos(aspect_in_radian)
    df['Aspect_sin'] = np.sin(aspect_in_radian)
    # 'Aspect' is superseded by its cos/sin pair; the two soil flags are
    # univalent features.
    df = df.drop(columns=['Aspect', 'Soil_Type7', 'Soil_Type15'])

    cols = list(df)
    # rearranging; 11 continuous features + 42 boolean features + 1 label
    cols = cols[0:1] + cols[-2:] + cols[1:9] + cols[9:-2]
    return df[cols]

# Load the default split, report its dimensions and preview the first rows.
df = load_dataset()
print(f'Shape: {df.shape}')
df.head()

# Histograms

In [None]:
import matplotlib.pyplot as plt

# One panel per column of df: a histogram for the first 11 columns and a bar
# chart of value counts for every later column.
fig, axs = plt.subplots(nrows=7, ncols=8, figsize=(24, 24))
panels = axs.ravel()  # row-major view, so panel k is row k // 8, column k % 8
for num, fieldname in enumerate(df):
    panel = panels[num]
    panel.set_title(f'{fieldname}')
    column = df[fieldname]
    if num >= 11:
        counts = column.value_counts().sort_index()
        if fieldname == 'Cover_Type':
            # Show every label 1..7 on the axis, including labels with no rows.
            counts = counts.reindex(list(range(1, 8)))
        panel.bar(counts.index, counts)
    else:
        panel.hist(column)

plt.tight_layout()
plt.show()
plt.close('all')

In [None]:
import matplotlib.pyplot as plt

# Compare the first 11 columns and the label between rows where the boolean
# column `filt` is 0 (blue, left y-axis) and rows where it is 1 (orange, right
# y-axis). Each group gets its own y-axis, so the two scales are independent.
filt = 'Wilderness_Area4'
flag_off = df[filt] == 0
flag_on = df[filt] == 1

fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(24, 8))
for panel, fieldname in zip(axs.ravel(), list(df)[:11]):
    values = df[fieldname]
    panel.set_title(f'{fieldname}')
    panel.hist(values[flag_off], color='C0', alpha=0.5)
    panel.twinx().hist(values[flag_on], color='C1', alpha=0.5)

# The last panel is left free by the loop above and holds the label counts.
label_panel = axs[-1][-1]
label_panel.set_title('Cover_Type')
label_values = list(range(1, 8))
counts_off = df['Cover_Type'][flag_off].value_counts().sort_index().reindex(label_values)
counts_off.plot.bar(ax=label_panel, rot=0, alpha=0.5, color='C0')
counts_on = df['Cover_Type'][flag_on].value_counts().sort_index().reindex(label_values)
counts_on.plot.bar(ax=label_panel.twinx(), rot=0, alpha=0.5, color='C1')

fig.suptitle(f'filtered by {filt}: blue=0, orange=1', fontsize=20)
plt.tight_layout()
plt.show()
plt.close('all')

In [None]:
import matplotlib.pyplot as plt

# Compare the first 11 columns and the label between rows where the boolean
# column `filt` is 0 (blue, left y-axis) and rows where it is 1 (orange, right
# y-axis). Each group gets its own y-axis, so the two scales are independent.
filt = 'Soil_Type3'
flag_off = df[filt] == 0
flag_on = df[filt] == 1

fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(24, 8))
for panel, fieldname in zip(axs.ravel(), list(df)[:11]):
    values = df[fieldname]
    panel.set_title(f'{fieldname}')
    panel.hist(values[flag_off], color='C0', alpha=0.5)
    panel.twinx().hist(values[flag_on], color='C1', alpha=0.5)

# The last panel is left free by the loop above and holds the label counts.
label_panel = axs[-1][-1]
label_panel.set_title('Cover_Type')
label_values = list(range(1, 8))
counts_off = df['Cover_Type'][flag_off].value_counts().sort_index().reindex(label_values)
counts_off.plot.bar(ax=label_panel, rot=0, alpha=0.5, color='C0')
counts_on = df['Cover_Type'][flag_on].value_counts().sort_index().reindex(label_values)
counts_on.plot.bar(ax=label_panel.twinx(), rot=0, alpha=0.5, color='C1')

fig.suptitle(f'filtered by {filt}: blue=0, orange=1', fontsize=20)
plt.tight_layout()
plt.show()
plt.close('all')

In [None]:
import matplotlib.pyplot as plt

# Compare the first 11 columns and the label between rows where the boolean
# column `filt` is 0 (blue, left y-axis) and rows where it is 1 (orange, right
# y-axis). Each group gets its own y-axis, so the two scales are independent.
filt = 'Soil_Type6'
flag_off = df[filt] == 0
flag_on = df[filt] == 1

fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(24, 8))
for panel, fieldname in zip(axs.ravel(), list(df)[:11]):
    values = df[fieldname]
    panel.set_title(f'{fieldname}')
    panel.hist(values[flag_off], color='C0', alpha=0.5)
    panel.twinx().hist(values[flag_on], color='C1', alpha=0.5)

# The last panel is left free by the loop above and holds the label counts.
label_panel = axs[-1][-1]
label_panel.set_title('Cover_Type')
label_values = list(range(1, 8))
counts_off = df['Cover_Type'][flag_off].value_counts().sort_index().reindex(label_values)
counts_off.plot.bar(ax=label_panel, rot=0, alpha=0.5, color='C0')
counts_on = df['Cover_Type'][flag_on].value_counts().sort_index().reindex(label_values)
counts_on.plot.bar(ax=label_panel.twinx(), rot=0, alpha=0.5, color='C1')

fig.suptitle(f'filtered by {filt}: blue=0, orange=1', fontsize=20)
plt.tight_layout()
plt.show()
plt.close('all')

In [None]:
import matplotlib.pyplot as plt

# Overlay, for each of the first 11 columns, one step histogram per value of
# the label column. Every class is drawn on its own twin y-axis, so the curves
# compare shapes rather than absolute counts.
filt = 'Cover_Type'
# Derive the classes from the data instead of hard-coding 1, 2, 4, 6, 7, which
# silently left class 3 out of the comparison.
classes = sorted(df[filt].unique())

fig, axs = plt.subplots(nrows=2, ncols=6, figsize=(24, 8))
legend_handles = []
for num, fieldname in zip(range(11), df):
    ser = df[fieldname]
    ax = axs[num // 6][num % 6]
    ax.set_title(f'{fieldname}')
    for order, cls in enumerate(classes):
        if order:
            # Every class after the first gets a fresh y-axis sharing the x-axis.
            ax = ax.twinx()
        _, _, patches = ax.hist(ser[df[filt] == cls], color=f'C{order}', histtype='step')
        if num == 0:
            # Colours repeat in every panel, so one set of handles is enough.
            legend_handles.append(patches[0])

# Without a legend the colours cannot be mapped back to classes.
fig.legend(legend_handles, [str(cls) for cls in classes], title=filt, loc='upper right')
fig.suptitle(f'filtered by {filt}')
plt.tight_layout()
plt.show()
plt.close('all')

# Categorical fields

In [None]:
# Number of Wilderness_Area flags set in each row.
# A .loc label slice includes BOTH endpoints, so 'Wilderness_Area4' is already
# part of the row sum; adding it again counted that flag twice.
wilderness_sum = df.loc[:, 'Wilderness_Area1':'Wilderness_Area4'].sum(axis=1)
wilderness_sum.value_counts().sort_index().plot.bar(rot=0)
plt.gca().set_title('Sum of Wilderness_Area\'s')
plt.show()
plt.close('all')

def _area_label(code):
    """Turn a decimal combination code into a comma-separated list of its non-zero digits."""
    return ','.join(str(code).replace('0', ''))


# Encode each row's set of wilderness flags as a decimal number: area i adds
# the digit i when its flag is 1 and the digit 0 otherwise.
wilderness_comb = df['Wilderness_Area1'].copy()
for area in (2, 3, 4):
    # In-place updates keep working on the copied Series object.
    wilderness_comb *= 10
    wilderness_comb += area * df[f'Wilderness_Area{area}']

vc = wilderness_comb.value_counts()
vc.index = vc.index.map(_area_label)
vc.sort_index().plot.bar(rot=0)
plt.gca().set_title('Combination of Wilderness_Area\'s')
plt.show()
plt.close('all')

# Number of Soil_Type flags set in each row.
# A .loc label slice includes BOTH endpoints, so 'Soil_Type40' is already part
# of the row sum; adding it again counted that flag twice.
soil_sum = df.loc[:, 'Soil_Type1':'Soil_Type40'].sum(axis=1)
soil_sum.value_counts().sort_index().plot.bar(rot=0)
plt.gca().set_title('Sum of Soil_Type\'s')
plt.show()
plt.close('all')

In [None]:
# Encode each row's set of soil flags as a number written in base p: whenever
# flag i is 1 the running code becomes code * p + i, otherwise it is unchanged.
p = 41  # prime number is used to treat combinations numerically
soil_comb = df['Soil_Type1'].copy()
for soil in range(2, 41):
    column = f'Soil_Type{soil}'
    if column in df:  # some soil columns are absent from df
        # code + (code * (p - 1) + soil) * flag == code * p + soil when flag is 1
        soil_comb += (soil_comb * (p - 1) + soil) * df[column]

# A combination value less than p ** k means the row has fewer than k
# different types of soil.
for k in (2, 3, 4):
    print(soil_comb[soil_comb < p**k].value_counts())

# Cumulative sum

In [None]:
import matplotlib.pyplot as plt

# One panel per column of df showing its running total in row order.
fig, axs = plt.subplots(nrows=7, ncols=8, figsize=(24, 24))
panels = axs.ravel()  # row-major view, so panel k is row k // 8, column k % 8
for num, fieldname in enumerate(df):
    panel = panels[num]
    panel.set_title(f'{fieldname}')
    running_total = df[fieldname].cumsum()
    panel.plot(running_total)
plt.tight_layout()
plt.show()
plt.close('all')

# Correlation coefficients

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlation matrix, once per correlation method.
df_small = df.iloc[::10]  # every 10th row, to cope with overflow issue
for method in ('pearson', 'kendall', 'spearman'):
    corr_matrix = df_small.corr(method=method)
    sns.heatmap(corr_matrix)
    plt.show()
    plt.close('all')