### TPS-12-21 data visualization for beginners

In [None]:
import pandas as pd  # data analysis library
import numpy as np  # comprehensive mathematical functions, random number generators, linear algebra routines, Fourier transforms, and more
import matplotlib.pyplot as plt  # provides an implicit way of plotting
import seaborn as sns  # for visualization
from tqdm import tqdm  # progressbar decorator for iterators
import os  # for operating system

import warnings  # standard-library warning control
# Silence every warning for the rest of the session so cell output stays clean.
# Note: this also hides deprecation notices raised by the plotting/data libraries.
warnings.filterwarnings("ignore")

In [None]:
# Read the training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-dec-2021/train.csv',
        '../input/tabular-playground-series-dec-2021/test.csv',
    )
)

In [None]:
# Show the column names of both tables (train first, then test).
train_column_names = train.columns.tolist()
test_column_names = test.columns.tolist()
print("\n Name of the columns train data \n ", train_column_names, '\n')
print('\n Name of the columns test data \n', test_column_names, '\n')

In [None]:
# Column counts of both tables; shape[1] is the number of columns.
print('total number of columns train data', train.shape[1])
print('total number of columns test data', test.shape[1])

In [None]:
# Preview the first five rows of the training table (head() defaults to 5 rows).
train.head()

In [None]:
# Print a summary of the training table: column names, non-null counts, dtypes.
train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
train.describe()

In [None]:
# Per-column histograms split by target class.
# One panel per column of `train`; every panel overlays the distribution of that
# column for each Cover_Type value so class-dependent shifts become visible.
cols = train.columns.tolist()
PANELS_PER_ROW = 2
# Ceiling division: derive the grid height from the data instead of hard-coding it.
n_panel_rows = max(1, -(-len(cols) // PANELS_PER_ROW))

# Colour used for each Cover_Type value.
class_colors = {1: 'black', 2: 'C1', 3: 'C2', 4: 'C3', 5: 'C4', 6: 'C5', 7: 'C6'}
# Build each boolean row mask once and apply it to a single column at a time,
# rather than re-filtering the whole frame for every (column, class) pair.
class_masks = {cover_type: train.Cover_Type == cover_type for cover_type in class_colors}

fig, ax = plt.subplots(n_panel_rows, PANELS_PER_ROW, figsize=(16, 80), squeeze=False)
for i, panel in enumerate(tqdm(ax.ravel())):
    if i >= len(cols):
        # Odd number of columns: the last grid cell has nothing to show.
        panel.axis("off")
        continue
    column_values = train[cols[i]]
    for cover_type, color in class_colors.items():
        sns.histplot(
            column_values[class_masks[cover_type]],
            label=cols[i] + ' Cover_Type=' + str(cover_type),
            ax=panel,
            color=color,
            bins=20,
        )
    panel.legend()
    panel.grid()
plt.show()

In [None]:
# Number of distinct Id values in the training table.
train['Id'].unique().size

In [None]:
# Line plot of the first 50 training rows, one line per column.
df1 = train.iloc[:50]
plt.figure(figsize=(30, 30))

sns.lineplot(data=df1)

In [None]:
# train.value_counts()  # Counting the number of values

In [None]:
# For every distinct Elevation value, count the non-null entries in each other column.
train.groupby('Elevation').count()

In [None]:
# Check whether there are null values: number of missing entries per column.
train.isna().sum()

In [None]:
# Histogram of every training column on a single 18x18-inch figure.
train.hist(figsize=(18, 18))
plt.show()

In [None]:
# Feature columns are everything in `test` except the row identifier.
# `('Id')` is just the string 'Id' (no tuple), so `not in ('Id')` was a substring
# test; compare against the column name explicitly instead.
cols = [e for e in test.columns if e != 'Id']
continous_features = cols[:10]    # first ten feature columns, treated as continuous
categorical_features = cols[10:]  # all remaining feature columns

# Plot continuous features: train vs. test distribution, one panel per feature.
# The axes returned by plt.subplots are used directly, so no extra empty figure
# is created and no pyplot "current axes" state is involved.
fig, ax = plt.subplots(2, 5, figsize=(20, 12))
for panel, feature in zip(ax.ravel(), continous_features):
    sns.histplot(train[feature], color="blue", kde=True, bins=100, label='train_'+feature, ax=panel)
    sns.histplot(test[feature], color="olive", kde=True, bins=100, label='test_'+feature, ax=panel)
    panel.set_xlabel(feature, fontsize=9)
    panel.legend()
plt.show()

In [None]:
# Preview the first five rows of the test table (head() defaults to 5 rows).
test.head()

In [None]:
# Number of distinct Id values in the test table.
test['Id'].unique().size

In [None]:
# Print a summary of the test table: column names, non-null counts, dtypes.
test.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
test.describe()

In [None]:
# Histogram of every test column on a single 18x18-inch figure.
test.hist(figsize=(18, 18))
plt.show()

In [None]:
# Line plot of the first 50 test rows, one line per column.
df2 = test.iloc[:50]
plt.figure(figsize=(30, 30))

sns.lineplot(data=df2)

The only structural difference between the two data frames is that the train data frame has one more column than the test data frame: the target, `Cover_Type`.

In [None]:
# Check whether there are null values: number of missing entries per column.
test.isna().sum()

## Target distribution

In [None]:
# Bar chart of how many training rows fall into each Cover_Type class.
sns.catplot(data=train, x="Cover_Type", kind="count", palette="ch:.25")

In [None]:
# Exact number of rows per Cover_Type class, most frequent first.
train['Cover_Type'].value_counts()

In [None]:
# Class counts again, drawn on an explicit Axes with classes in ascending order.
cover_type_order = sorted(train['Cover_Type'].unique())
fig, ax = plt.subplots()
sns.countplot(data=train, x='Cover_Type', order=cover_type_order, ax=ax)
ax.set_ylim(0, 2563000)  # fixed y-axis upper limit
ax.set_title('Cover_Type Distribution', weight='bold')
plt.show()

Here we see a significant imbalance in the target values. In particular, the value 5 occurs in only one case. All this makes learning very difficult.

### Features correlation

In [None]:
# Pairwise correlation between the continuous features and the target.
corr = train[continous_features+['Cover_Type']].corr()
# Styler.set_precision is deprecated and no longer exists in pandas 2.x;
# Styler.format(precision=...) is its documented replacement.
corr.style.background_gradient(cmap='coolwarm').format(precision=3)

In [None]:
# Heatmap of the columns most positively correlated with the target.
corr = train.corr()
v = 10  # number of columns to keep (Cover_Type itself is among them, with correlation 1)
colmn = corr.nlargest(v, 'Cover_Type')['Cover_Type'].index
# Slice the correlation matrix computed above instead of running np.corrcoef
# over the full dataset a second time.
xm = corr.loc[colmn, colmn].values
sns.set(font_scale=1.25)  # note: changes the seaborn theme for all later plots too
plt.figure(figsize=(18, 18))
hm = sns.heatmap(
    xm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=colmn.values,
    xticklabels=colmn.values,
)
plt.show()

In [None]:
# Summary statistics of every column except the last one, one row per column,
# ordered by decreasing standard deviation and styled with a colour gradient
# plus in-cell bars for "max" (green) and "mean" (blue).
feature_stats = train.iloc[:, :-1].describe().T
feature_stats = feature_stats.sort_values(by='std', ascending=False)
(
    feature_stats.style
    .background_gradient()
    .bar(subset=["max"], color='green')
    .bar(subset=["mean"], color='blue')
)

In [None]:
# Report the column counts, remove the identifier column from both tables,
# then report the counts again.
print('\n Before \n')
print('total number of columns train data', len(train.columns.tolist()))  # total number of columns
print('total number of columns test data', len(test.columns.tolist()))  # total number of columns

# Since Id does not carry any information, we delete this column in both data sets.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and the drop becomes a no-op instead of raising KeyError.
train.drop(columns=["Id"], inplace=True, errors="ignore")
test.drop(columns=["Id"], inplace=True, errors="ignore")

print('\n After \n')
print('total number of columns train data', len(train.columns.tolist()))  # total number of columns
print('total number of columns test data', len(test.columns.tolist()))  # total number of columns

In [None]:

TARGET = 'Cover_Type'
# Columns that are never features. The identifier column is named 'Id' in this
# dataset; the lowercase spelling is kept as well for backward compatibility.
NON_FEATURE_COLUMNS = ('Id', 'id', TARGET)
FEATURES = [col for col in train.columns if col not in NON_FEATURE_COLUMNS]
RANDOM_STATE = 2021

# A feature counts as categorical when train and test together contain fewer
# than 25 distinct values for it. The count is taken one column at a time so the
# full train+test frame never has to be materialised.
unique_counts = {
    col: pd.concat([train[col], test[col]], axis=0).nunique()
    for col in FEATURES
}
cat_features = [col for col in FEATURES if unique_counts[col] < 25]
cont_features = [col for col in FEATURES if unique_counts[col] >= 25]

# Share of categorical vs. continuous features.
plt.pie([len(cat_features), len(cont_features)],labels=['Categorical', 'Continuos'],autopct='%1.1f%%')
plt.show()

In [None]:
# Train vs. test density of every continuous feature, five panels per row.
ncols = 5
# Ceiling division on the number of continuous features (at least one row), so
# the grid always has room for every feature regardless of how many there are.
nrows = max(1, -(-len(cont_features) // ncols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows, ncols, figsize=(18, 4 * nrows), facecolor='#EAEAF2', squeeze=False)

for idx, axis in enumerate(axes.ravel()):
    if idx >= len(cont_features):
        # More grid cells than features: hide the unused panel.
        axis.axis('off')
        continue
    col = cont_features[idx]
    sns.kdeplot(x=train[col], ax=axis, label='Train data')
    sns.kdeplot(x=test[col], ax=axis, label='Test data')
    axis.set_ylabel('')
    axis.set_xlabel(col, fontsize=8, fontweight='bold')
    axis.tick_params(labelsize=5, width=0.5)
    axis.xaxis.offsetText.set_fontsize(4)
    axis.yaxis.offsetText.set_fontsize(4)
    # The curves are labelled above; draw the legend so they can be told apart.
    axis.legend(fontsize=6)
plt.show()