In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Dimension reduction for EDA

One use of dimensionality reduction is the visualization of datasets with a high number of features.
This dataset deals with predicting the category of an eCommerce product given various attributes about the listing. There are four different classes to predict and 50 different features.
Dimensionality reduction can be useful to reduce the number of features while preserving the variance of the original dataset. By reducing the number of features you can also make plots easier to read and find the components that make it possible to differentiate between the classes.


In [None]:
# Load the competition's training and test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
    )
)

First let's count the number of times each class is present in the training data.

In [None]:
# Take the bar labels and the bar heights from the SAME value_counts() result.
# unique() returns classes in order of first appearance while value_counts()
# is sorted by descending frequency, so mixing the two can pair a class label
# with another class's count.
class_counts = train['target'].value_counts()

fig, ax = plt.subplots()
ax.bar(
    class_counts.index,
    class_counts.values,
    color=['red', 'blue', 'green', 'yellow'],
)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_title('Counting target class for training data');

In [None]:
# Number of rows per target class, most frequent first.
target_column = train['target']
target_column.value_counts()

Training dataset has 100000 rows and 50 features.
Class_2 is the most represented class in the dataset, while Class_3 is the least represented class.

Let's now drop 'id' column.

In [None]:
data = [train, test]

# Drop the identifier column from both frames.
# The drop stays in place so the existing `train` / `test` names keep pointing
# at the cleaned frames; errors='ignore' makes the cell safe to re-run
# (without it a second execution raises KeyError because 'id' is already gone).
for d in data:
    d.drop(columns='id', inplace=True, errors='ignore')

The features contain a lot of zero values. Let's count them.

In [None]:
# Percentage of zero entries per column of `train`.
# (train == 0).mean() gives the fraction of zeros directly, without building a
# masked copy of the frame. Scaling to percent BEFORE rounding avoids float
# artifacts such as 56.99999999999999 that appear when a rounded fraction is
# multiplied by 100 afterwards.
train_null_perc = (
    (train == 0)
    .mean()
    .mul(100)
    .round()
    .to_frame('Train_null_perc')
)

In [None]:
# Order columns from the highest to the lowest zero percentage and show the top five.
train_null_perc = train_null_perc.sort_values('Train_null_perc', ascending=False)
train_null_perc.iloc[:5]

95% of the values in feature_13 are 0, followed by 93% of the values in feature_2.
Let's plot all percentages.

In [None]:
# Bar chart of the zero-value percentage for every column of the training data.
zero_share = train_null_perc['Train_null_perc']

fig, ax = plt.subplots()
sns.barplot(x=zero_share.index, y=zero_share, dodge=False, ax=ax)
ax.set_title('Zero values for training data')
plt.xticks(rotation=90)  # column names are long; rotate them so they stay legible
sns.despine()

### LinearDiscriminantAnalysis for data visualization

In [None]:
# Name of the label column, and the list of all remaining (feature) columns.
y_col = 'target'
X_cols = [name for name in train.columns if name != 'target' and name != 'id']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import OneHotEncoder

I will apply Linear Discriminant Analysis to the first 3000 rows of the training dataset, so it will be faster.

In [None]:
# Keep only the first 3000 rows of the features and of the label.
first_rows = slice(None, 3000)
X = train.loc[:, X_cols].iloc[first_rows]
y = np.array(train[y_col].iloc[first_rows])

Before applying LDA we need to one-hot encode the variables. We can create a pipeline to apply OneHotEncoder and LinearDiscriminantAnalysis sequentially to the training data.

LDA does not work with sparse data, so we need to ask OneHotEncoder for a dense output (`sparse = False`, renamed `sparse_output = False` in recent scikit-learn versions).

In [None]:
# LinearDiscriminantAnalysis needs a dense array, so the encoder must not
# return a sparse matrix. The keyword for that was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4;
# try the new name first and fall back to the old one on older installations.
try:
    dense_ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False)

# One-hot encode every feature, then project onto the LDA components.
pipe = Pipeline([('ohe', dense_ohe), ('LDA', LinearDiscriminantAnalysis())])

In [None]:
# Fit the encoder + LDA pipeline on the subsample and project it in one call.
Xt = pipe.fit_transform(X, y=y)

In [None]:
# (rows, components) of the projected data.
np.shape(Xt)

In [None]:
# Preview only the first rows instead of dumping the whole projected array.
Xt[:5]

After applying LDA we get an array with 3 components that we can convert into a DataFrame for data visualization.

In [None]:
# Wrap the projected components in a DataFrame and attach the matching labels.
component_names = ['Component_1', 'Component_2', 'Component_3']
df = pd.DataFrame(Xt, columns=component_names)
df['target'] = train['target'].iloc[:3000]

In [None]:
# First five rows of the component table.
df.iloc[:5]

Now every class is associated with a different combination of the 3 components.

Let's now visualize components and classes!

In [None]:
# Pairwise scatter plots of the LDA components, coloured by target class.
class_palette = ['red', 'blue', 'green', 'yellow']
g = sns.PairGrid(data=df, hue='target', palette=class_palette).map(sns.scatterplot)
g.add_legend()

As you can see different components are useful to distinguish the classes.

Component_1 and Component_2 seem useful to distinguish between Class_2 and Class_3, Component_3 and Component_1 seem useful to distinguish Class_1, and Component_3 and Component_2 seem useful to distinguish Class_4.

LDA can be applied not only to visualize data but also to reduce the number of features before developing a model.