In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt
import seaborn as sns
from cycler import cycler

## Set Dark Mode for data exploration 

In [None]:
# Subinium's Code -https://www.kaggle.com/subinium/dark-mode-visualization-apple-version 

# Colour tables as 0-255 RGB triples, in a light and a dark variant.
# Each table is converted below to a float array scaled to the 0-1 range.

raw_light_palette = [
    (0, 122, 255), # Blue
    (255, 149, 0), # Orange
    (52, 199, 89), # Green
    (255, 59, 48), # Red
    (175, 82, 222),# Purple
    (255, 45, 85), # Pink
    (88, 86, 214), # Indigo
    (90, 200, 250),# Teal
    (255, 204, 0)  # Yellow
]

# NOTE: Indigo and Pink appear in the opposite order to the light palette.
raw_dark_palette = [
    (10, 132, 255), # Blue
    (255, 159, 10), # Orange
    (48, 209, 88),  # Green
    (255, 69, 58),  # Red
    (191, 90, 242), # Purple
    (94, 92, 230),  # Indigo
    (255, 55, 95),  # Pink
    (100, 210, 255),# Teal
    (255, 214, 10)  # Yellow
]

# Grays run from the mid gray (index 0) towards the lightest shade (index -1).
raw_gray_light_palette = [
    (142, 142, 147),# Gray
    (174, 174, 178),# Gray (2)
    (199, 199, 204),# Gray (3)
    (209, 209, 214),# Gray (4)
    (229, 229, 234),# Gray (5)
    (242, 242, 247),# Gray (6)
]

# Grays run from the mid gray (index 0) towards the darkest shade (index -1).
raw_gray_dark_palette = [
    (142, 142, 147),# Gray
    (99, 99, 102),  # Gray (2)
    (72, 72, 74),   # Gray (3)
    (58, 58, 60),   # Gray (4)
    (44, 44, 46),   # Gray (5)
    (28, 28, 30),   # Gray (6) - was (28, 28, 39), a typo that gave this shade a blue tint
]


# Scale every table to the 0-1 float range.
light_palette = np.array(raw_light_palette)/255
dark_palette = np.array(raw_dark_palette)/255
gray_light_palette = np.array(raw_gray_light_palette)/255
gray_dark_palette = np.array(raw_gray_dark_palette)/255

#print('Light mode palette')
#sns.palplot(light_palette)
#sns.palplot(gray_light_palette)

#print('Dark mode palette')
#sns.palplot(dark_palette)
#sns.palplot(gray_dark_palette)

In [None]:

# Dark theme: the second-darkest gray is the canvas, the dark palette colours
# the plotted series, and a near-white gray is used for all text and ticks.
dark_background = gray_dark_palette[-2]
white_color = gray_light_palette[-2]

mpl.rcParams['axes.prop_cycle'] = cycler('color', dark_palette)
for canvas_key in ('figure.facecolor', 'figure.edgecolor', 'axes.facecolor'):
    mpl.rcParams[canvas_key] = dark_background

for ink_key in ('text.color', 'axes.labelcolor', 'axes.edgecolor',
                'xtick.color', 'ytick.color'):
    mpl.rcParams[ink_key] = white_color

mpl.rcParams['figure.dpi'] = 150

# Hide the top and right spines by default.
for spine_key in ('axes.spines.top', 'axes.spines.right'):
    mpl.rcParams[spine_key] = False

## Read the training file and describe the fields based on the standard deviation of values

In [None]:
# Both competition files live in the same input directory. Build the two paths
# from one constant so the train and test loads cannot drift apart (the two
# lines previously spelled the same directory differently: one relative, one
# absolute).
DATA_DIR = "../input/tabular-playground-series-nov-2021"

df = pd.read_csv(f"{DATA_DIR}/train.csv")    # training rows, including the 'target' column
test = pd.read_csv(f"{DATA_DIR}/test.csv")   # rows that are scored for the submission

In [None]:

# Summary statistics for f0..f99, one row per feature, widest spread first.
feature_summary = (
    df.loc[:, 'f0':'f99']
    .describe()
    .T
    .sort_values(by='std', ascending=False)
)

# One colour map per statistic column, applied in this order.
gradient_by_column = [
    ('std', 'Greens'),
    ('25%', 'Spectral'),
    ('50%', 'seismic'),
    ('75%', 'viridis'),
    ('mean', 'cubehelix'),
    ('min', 'Reds'),
    ('max', 'Blues'),
]

styled_summary = feature_summary.style.bar(subset=['mean'], color='#205ff2')
for stat_column, cmap_name in gradient_by_column:
    styled_summary = styled_summary.background_gradient(subset=[stat_column], cmap=cmap_name)

# Last expression of the cell: rendered as the styled table.
styled_summary

## Top 5 features with the largest standard deviations

In [None]:
# Rank the f0..f99 summary rows by standard deviation and show the first five.
std_ranked = df.loc[:, 'f0':'f99'].describe().T.sort_values(by='std', ascending=False)
std_ranked.head(5)

In [None]:
## Some analysis

In [None]:
# Twenty hex colours used as the notebook's custom palette.
custom_colors = ["#003399","#00A1F1","#00A300","#00F700","#292F33","#2B5797","#2D89EF","#3B5998","#7B0099","#9F00A7","#B91D47","#C63678","#E50914","#EE1111","#F5FF00","#F70000","#FF0097","#FF9900","#FFC40D","#FFCC00"]

# sns.set_palette() only installs the palette and returns None, so assigning
# its result left customPalette as None. Build the palette object first, keep
# it under the name, then install it as the default.
customPalette = sns.color_palette(custom_colors)
sns.set_palette(customPalette)

# Preview the swatches without tick marks or tick labels.
sns.palplot(customPalette, size=1.2)
plt.tick_params(axis='both', labelsize=0, length=0)


## Let us try a Correlation Chart

In [None]:
# Lower-triangle heatmap of the pairwise correlations between f0..f99.
fig, ax = plt.subplots(figsize=(20, 20))

corr = df.loc[:, 'f0':'f99'].corr()

# Hide the upper triangle and the diagonal, which only repeat the lower
# triangle. The builtin `bool` is used because the `np.bool` alias was
# deprecated and then removed from NumPy, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# `linewidths` is heatmap's documented keyword for the cell-separator width.
sns.heatmap(corr,
            cmap=custom_colors, robust=True, center=0, square=True, linewidths=0.2,
            mask=mask, ax=ax)

ax.set_title('Feature Correlation', loc='left', fontweight='bold')
plt.show()

## Let us try histograms of the features

In [None]:
# Feature column names 'f0' through 'f99'.
cols = [f'f{index}' for index in range(100)]

In [None]:
# Overlay the train and test histograms of every feature on a 25 x 4 grid.
# The grid is created once by plt.subplots() and each panel is handed to
# seaborn explicitly. The earlier version also called plt.figure() first,
# which left an extra empty figure behind, and tracked the panel with a manual
# counter through plt.subplot().
fig, axes = plt.subplots(25, 4, figsize=(16, 50))
for feature, ax in zip(cols[:100], axes.flat):
    sns.histplot(df[feature], color="violet", kde=True, bins=100, label='train', ax=ax)
    sns.histplot(test[feature], color="green", kde=True, bins=100, label='test', ax=ax)
    ax.set_xlabel(feature, fontsize=6)

# As before, only the final panel carries the train/test legend.
ax.legend()
plt.show()

# November Tabular Playground Series - 2021

## An Automated EDA using Dataprep and other tools

### TPS-Nov-2021 is an exercise in detecting whether a message is spam, based on features generated by a GAN

#### We shall use DataPrep for the automated analysis and LightAutoML (LAMA) for AutoML generation

<table>
<tr>
    <td>
        <img src="https://github.com/sfu-db/dataprep/raw/develop/assets/logo.png" height=400, width=400>
        <h2>DataPrep lets you prepare your data using a single library with a few lines of code.</h2>
        <ol><li>Collect data from common data sources (through dataprep.connector)</li>
            <li>Do your exploratory data analysis (through dataprep.eda)</li>
            <li>Clean and standardize data (through dataprep.clean)</li>
        </ol>
    </td>
    <td>
        <img src="https://github.com/sberbank-ai-lab/LightAutoML/raw/master/imgs/LightAutoML_logo_big.png" height=400, width=400>
        <h2>LightAutoML - automatic model creation framework</h2> <br>
        <ol> LightAutoML (LAMA) - project from Sberbank AI Lab AutoML group is the framework for automatic classification and regression model creation.
            <li>binary classification</li>
            <li>multiclass classification</li>
            <li>regression</li>
        </ol>
    </td>
</tr>
    <tr>
    <td>
        <img src="https://github.com/lmcinnes/umap/raw/master/doc/logo_large.png" height=400, width=400>
        <h2>Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction</h2>
        <p>The algorithm is founded on three assumptions about the data:</p>
        <ol>
            <li>The data is uniformly distributed on a Riemannian manifold;</li>
            <li>The Riemannian metric is locally constant (or can be approximated as such);</li>
            <li>The manifold is locally connected.</li>
        </ol>
    </td>
    <td>
        <img src="https://camo.githubusercontent.com/8a45c0936d6113b12b7b32942f448270eda8f714665ba8629f36c291f0ccd5fd/68747470733a2f2f70616e6461732d70726f66696c696e672e6769746875622e696f2f70616e6461732d70726f66696c696e672f646f63732f6173736574732f6c6f676f5f6865616465722e706e67" height=400, width=400>
        <h2>Pandas Profiling</h2> <br>
        <ol>Generates profile reports from a pandas DataFrame. The pandas df.describe() function is great but a little basic for serious exploratory data analysis. pandas_profiling extends the pandas DataFrame with df.profile_report() for quick data analysis.
            <li>Type inference: detect the types of columns in a dataframe.
            <ol>
                <li>Essentials: type, unique values, missing values</li>
 <li>Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range </li>
 <li>Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness </li>
 <li>Most frequent values</li>
 <li>Histogram</li>
 <li>Correlations highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices</li>
 <li>Missing values matrix, count, heatmap and dendrogram of missing values</li>
 <li>Text analysis learn about categories (Uppercase, Space), scripts (Latin, Cyrillic) and blocks (ASCII) of text data.</li>
              </ol>
 <li>File and Image analysis extract file sizes, creation dates and dimensions and scan for truncated images or those containing EXIF information.</li>
        </ol>
    </td>
</tr>
</table>


# DataPrep EDA Analysis

In [None]:
# Run me if you'd like to install dataprep
try:
    from dataprep.eda import plot, plot_correlation, plot_missing, create_report
except:
    !pip install dataprep
    from dataprep.eda import plot, plot_correlation, plot_missing, create_report
import pandas as pd


In [None]:
# Whole-frame dataprep plot: no column is named, so the call covers `df` as a whole.
plot(df)

## Plot the variables with high Standard Deviation

In [None]:
# Single-column dataprep plot for f2, the first of the five features shown in this section.
plot(df, "f2")



In [None]:
# Single-column dataprep plot for f35.
plot(df, "f35")


In [None]:
# Single-column dataprep plot for f36.
plot(df, "f36")


In [None]:
# Single-column dataprep plot for f44.
plot(df, "f44")


In [None]:
# Single-column dataprep plot for f84.
plot(df, "f84")

In [None]:
# Pairwise dataprep correlation plot: f2 against f35.
plot_correlation(df,"f2", "f35")


In [None]:
# Pairwise dataprep correlation plot: f2 against f36.
plot_correlation(df, "f2", "f36")


In [None]:
# Pairwise dataprep correlation plot: f2 against f44.
plot_correlation(df, "f2", "f44")


In [None]:
# Pairwise dataprep correlation plot: f2 against f84.
plot_correlation(df, "f2", "f84")


In [None]:
# Pairwise dataprep correlation plot: f35 against f36.
plot_correlation(df, "f35", "f36")


In [None]:
# Pairwise dataprep correlation plot: f35 against f44.
plot_correlation(df, "f35", "f44")


In [None]:
# Pairwise dataprep correlation plot: f35 against f84.
plot_correlation(df, "f35", "f84")


In [None]:
# Pairwise dataprep correlation plot: f36 against f44.
plot_correlation(df, "f36", "f44")


In [None]:
# Pairwise dataprep correlation plot: f36 against f84.
plot_correlation(df, "f36", "f84")


In [None]:
# Pairwise dataprep correlation plot: f44 against f84 (the last of the ten pairs).
plot_correlation(df, "f44", "f84")

In [None]:
# Correlation overview computed on complete rows only. The filtered frame gets
# its own name so that `df` still holds the unfiltered data for later cells,
# in particular the plot_missing(df) step: it would have nothing to report if
# the rows with missing values had already been dropped from `df` itself.
df_complete = df.dropna()
plot_correlation(df_complete)

In [None]:
## Plot correlation for the variables with high Standard Deviation

## Show the location and percentage of missing data

In [None]:
# dataprep missing-value plot for whatever `df` holds at this point in the notebook.
plot_missing(df)

## Create a report using dataprep.eda


In [None]:


# Build the dataprep report for `df`; as the cell's last expression it is what gets displayed.
create_report(df)


# Analysis using Autoviz

## Load

In [None]:
try:
    from autoviz.AutoViz_Class import AutoViz_Class
except:
    !pip install sweetviz autoviz xlrd
    from autoviz.AutoViz_Class import AutoViz_Class

## Analyze

In [None]:
# Run AutoViz directly on the training CSV, with 'target' as the dependent variable.
train_file ="../input/tabular-playground-series-nov-2021/train.csv"

AV = AutoViz_Class()
# NOTE(review): this rebinds `df` to whatever AutoViz returns, so every later
# cell that reads `df` works on that object rather than on a frame loaded with
# pd.read_csv. With max_rows_analyzed = 150000 the returned frame may be a
# sample instead of the full training set - TODO confirm against the AutoViz docs.
df = AV.AutoViz(train_file, depVar='target',verbose = 1, 
                lowess = False, chart_format ='png', 
                max_rows_analyzed = 150000)

# Pandas Profiling

In [None]:
%%time
# Importing pandas_profiling is what provides DataFrame.profile_report()
# (per the package description earlier in this notebook).
import pandas_profiling
# Profile whatever `df` holds at this point; the enclosing cell is timed with %%time.
df.profile_report()

In [None]:
# Same profiling package, this time building a ProfileReport explicitly and
# rendering it through to_widgets().
from pandas_profiling import ProfileReport
ProfileReport(df).to_widgets()

# Dimension Reduction with Sampling

##### Thanks to https://www.kaggle.com/subinium/tps-may-categorical-eda 

1. UMAP gives us a meaningful view of the clusters in the data through dimension reduction.
1. Because the dataset is huge, we work on a random sample with a limited number of records.
1. For this example we take 15,000 records.

In [None]:
from umap import UMAP

# Work on a fixed 15,000-row sample (seeded, so the same rows are drawn each run).
train_sub = df.sample(15000, random_state=0)
target = train_sub['target']

# Select the inputs by name rather than by position. `iloc[:, :-1]` only
# excluded the label if 'target' happened to be the last column, and it fed the
# 'id' column in as a feature even though the modelling step drops 'id'.
# errors='ignore' keeps this working when 'id' is not present in the frame.
features = train_sub.drop(columns=['target', 'id'], errors='ignore')

umap = UMAP(random_state=0)
# The labels are passed as the second argument, as in the original call.
dr = umap.fit_transform(features, target)


In [None]:
# Scatter the 2-D embedding `dr`: one large panel with every class, plus one
# small panel per class in which that class is highlighted against the rest.
#
# The panels are derived from the labels actually present in `target`. The
# earlier version hard-coded four classes and titled them Class_1..Class_4,
# which yields empty or mislabelled panels whenever the target does not have
# exactly four classes coded 0..3 (the modelling step declares a binary task).
classes = np.sort(target.unique())
n_classes = len(classes)

fig = plt.figure(figsize=(12, 12))
gs = fig.add_gridspec(5, n_classes)
ax = fig.add_subplot(gs[:-1, :])                                   # top four rows: combined view
sub_axes = [fig.add_subplot(gs[-1, col]) for col in range(n_classes)]  # bottom row: one panel per class

for idx, cls in enumerate(classes):
    # Plain boolean array so the embedding is indexed by position, not by label.
    in_class = (target == cls).to_numpy()
    xs, ys = dr[in_class, 0], dr[in_class, 1]

    # Combined view: colours come from the default property cycle.
    ax.scatter(x=xs, y=ys, s=10, alpha=0.2)

    # Per-class panels: this class is drawn in colour and on top in its own
    # panel, and as a faint near-white background in every other panel.
    for j, sub_ax in enumerate(sub_axes):
        highlighted = idx == j
        sub_ax.scatter(x=xs, y=ys, s=10,
                       alpha=0.4 if highlighted else 0.008,
                       color=dark_palette[j % len(dark_palette)] if highlighted else white_color,
                       zorder=int(highlighted))

    panel = sub_axes[idx]
    panel.set_xticks([])
    panel.set_yticks([])
    panel.set_xlabel('')
    panel.set_ylabel('')
    panel.set_title(f'target = {cls}')
    # Small panels get a full frame (top/right spines are off by default here).
    panel.spines['right'].set_visible(True)
    panel.spines['top'].set_visible(True)

ax.set_title('Dimension Reduction using (UMAP)', fontweight='bold', fontfamily='serif', fontsize=20, loc='left')

# The combined view is shown without ticks, labels or frame.
ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel('')
ax.set_ylabel('')
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)

fig.tight_layout()
plt.show()

# AutoML Generation through LAMA Framework

In [None]:
try:
    from lightautoml.automl.presets.tabular_presets import TabularAutoML
    from lightautoml.tasks import Task
except:
    !pip install -U lightautoml
    from lightautoml.automl.presets.tabular_presets import TabularAutoML
    from lightautoml.tasks import Task

In [None]:
# Load the rows to score and the submission template, then show all three shapes.
test_csv = '../input/tabular-playground-series-nov-2021/test.csv'
submission_csv = '../input/tabular-playground-series-nov-2021/sample_submission.csv'

test = pd.read_csv(test_csv)
sub = pd.read_csv(submission_csv)

tuple(frame.shape for frame in (df, test, sub))

In [None]:
# Binary-classification AutoML run limited to the 'cb' algorithm
# (presumably CatBoost - confirm against the LightAutoML docs).
automl_timeout = 8 * 3600
column_roles = {'target': 'target', 'drop': ['id']}   # label column, and 'id' excluded from the features

task = Task('binary')
automl = TabularAutoML(
    task=task,
    timeout=automl_timeout,
    cpu_limit=4,
    general_params={'use_algos': [['cb']]},
    selection_params={'mode': 0},
)

# fit_predict's result on the training frame is kept as oof_pred.
oof_pred = automl.fit_predict(df, roles=column_roles, verbose=3)

# The first column of the test predictions fills the submission's 'target'.
test_pred = automl.predict(test)
sub['target'] = test_pred.data[:, 0]

In [None]:
# Write the filled-in submission frame to submission.csv, without the index column.
sub.to_csv('submission.csv', index = False)