# Visualizations and Profiling Data

<font color='steelblue'>

<span style="font-family:verdana; font-size:1.6em;">
    <b>A couple of handy tools for exploring data with minimal code</b><br>
</span>
<span style="font-family:verdana; font-size:1.4em;">
    <b>Tools:</b><i>
    <ol>
        <li>AutoViz - to visualize the dataset</li>
        <li>Pandas Profiling - to profile the data</li>
        <li>Datasist - another visualization tool</li>
    </ol> </i>   
</span>

</font>

# Autoviz

<font color='tomato'>

<span style="font-family:verdana; font-size:1.6em;">
    <b>Install the following (instructions for Windows/macOS/Linux)</b><br>
 
<ol>
    <li>conda install -c anaconda py-xgboost </li>
    <li>pip install autoviz</li>
    </ol>
    </span>
    </font>

## Import required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases no longer accept the old names, so use whichever name this
# installation actually provides.
grid_style = 'seaborn-v0_8-whitegrid'
if grid_style not in plt.style.available:
    grid_style = 'seaborn-whitegrid'
plt.style.use(grid_style)    # grids in the plots
import warnings
# Hide every warning so library deprecation notices do not clutter the output
warnings.filterwarnings('ignore')

# Do not truncate columns in dataframe.
# None is the supported "no limit" value; negative widths such as -1 were
# deprecated in pandas 1.0 and are rejected by newer releases.
pd.set_option('display.max_colwidth', None)

## Load the dataset<br>
<font color='steelblue'>
<span style="font-family:verdana; font-size:1.4em;">
The dataset contains various information about telecom customers. The objective is to build a model that predicts customer churn - whether a customer will stay or leave.
</span>
</font>

In [None]:
# Path to the Telco customer churn CSV, relative to the notebook's working directory
filename = '../datasets/Telco-Customer-Churn.csv'
# Load the CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(filename)

In [None]:
# Lift both display limits (cell width and number of columns) so the preview
# below shows every column in full.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)
df.head()

In [None]:
# Blank strings stand in for missing TotalCharges values: turn them into NaN,
# then convert the column to a numeric dtype so it is handled as a number
# rather than as text. Any other non-numeric value raises, so bad data is
# surfaced instead of being silently dropped.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'].replace(" ", np.nan))

In [None]:
# Dimensions of the dataset as (rows, columns)
df.shape

In [None]:
# Count of missing (NaN) values in each column
df.isna().sum()

In [None]:
# Number of distinct non-null values in each column
df.nunique()

In [None]:
# Group the columns by how many distinct values each one holds.
target_col = ['Churn']
unique_counts = df.nunique()

# categorical columns: fewer than 6 distinct values, target excluded
cat_cols   = [col for col in unique_counts[unique_counts < 6].index if col not in target_col]
# numerical columns: everything that is neither categorical nor the target
num_cols   = [col for col in df.columns if col not in cat_cols and col not in target_col]
# binary columns: exactly 2 distinct values (the target is not filtered out here)
bin_cols   = unique_counts[unique_counts == 2].index.tolist()
# multiple values columns: categorical columns that are not binary
multi_cols = [col for col in cat_cols if col not in bin_cols]

In [None]:
# Report each column group, one line per group, in a fixed order.
for template, group in (
    ("categorical {}", cat_cols),
    ("continuous {}", num_cols),
    ("binary categorical {}", bin_cols),
    ("multi categorical {}", multi_cols),
):
    print(template.format(group))

## Instantiate the AutoViz class

In [None]:
# Import AutoViz's class and create the instance whose AutoViz() method
# is called in the plotting cells below
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()

In [None]:
# Settings passed to the AutoViz calls: the CSV field separator and the
# name of the target column.
sep, target = ',', 'Churn'

## Plot data without defining target variable

In [None]:
# verbose controls how much AutoViz reports:
#    if 0, does not print any messages and goes into silent mode. This is the default.
#    if 1, print messages on the terminal and also display charts on terminal.
#    if 2, print messages but will not display charts, it will simply save them.
# verbose=1 is used here; change it to 0 to suppress the messages.
#
# header is the row number of the header line. The column names are on the
# first row of the CSV, so it must be 0; header=1 would treat the first data
# record as the column names.
# No depVar is passed, so no target variable is defined for these plots.

dft = AV.AutoViz(filename, sep=sep, dfte=None, header=0, verbose=1,
                lowess=False,chart_format='svg',max_rows_analyzed=9000,max_cols_analyzed=60)

## Plot data with target variable

In [None]:
# Same AutoViz run, this time with depVar naming the target column.
# header is the row number of the header line. The column names are on the
# first row of the CSV, so it must be 0; with header=1 the first data record
# would become the column names and depVar would not match any column.
dft = AV.AutoViz(filename, sep=sep, depVar=target, dfte=None, header=0, verbose=1,
                lowess=False,chart_format='svg',max_rows_analyzed=9000,max_cols_analyzed=60)

# Pandas Profiling

<font color='tomato'>

<span style="font-family:verdana; font-size:1.6em;">
    <b>Install Pandas Profiling (Windows)</b><br>
 
<ol>
    <li>conda install -c conda-forge/label/cf202003 pandas-profiling </li>
    <li>pip install missingno </li>
</ol>
    missingno is used to inspect missing information in a dataframe (null, NA, np.nan)
</span>
</font>

In [None]:
# to render the report in the notebook:
# enable the widgets notebook extension (the leading '!' runs the command in a shell)
!jupyter nbextension enable --py widgetsnbextension

In [None]:
from pandas_profiling import ProfileReport

## Create a report

In [None]:
# Which correlation measures the report should calculate (True = calculate).
correlation_flags = {
    "cramers": True,
    "spearman": True,
    "kendall": False,
    "phi_k": False,
    "recoded": True,
    "pearson": True,
}

# Build the profiling report for the churn dataframe.
designReport = ProfileReport(
    df,
    title="Customer Churn",
    correlations={
        measure: {"calculate": enabled}
        for measure, enabled in correlation_flags.items()
    },
    samples={"head": 10, "tail": 5},
)

In [None]:
# Show the type of the object returned by ProfileReport
print(type(designReport))

In [None]:
# Write the report to 'report.html' (path is relative to the working directory);
# can be written to JSON also
designReport.to_file('report.html')

In [None]:
# Render the report inside the notebook as widgets
designReport.to_widgets()

## To view [Profile Report parameters](https://github.com/pandas-profiling/pandas-profiling/blob/master/src/pandas_profiling/config_default.yaml)

# Datasist

<font color='tomato'>

<span style="font-family:verdana; font-size:1.6em;">
    <b>Install Datasist (Windows)</b><br>
 
<ol>
    <li>pip install datasist</li>
</ol>
</span>
</font>

## To view [Datasist Documentation](https://risingodegua.gitbook.io/datasist-doc/)

## To view [Datasist Tutorial](https://risenw.github.io/datasist/classification_example.html) 

In [None]:
import datasist as ds  #import datasist library

## Numerical Features Visualization

In [None]:
# create box plot, passing 'Churn' as the target column
ds.visualizations.boxplot(df, target = 'Churn')

## Categorical Features Visualizations

In [None]:
# Plot the categorical features with 'Churn' as the target column.
# set save_fig to True so that each chart is printed as .png file:
#ds.visualizations.catbox(df, target = 'Churn', save_fig = True)
ds.visualizations.catbox(df, target = 'Churn', save_fig = False)

In [None]:
# Use Autoviz through datasist's visualizations module
ds.visualizations.autoviz(df)

<font color='tomato'>

<span style="font-family:verdana; font-size:1.6em;">
    <b>Download as HTML</b><br><br>
    <b>File --> Download as --> HTML (html)</b>
    </span>
</font>