<img src="https://docs.microsoft.com/en-us/azure/machine-learning/team-data-science-process/media/lifecycle/tdsp-lifecycle2.png" width="700" height="500" 
     alt="Markdown Monster icon"
     style="center" /> 

<font size="6" color='darkred'>**Explore the data**</font>

In [1]:
#!pip -V
#!python --version
#!conda list -n visualization
#import platform,sys
#print(platform.system())
#print(sys.executable)
##!pip install chart_studio
##!conda install -c conda-forge statsmodels -y

In [1]:
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import matplotlib
import scipy.stats as stats
from collections import OrderedDict
from statsmodels.graphics.mosaicplot import mosaic


import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)


pd.options.display.max_rows = 30
pd.options.display.max_columns = 25


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import os
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

<font size="6" color='darkblue'>*1- Read and Summarize the Data*</font>

- <font size="4.5" color='black'>**Read data and infer column types**</font>

In [2]:
df = pd.read_csv("adult-income.csv")
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'educationnum',
       'maritalstatus', 'occupation', 'relationship', 'race', 'sex',
       'capitalgain', 'capitalloss', 'hoursperweek', 'nativecountry',
       'label_IsOver50K']
cols = df.columns
num_cols = df._get_numeric_data().columns.tolist()
cat_cols=list(set(cols) - set(num_cols))

Numeric = ', '.join(each for each in list(num_cols))
print("[The Numeric column names are]:  " + Numeric)
print("\n")
Catagorical = ', '.join(each for each in list(cat_cols))
print("[The Catagorical column names are]:  " + Catagorical)
print("\n")
print ('The data has {} Rows and {} columns'.format(df.shape[0],df.shape[1]))


[The Numeric column names are]:  age, fnlwgt, educationnum, capitalgain, capitalloss, hoursperweek, label_IsOver50K


[The Catagorical column names are]:  race, relationship, education, occupation, workclass, nativecountry, maritalstatus, sex


The data has 32561 Rows and 15 columns



- <font size="4.5" color='black'>**Print the first n (n=5 by default) rows of the data**</font>

In [3]:
@interact
def show_head(n=widgets.IntSlider(min=1, max=20, step=1, value=5)):
    display(HTML(f'<h5>Print the first  {n}  rows of the data'))
    display(df.head(n))

interactive(children=(IntSlider(value=5, description='n', max=20, min=1), Output()), _dom_classes=('widget-int…



- <font size="4.5" color='black'>**Print the column types**</font>

In [4]:
print("The types of columns are:")
df.dtypes

The types of columns are:


age                 int64
workclass          object
fnlwgt              int64
education          object
educationnum        int64
maritalstatus      object
occupation         object
relationship       object
race               object
sex                object
capitalgain         int64
capitalloss         int64
hoursperweek        int64
nativecountry      object
label_IsOver50K     int64
dtype: object


<font size="6" color='darkblue'>*2- Extract Descriptive Statistics of Each Column*</font>

In [5]:
def num_missing(x):
    return len(x.index)-x.count()

def num_unique(x):
    return len(np.unique(x))

temp_df = df.describe().T
missing_df = pd.DataFrame(df.apply(num_missing, axis=0)) 
missing_df.columns = ['missing']
unq_df = pd.DataFrame(df.apply(num_unique, axis=0))
unq_df.columns = ['unique']
types_df = pd.DataFrame(df.dtypes)
types_df.columns = ['DataType']



- <font size="4.5" color='black'>**Print the descriptive statistics of numerical columns**</font>

In [6]:
summary_df = temp_df.join(missing_df).join(unq_df).join(types_df)
summary_df

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing,unique,DataType
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0,0,73,int64
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0,0,21648,int64
educationnum,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0,0,16,int64
capitalgain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0,0,119,int64
capitalloss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0,0,92,int64
hoursperweek,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0,0,94,int64
label_IsOver50K,32561.0,0.24081,0.427581,0.0,0.0,0.0,0.0,1.0,0,2,int64



- <font size="4.5" color='black'>**Print the descriptive statistics of categorical columns**</font>

In [7]:
col_names = list(types_df.index) #Get all col names
num_cols = len(col_names)
index = range(num_cols)
cat_index = []
for i in index: #Find the indices of columns in Categorical columns
    if col_names[i] in df[cat_cols]:
        cat_index.append(i)
summary_df_cat = missing_df.join(unq_df).join(types_df.iloc[cat_index], how='inner') #Only summarize categorical columns
summary_df_cat

Unnamed: 0,missing,unique,DataType
workclass,0,9,object
education,0,16,object
maritalstatus,0,7,object
occupation,0,15,object
relationship,0,6,object
race,0,5,object
sex,0,2,object
nativecountry,0,42,object


<font size="6" color='darkblue'>*3- Fill missing and test outliers*</font>

In [8]:
# there is no duplication in dataset
print ('{} rows detected as duplicated rows'.format(df.duplicated().sum()))

24 rows detected as duplicated rows


In [9]:
#fill numeric mission values with 0
df[df._get_numeric_data().columns.tolist()]=df[df._get_numeric_data().columns.tolist()].fillna(0)
#fill rest of colunms which should be categorical with unknown
df=df.fillna("Unknown")
print (df.isnull().values.any())

False


There are many defernt way to detect outliers and anomaly , we could use Microsoft Anomaly detction API for time series which based on deeplearnin algorithm SR-CNN  , ref: https://arxiv.org/pdf/1906.03821.pdf

But there are other simple open source methods that are good in different scenarios:

<img src="https://scikit-learn.org/stable/_images/sphx_glr_plot_anomaly_comparison_0011.png" width="500" height="300" 
     alt="Markdown Monster icon"
     style="center" />


In [10]:
from sklearn.ensemble import IsolationForest

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#df.select_dtypes(include=numerics).shape
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(df[df._get_numeric_data().columns.tolist()].select_dtypes(include=numerics))
# select all rows that are not outliers
mask = yhat != -1
df_no_outliers=df[mask]
print ('{} rows detected as outliers but we kept them for visualization'.format(df.shape[0]-df_no_outliers.shape[0]))

3256 rows detected as outliers but we kept them for visualization


<font size="6" color='darkblue'>*4- Explore Individual Variables*</font>

- <font size="4.5" color='darkred'>Important (Find Numeric and Categorical colunms)</font>

- <font size="4.5" color='darkred'>Important (for big data you could took sample of 10000)</font>

In [11]:
# change numeric and categorical and target colunms

df=df.sample(10000).copy()
target="label_IsOver50K"
cols = df.columns
num_cols = df._get_numeric_data().columns.tolist()
#num_cols.append('')
if target in num_cols:
    num_cols.remove(target)
cat_cols=list(set(cols) - set(num_cols))

In [12]:
print("The Target is:  " + str([target]))
print("\n")
print("The Numerical are:  " + str(num_cols))
print("\n")
print("The Categorical are:  " + str(cat_cols))

The Target is:  ['label_IsOver50K']


The Numerical are:  ['age', 'fnlwgt', 'educationnum', 'capitalgain', 'capitalloss', 'hoursperweek']


The Categorical are:  ['race', 'relationship', 'education', 'occupation', 'workclass', 'nativecountry', 'label_IsOver50K', 'maritalstatus', 'sex']


- <font size="4.5" color='black'>**Explore the target variable**</font>

In [13]:
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15.0, 6.0)
CSS = """
.output {
    align-items: center;
}
"""
HTML('<style>{}</style>'.format(CSS))
@interact
def plot(Target=target):# categorical univariate plot
    
    display(HTML(f'<h4>Bar plot and Pie chart of {target}'))
    fig, axs = plt.subplots(1,2)
    df[target].value_counts().plot.pie(ax=axs[1])
    sns.countplot(y=Target,data=df,ax=axs[0]);

interactive(children=(Text(value='label_IsOver50K', description='Target'), Output()), _dom_classes=('widget-in…

- <font size="4.5" color='black'>**Explore individual numeric variables and test for normality (on sampled data)**</font>


In [15]:
matplotlib.rcParams['figure.figsize'] = (15.0, 4.0)
@interact_manual
def plot_norm(col=num_cols):          # numerical variable univariate plot
    stats.probplot(df[col].to_numpy(), dist="norm", fit=True, rvalue=True, plot=plt)
    f, (ax_box, ax_hist) = plt.subplots(2,figsize=(15, 8), sharex=True,gridspec_kw={"height_ratios": (.35, .65)})
    # Add a graph in each part
    sns.boxplot(x=df[col], ax=ax_box,color="r")
    sns.histplot(data=df[col],kde=True ,ax=ax_hist)
    # Remove x axis name for the boxplot
    ax_box.set(xlabel='')

interactive(children=(Dropdown(description='col', options=('age', 'fnlwgt', 'educationnum', 'capitalgain', 'ca…


- <font size="4.5" color='black'>**Explore individual categorical variables (sorted by frequencies)**</font>

In [16]:
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.figsize'] = (15.0, 6.0)
CSS = """
.output {
    align-items: center;
}
"""
HTML('<style>{}</style>'.format(CSS))
@interact_manual
def plot(col=cat_cols):# categorical univariate plot
    
    display(HTML(f'<h4>Bar plot and Pie chart of {col}'))
    fig, axs = plt.subplots(1,2)
    df[col].value_counts().plot.pie(ax=axs[1])
    sns.countplot(y=col,data=df,ax=axs[0]);

interactive(children=(Dropdown(description='col', options=('race', 'relationship', 'education', 'occupation', …


<font size="6" color='darkblue'>*5- Explore Interactions Between Variables*</font>


- <font size="4.5" color='black'>**Rank Numeric variables based on linear relationships with reference variable (on sampled data)**</font>

In [17]:
matplotlib.rcParams['figure.figsize'] = (15.0, 6.0)
@interact_manual
def plot(col=widgets.IntSlider(min=0, max=20, step=1, value=4)):# categorical univariate plot
    
    display(HTML(f'<h4>Top {col} Associated Numeric Variables '))
    abs(df[num_cols+["label_IsOver50K"]].corr(method="pearson")['label_IsOver50K'][:]).sort_values(ascending=False).iloc[1:col+1].plot.bar()
    #plt.set_xticklabels(df.index,rotation=90)

interactive(children=(IntSlider(value=4, description='col', max=20), Button(description='Run Interact', style=…

- <font size="4.5" color='black'>**Rank Ctegorical variables based on linear relationships with reference variable (pearson)**</font>

In [18]:
matplotlib.rcParams['figure.figsize'] = (15.0, 6.0)
df0=df.copy()

@interact_manual
def plot(col=widgets.IntSlider(min=0, max=20, step=1, value=4)):
    
    display(HTML(f'<h4>Top {col-1} Associated Categorical Variables (Pearson) '))
    
    df0=df.copy()
    cols_t = df0.columns
    num_cols_t = df0._get_numeric_data().columns
    cat_cols_t=list(set(cols_t) - set(num_cols_t))
    df0[cat_cols_t] = df0[cat_cols_t].apply(lambda x: x.astype('category').cat.codes)
    abs(df0[cat_cols_t+["label_IsOver50K"]].corr(method="pearson")['label_IsOver50K'][:]).sort_values(ascending=False).iloc[1:col+1].plot.bar()
    
del df0

interactive(children=(IntSlider(value=4, description='col', max=20), Button(description='Run Interact', style=…

- <font size="4.5" color='black'>**Rank Ctegorical variables based on linear relationships with reference variable (chisq)**</font>

In [19]:
@interact_manual
def plot(col=widgets.IntSlider(min=0, max=20, step=1, value=4)):# categorical univariate plot
    
    display(HTML(f'<h4>Top {col} Associated Categorical Variables (ChiSq) '))
    cramer_dict = {}
    if len(list(set(cols) - set(num_cols)))>1:
        for each in list(set(cols) - set(num_cols)):
            if each !=target:
                tbl = pd.crosstab(df[target], df[each])
                chisq = stats.chi2_contingency(tbl, correction=False)[0]
                try:
                    cramer = np.sqrt(chisq/sum(tbl))
                except:
                    cramer = np.sqrt(chisq/tbl.values.sum())
                    pass
                cramer_dict[each] = cramer

        topk_cramer = pd.DataFrame.from_dict(cramer_dict, orient='index').unstack().sort_values(\
            kind = 'quicksort', ascending=False).head(col).reset_index().set_index('level_1')
        topk_cramer.columns = ['level_0','CramersV']
        topk_cramer.index.names = ['TopAssociated Categorical Variables']
        sns.set_theme(style="whitegrid")
        g=sns.barplot( x=topk_cramer.index,y='CramersV', data=topk_cramer)
        g.set_xticklabels(g.get_xticklabels(), rotation=45)

interactive(children=(IntSlider(value=4, description='col', max=20), Button(description='Run Interact', style=…


- <font size="4.5" color='black'>**Explore interactions between categorical variables**</font>

In [20]:
matplotlib.rcParams['figure.figsize'] = (15.0, 6.0)

if target in list(set(cols) - set(num_cols)):
    cols_list = [target] + list(set(cols) - set(num_cols)) #Make target the default reference variable
    cols_list = list(OrderedDict.fromkeys(cols_list)) #remove variables that might be duplicates with target
else:
    cols_list = list(set(cols) - set(num_cols))
    
def NoLabels(x):
    return ''

@interact_manual
def plot(col1=cols_list,col2=cols_list):# categorical univariate plot
    
    display(HTML(f'<h4>{col1}☜ VS ☞{col2}'))
    
    if col1 != col2:
        df2 = df[(df[col1].isin(df[col1].value_counts().head(10).index.tolist()))&(df[col2].isin(df[col2].value_counts().head(10).index.tolist())) ]
        df3 = pd.crosstab(df2[col1], df2[col2])
        df3 = df3+1e-8
    else:
        df3 = pd.DataFrame(df[col1].value_counts())[:10]
    fig,ax = plt.subplots()
    fig,rects = mosaic(df3.unstack(),ax=ax, statistic=False,labelizer=NoLabels,  label_rotation=30)
    ax.set_ylabel(col1)
    ax.set_xlabel(col2)
    

interactive(children=(Dropdown(description='col1', options=('label_IsOver50K', 'race', 'relationship', 'educat…

- <font size="4.5" color='black'>**Explore interactions between numerical variables**</font>

In [21]:
#help(df.iplot)
@interact_manual
def scatter_plot(x=num_cols, 
                 y=num_cols[1:],
                 theme=list(cf.themes.THEMES.keys()), 
                 colorscale=list(cf.colors._scales_names.keys())):
    fig=go.Figure()
    df.iplot(kind='scatter', x=x, y=y, mode='markers',interpolation="linear" ,dimensions=(800,600),
             xTitle=x.title(), yTitle=y.title(), 
             text='',
             title=f'Correlation between {x} and {y} is {round(df[x].corr(df[y],method="pearson"),2)} ',
            theme=theme, colorscale=colorscale)


interactive(children=(Dropdown(description='x', options=('age', 'fnlwgt', 'educationnum', 'capitalgain', 'capi…

In [22]:
import plotly.express as px
@interact_manual
def scatter_plot(x=num_cols, 
                 y=num_cols[1:],
                 ):
    display(HTML(f'<h4>Correlation between {x} and {y} is {round(df[x].corr(df[y],method="pearson"),2)} '))
    fig=go.Figure()
    fig.add_trace(go.Scatter(x=df[x], y=df[y], mode='markers',name="scatter plot"))
    m, b = np.polyfit(df[x].to_numpy() , df[y].to_numpy(), 1)
    fig.add_trace(go.Scatter(x=df[x].to_numpy(), y=m*df[x].to_numpy() + b,name="Best Linear Fit"))
    fig.update_layout(
    autosize=False,
    width=850,
    height=500,
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=50,
        pad=4
    ),
    paper_bgcolor="LightSteelBlue",
)
    iplot(fig)


interactive(children=(Dropdown(description='x', options=('age', 'fnlwgt', 'educationnum', 'capitalgain', 'capi…


- <font size="4.5" color='black'>**Explore correlation matrix between numerical variables**</font>

In [23]:
#df[num_cols].corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")
import plotly.figure_factory as ff

cscales = ['Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Bluered', 'RdBu',
            'Reds', 'Blues', 'Picnic', 'Rainbow', 'Portland', 'Jet',
            'Hot', 'Blackbody', 'Earth', 'Electric', 'Viridis', 'Cividis']

corrs = df[num_cols].corr()

@interact_manual
def plot_corrs(colorscale=cscales):
    figure = ff.create_annotated_heatmap(z = corrs.round(2).values, 
                                     x =list(corrs.columns), 
                                     y=list(corrs.index), 
                                     colorscale=colorscale,
                                     annotation_text=corrs.round(2).values)
    iplot(figure)

interactive(children=(Dropdown(description='colorscale', options=('Greys', 'YlGnBu', 'Greens', 'YlOrRd', 'Blue…



- <font size="4.5" color='black'>**Explore interactions between numerical and categorical variables**</font>

In [24]:

@interact_manual
def box_plot2(x=list(set(cols) - set(num_cols)), 
                 y=num_cols,
                 ):
    
    mod = ols('{} ~ {}'.format(y, x), data=df[[y, x]]).fit()
    aov_table = sm.stats.anova_lm(mod, typ=1)
    p_val = round(aov_table['PR(>F)'][0], 6)
    status = 'Passed'
    color = 'blue'
    if p_val < 0.05:
        status = 'Rejected'
        color = 'red'
    df[[x, y]].pivot(columns=x, values=y).iplot(kind='box',title='ho {} (p_value = {})'.format( status, p_val),dimensions=(900,500))

    

interactive(children=(Dropdown(description='x', options=('race', 'relationship', 'education', 'occupation', 'w…


- <font size="4.5" color='black'>**Explore interactions between two numerical variables and a categorical variable (on sampled data)**</font>

In [25]:

@interact_manual
def scatter_plot2(x=num_cols, 
                 y=num_cols,
                 z=list(set(cols) - set(num_cols)),
                 ):
    fig = px.scatter(df, x=x, y=y, color=z)#,template="plotly_dark")
    fig.update_layout(width=950,height=600)
    #margin=dict(l=20, r=20, t=20, b=20),
    #paper_bgcolor="LightSteelBlue")
    fig.show()

interactive(children=(Dropdown(description='x', options=('age', 'fnlwgt', 'educationnum', 'capitalgain', 'capi…


<font size="5" color='darkblue'>*6- Visualize numerical data by projecting to principal component spaces*</font>



- <font size="4.5" color='black'>**Project data to 2-D principal component space**</font>

In [26]:
fig=go.Figure()

df0=df.copy()
cols_t = df0.columns
if target in df0._get_numeric_data().columns:
    num_cols_t = df0._get_numeric_data().columns.drop(target)
    
cat_cols_t=list(set(cols_t) - set(num_cols_t))
cat_cols_t.remove(target)

features= num_cols_t
x = df0.loc[:, features].values
y = df0.loc[:,target].values
# Standardizing the features

x = StandardScaler().fit_transform(x)
pca = PCA()
components = pca.fit_transform(x)
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
principalDf = pd.DataFrame(data = components)
finalDf = pd.concat([principalDf, df0[[target]]], axis = 1)

fig=px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Cumsum Explained Variance"})
fig.update_layout(width=650,height=400)


fig=px.bar(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=pca.explained_variance_ratio_,
    labels={"x": "# Components", "y": "Explained Variance"})

fig.update_layout(width=650,height=400)

@interact_manual
def pca_plot(PCA_Num1=widgets.IntSlider(min=0, max=finalDf.shape[1]-2, step=1, value=0),PCA_Num2=widgets.IntSlider(min=0, max=finalDf.shape[1]-2, step=1, value=1)):
    fig = px.scatter(components, x=PCA_Num1, y=PCA_Num2, color=df0[target])
    
    fig.add_shape(
        type='line',
        x0=0, y0=0,
        x1=loadings[0, 0],
        y1=loadings[0, 1]
    ).add_annotation(
        x=loadings[0, 0],
        y=loadings[0, 1],
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=target,
    )
    fig.show()


interactive(children=(IntSlider(value=0, description='PCA_Num1', max=5), IntSlider(value=1, description='PCA_N…

In [27]:
# Provide the path to the yaml file relative to the working directory
display(HTML('''<style>
    .widget-label { min-width: 20ex !important; }
    .widget-text { min-width: 60ex !important; }
</style>'''))

#Toggle Code
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();

 } else {
 $('div.input').show();

 }
 code_show = !code_show
} 
//$( document ).ready(code_toggle);//commenting code disabling by default
</script>
<form action = "javascript:code_toggle()"><input type="submit" value="Toggle Raw Code"></form>''')