This notebook is based on the notebook at https://www.kaggle.com/dwin183287/tps-june-2021-eda. To practise Plotly, I have tried to replicate its graphs using Plotly. For more information, please follow the link above.

In [None]:
# Import the libraries
import numpy as np
import pandas as pd

# Show every row and every column when a DataFrame is displayed.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the train and test CSV files of the competition into dataframes.
train_df, test_df = map(
    pd.read_csv,
    (
        "/kaggle/input/tabular-playground-series-jun-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jun-2021/test.csv",
    ),
)

In [None]:
# Preview the first five rows of the train dataframe.
train_df.head(n=5)

In [None]:
# Report the size of the train dataframe and how many cells are missing.
n_rows, n_cols = train_df.shape
n_missing = sum(train_df.isna().sum())  # per-column NaN counts, added up
print(f'''No. of rows: {n_rows}; No. of columns: {n_cols}; 
No. of missing value: {n_missing}''')

In [None]:
# Inspect the dtype of every column in the train dataframe.
train_df.dtypes

In [None]:
# Summary statistics of the train dataframe, one row per column.
train_df.describe().transpose()

In [None]:
# Summary statistics of the target column alone.
print("Target column statistics: ")
train_df.loc[:, "target"].describe()

In [None]:
# Count how many rows belong to each class of the target column.
print("Frequency of each class:")
train_df.loc[:, "target"].value_counts()

In [None]:
# Share of each target class, as a fraction of all rows and as a percentage.
# The frame is built with explicit column names instead of relying on what
# reset_index() happens to call the former index: that name depends on the
# pandas version ("index" before 2.0, the column name from 2.0 on), and the
# plotting code reads the classes from the "index" column.
target_fraction = (train_df["target"].value_counts() / len(train_df)).sort_index()
target_percentage = target_fraction * 100
temp_target = pd.DataFrame({
    "index": target_fraction.index,                    # class label, sorted
    "target": target_fraction.to_numpy(),              # fraction of rows
    "target_percent": target_percentage.to_numpy(),    # same value in percent
})
temp_target.head(10)

In [None]:
# Bar chart of the percentage of rows that fall into each target class.
fig = px.bar(
    x = temp_target["index"],
    y = temp_target["target_percent"],
    color_discrete_sequence = ["#ff355d"] * len(temp_target),
    text = temp_target["target_percent"]
    )

# Axis title fonts are set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
fig.update_layout(
    plot_bgcolor = "#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize = True,
    xaxis = dict(
        title_text = "<b> Class </b>",
        title_font = dict(size = 12)
    ),
    yaxis = dict(
        title_text = "<b> Target(%) </b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Target Distribution </b>",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 5,
    title_pad_l = 20
    )

# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")
fig.update_xaxes(showticklabels = True, showline = True, linewidth = 2, linecolor = "black")

# Print the (rounded) percentage above each bar; hide labels that would need
# a font smaller than 8 to fit.
fig.update_traces(texttemplate = "%{text:.2s}", textposition = "outside")
fig.update_layout(uniformtext_minsize = 8, uniformtext_mode = "hide")

fig.show()

In [None]:
# Preview the first five rows of the test dataframe.
test_df.head(n=5)

In [None]:
# Report the size of the test dataframe and how many cells are missing.
# The figure is a count of missing VALUES (NaN cells summed over all columns),
# so the message says "values" rather than "rows".
n_rows, n_cols = test_df.shape
n_missing = sum(test_df.isna().sum())
print(f"Number of rows: {n_rows}, Number of columns: {n_cols}, Number of missing values: {n_missing}")

In [None]:
# Inspect the dtype of every column in the test dataframe.
test_df.dtypes

In [None]:
# Summary statistics of the test dataframe, one row per column.
test_df.describe().transpose()

In [None]:
# Every train column except the row identifier and the label is a feature.
non_feature_columns = ("id", "target")
features = [column for column in train_df.columns if column not in non_feature_columns]

In [None]:
# Collect the distinct values that occur in any train feature column.
# The two seed zeros are kept, so the result always contains 0.
per_feature_values = [train_df[name].unique() for name in features]
unique_value_train = np.unique(
    np.concatenate([np.zeros(2), *per_feature_values])  # sorted, de-duplicated
)

In [None]:
# Collect the distinct values that occur in any test feature column.
# The two seed zeros are kept, so the result always contains 0.
per_feature_values = [test_df[name].unique() for name in features]
unique_value_test = np.unique(
    np.concatenate([np.zeros(2), *per_feature_values])  # sorted, de-duplicated
)

In [None]:
# Two-column table: feature name and the number of distinct values it takes
# in the train data.
unique_value_feature_train = (
    train_df[features]
    .nunique()
    .rename_axis("Features")
    .reset_index(name="Count")
)
unique_value_feature_train

In [None]:
# Two-column table: feature name and the number of distinct values it takes
# in the test data.
unique_value_feature_test = (
    test_df[features]
    .nunique()
    .rename_axis("Features")
    .reset_index(name="Count")
)
unique_value_feature_test

In [None]:
# Grouped bar chart: number of distinct values per feature, train next to test.
fig = go.Figure()

fig.add_trace(go.Bar(
    x = unique_value_feature_train["Features"],
    y = unique_value_feature_train["Count"],
    name='Train Features',
    marker_color="#0099cc"
))
fig.add_trace(go.Bar(
    x = unique_value_feature_test["Features"],
    y = unique_value_feature_test["Count"],
    name='Test Features',
    marker_color='#ff355d'
))

# The axis title font is set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
fig.update_layout(
                  barmode='group', 
                  xaxis_tickangle=-45,
                  plot_bgcolor="#EEEEEE",
                  paper_bgcolor = "#f6f5f5",
                  autosize=True,
                  yaxis = dict(
                          title_text = "<b>Count</b>",
                          title_font = dict(size = 12)),
                  title_text = "<b> Feature Count of train and test data </b><br>",
                  title_font_size = 16,
                  title_font_color = "black",
                  title_pad_t = 2.3,
                  title_pad_l = 18
                  
                 )  
# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')

fig.show()

In [None]:
# Per feature, distinct-value count in train minus the count in test; only
# the features whose counts differ between the two datasets are kept.
count_gap = unique_value_feature_train["Count"] - unique_value_feature_test["Count"]
unique_value_feature_diff = unique_value_feature_train.assign(Count=count_gap)
unique_value_feature_diff = unique_value_feature_diff.loc[count_gap != 0]
unique_value_feature_diff

In [None]:
# Bar chart of the train-minus-test difference in distinct-value counts,
# restricted to the features where the two datasets differ.
fig = px.bar(x = unique_value_feature_diff["Features"], y = unique_value_feature_diff["Count"])

# Axis title fonts are set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
fig.update_layout( 
                  plot_bgcolor="#EEEEEE",
                  paper_bgcolor = "#f6f5f5",
                  autosize=True,
                  xaxis = dict(
                          title_text = "<b>Features</b>",
                          title_font = dict(size = 12)),
                  yaxis = dict(
                          title_text = "<b>Count</b>",
                          title_font = dict(size = 12)),
                  title_text = "<b> Difference of number of unique values in features </b><br>",
                  title_font_size = 16,
                  title_font_color = "black",
                  title_pad_t = 2.3,
                  title_pad_l = 18
                  
                 )  
# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')

fig.show()

In [None]:
# For every train row, count how often each distinct value occurs across the
# feature columns; a value that never occurs in a row gets a count of 0.
transpose_feature_train = (
    train_df[features]
    .apply(pd.Series.value_counts, axis = 1)
    .fillna(0)
)

In [None]:
# For every test row, count how often each distinct value occurs across the
# feature columns; a value that never occurs in a row gets a count of 0.
transpose_features_test = (
    test_df[features]
    .apply(pd.Series.value_counts, axis = 1)
    .fillna(0)
)

In [None]:


# Two stacked histograms of the distinct feature values: train on the first
# row, test on the second.
fig = make_subplots(rows = 2, cols = 1, shared_yaxes=True, 
                   x_title="Unique Values",  y_title="Count")

# add_trace(..., row=, col=) is used instead of the deprecated append_trace.
fig.add_trace(go.Histogram(
                    x = unique_value_train,
                    marker=dict(color="#ffd514"),
                    name = 'train',
                    nbinsx = 352
                    ),
                    row = 1, col = 1
                )
fig.add_trace(go.Histogram(
                    x = unique_value_test,
                    marker = dict(color = '#ff355d'),
                    name = 'test',
                    nbinsx = 352),
                    row = 2, col = 1
                )


fig.update_layout(height = 300,
                  title_text = "<b> Unique Values</b> <br>" + " Unique value for train and test data set", 
                  paper_bgcolor = "#f6f5f5", plot_bgcolor = "#f6f5f5")
fig.update_yaxes(showticklabels=False)
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')

# Render explicitly, like the other figure cells, instead of relying on the
# notebook echoing the value of the last expression.
fig.show()


In [None]:


# Side-by-side horizontal bar charts of the number of distinct values per
# feature: train in the left column, test in the right column.
fig = make_subplots(rows = 1, cols = 2,  
                   x_title="Unique Values",  horizontal_spacing = 0.2)

# add_trace(..., row=, col=) is used instead of the deprecated append_trace.
fig.add_trace(go.Bar(
                    x = unique_value_feature_train['Count'],
                    y = unique_value_feature_train['Features'],
                    marker=dict(color="#ffd514"),
                    name = 'train',
                    orientation='h'
                    ),
                    row = 1, col = 1
                )
fig.add_trace(go.Bar(
                    x = unique_value_feature_test['Count'],
                    y = unique_value_feature_test['Features'],
                    marker = dict(color = '#ff355d'),
                    name = 'test',
                    orientation='h'
                    ),
                    row = 1, col = 2
                )


fig.update_layout( height = 1000,
                  title_text =  " <b> Unique value for train and test data set </b>", 
                  paper_bgcolor = "#f6f5f5", plot_bgcolor = "#f6f5f5")
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showline=True, linewidth=2, linecolor='black')

# Render explicitly, like the other figure cells, instead of relying on the
# notebook echoing the value of the last expression.
fig.show()


In [None]:


# Labelled bar chart of the train-minus-test difference in distinct-value
# counts, for the features where the two datasets differ.
fig = px.bar(x=unique_value_feature_diff['Features'], y=unique_value_feature_diff['Count'], 
            color_discrete_sequence =['#0099cc']*len(unique_value_feature_diff), 
             text=unique_value_feature_diff['Count'])


# Axis title fonts are set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
fig.update_layout(plot_bgcolor="#EEEEEE",
                  paper_bgcolor = "#f6f5f5",
                  autosize= True,
                  xaxis = dict(
                          title_text = "<b>Features</b>",
                          title_font = dict(size=12)),
                  yaxis = dict(
                          title_text = "<b>Unique values</b>",
                          title_font = dict(size = 12)),
                  title_text = "<b> Unique Values for Train dataset </b><br>" + 
                                " Positive means that train dataset has higher unique value than test dataset",
                  title_font_size = 16,
                  title_font_color = "black",
                  title_pad_t = 2.3,
                  title_pad_l = 18
                  
                 )  
# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')

# Print the difference above each bar; hide labels that would need a font
# smaller than 8 to fit.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

In [None]:
# Average per-row occurrence count of each distinct value in the train data,
# reduced to the ten values with the highest mean.
mean_unique_value_train = (
    transpose_feature_train.mean(axis=0)
    .rename_axis("Unique")
    .reset_index(name="Mean")
    .sort_values("Mean", ascending=False)
    .head(10)
)
mean_unique_value_train

In [None]:
# Bar chart of the ten values with the highest mean per-row occurrence in the
# train data.
fig = px.bar(
    x=mean_unique_value_train['Unique'],
    y=mean_unique_value_train['Mean'],
    # One colour per plotted bar: sized by the frame being plotted, not by the
    # unrelated unique_value_feature_diff table (which may even be empty).
    color_discrete_sequence =['#ffd514']*len(mean_unique_value_train), 
    text=mean_unique_value_train['Mean']
)


# Axis title fonts are set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
# The subtitle mirrors the matching test-data chart; the previous text was
# copied from the train-vs-test difference chart and did not describe this plot.
fig.update_layout(
    plot_bgcolor="#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize=True,
    xaxis = dict(
        title_text = "<b>Unique values</b>",
        title_font = dict(size=12)
    ),
    yaxis = dict(
        title_text = "<b>Mean occurance</b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Unique Values for Train dataset </b><br>" + 
                 " Number zero is dominating in every row",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 2.3,
    title_pad_l = 18
                  
                 )  
# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')

# Print the mean above each bar; hide labels that would need a font smaller
# than 8 to fit.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

In [None]:
# Average per-row occurrence count of each distinct value in the test data,
# reduced to the ten values with the highest mean.
mean_unique_value_test = (
    transpose_features_test.mean(axis=0)
    .rename_axis("Unique")
    .reset_index(name="Mean")
    .sort_values("Mean", ascending=False)
    .head(10)
)
mean_unique_value_test

In [None]:


# Bar chart of the ten values with the highest mean per-row occurrence in the
# test data.
fig = px.bar(
    x=mean_unique_value_test['Unique'],
    y=mean_unique_value_test['Mean'],
    # One colour per plotted bar: sized by the frame being plotted, not by the
    # unrelated unique_value_feature_diff table (which may even be empty).
    color_discrete_sequence =['#ff355d']*len(mean_unique_value_test), 
    # Bar labels come from the TEST means so they match the bar heights
    # (they were previously taken from the train table).
    text=mean_unique_value_test['Mean']
)


# Axis title fonts are set through `title_font` (i.e. axis.title.font); the
# older `titlefont` attribute is deprecated and rejected by recent plotly.
fig.update_layout(
    plot_bgcolor="#EEEEEE",
    paper_bgcolor = "#f6f5f5",
    autosize=True,
    xaxis = dict(
        title_text = "<b>Unique values</b>",
        title_font = dict(size=12)
    ),
    yaxis = dict(
        title_text = "<b>Mean occurance</b>",
        title_font = dict(size = 12)
    ),
    title_text = "<b> Unique Values for Test dataset </b><br>" + 
                 " Number zero is dominating in every row",
    title_font_size = 16,
    title_font_color = "black",
    title_pad_t = 2.3,
    title_pad_l = 18
                  
                 )  
# Draw solid black axis lines on both axes.
fig.update_yaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')
fig.update_xaxes(showticklabels=True, showline=True, linewidth=2, linecolor='black')

# Print the mean above each bar; hide labels that would need a font smaller
# than 8 to fit.
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

In [None]:
# Per train row: the count in the first column of the value-count table
# (presumably the count of the value 0 -- confirm that 0 is the smallest
# value) versus the summed counts of all remaining columns.
zero_positive_train = pd.DataFrame({
    "zero": transpose_feature_train.iloc[:, 0],
    "positive": transpose_feature_train.iloc[:, 1:].sum(axis = 1),
})

In [None]:
# Per test row: the count in the first column of the value-count table
# (presumably the count of the value 0 -- confirm that 0 is the smallest
# value) versus the summed counts of all remaining columns.
zero_positive_test = pd.DataFrame({
    "zero": transpose_features_test.iloc[:, 0],
    "positive": transpose_features_test.iloc[:, 1:].sum(axis = 1),
})

In [None]:
# Distribution plot of the per-row "positive" and "zero" counts in the test data.
group_labels = ["positive", "zero"]
hist_data = [zero_positive_test[label] for label in group_labels]
colors = ['#ff5573', '#0099cc']

fig = ff.create_distplot(hist_data, group_labels, colors = colors)

# Solid black axis lines on both axes.
axis_line = dict(showline=True, linewidth=2, linecolor='black')
fig.update_yaxes(showticklabels=True, **axis_line)
fig.update_xaxes(**axis_line)

# Make only the histogram traces slightly transparent.
fig.update_traces(opacity= .8, selector=dict(type='histogram'))
fig.update_layout(
    height = 300,
    title_text = "<b>Test Data Set</b>",
    xaxis_title_text = '<b>Number of Occurance</b>',
    yaxis_title_text = 'Count',
    paper_bgcolor = "#f6f5f5",
    plot_bgcolor = "#f6f5f5",
)

fig.show()

In [None]:
# Percent-normalised histograms of the per-row "positive" and "zero" counts
# in the train data, one trace per column.
histogram_specs = (
    ("positive", "#ff5573"),
    ("zero", "#0099cc"),
)

fig = go.Figure()
for column, bar_color in histogram_specs:
    fig.add_trace(
        go.Histogram(
            x=zero_positive_train[column],
            histnorm="percent",
            name=column,  # shown in the legend and hover label
            marker_color=bar_color,
            opacity=1,
        )
    )

fig.update_layout(
    title_text="<b>Train Data Set</b>",
    xaxis_title_text="<b>Number of Occurance</b>",
    yaxis_title_text="Count",
    height=300,
    bargap=0.01,          # gap between bars of adjacent location coordinates
    bargroupgap=0.0001,   # gap between bars of the same location coordinates
    paper_bgcolor="#f6f5f5",
    plot_bgcolor="#f6f5f5",
)

# Outline the bars and set the pattern fill mode on the histogram traces only.
fig.update_traces(
    selector=dict(type="histogram"),
    marker_line_width=0.9,
    marker_pattern_fillmode="overlay",
)

# Solid black axis lines on both axes.
axis_line = dict(showline=True, linewidth=2, linecolor="black")
fig.update_yaxes(showticklabels=True, **axis_line)
fig.update_xaxes(**axis_line)


fig.show()

If you have any suggestions, or if you liked this work, please comment and upvote!