In [None]:
" Essential Modules "

from dataclasses import dataclass
from tqdm import tqdm
import time
import pandas as pd

" Visualizations "

import plotly.graph_objects as go
import seaborn as sns
import plotly.io as pio
import plotly.figure_factory as ff

" Stats "

from scipy.stats import skew, mode, kurtosis

In [None]:
# Path to the training CSV, relative to the notebook's working directory.
# It is handed to CommerceFreight when the instance is created further down.
colab = '../input/customer-analytics/Train.csv'


@dataclass
class CommerceFreight():
    """Load a CSV dataset and apply light, notebook-friendly preprocessing.

    Attributes created by the methods (they are not constructor arguments):
        original:   working DataFrame, set by ``read``.
        old_memory: deep memory footprint in bytes before ``optimize``.
        new_memory: deep memory footprint in bytes after ``optimize``.
        int_df:     numeric columns of ``original``, set by ``split``.
        cat_df:     all remaining columns of ``original``, set by ``split``.
    """
    path: str  # Location of the CSV file, passed straight to pandas.read_csv.

    def read(self):
        """Read the CSV and promote the first all-unique column to the index.

        A column qualifies when its number of distinct non-null values equals
        the number of rows. If no column qualifies, the default index is kept.

        Returns:
            None. The loaded frame is stored on ``self.original``.
        """
        dataframe = pd.read_csv(self.path)
        self.original = dataframe

        row_count = len(dataframe)
        for column in dataframe.columns:
            # nunique() skips NaN, so a column with missing values can never
            # match the row count and is not treated as a key.
            if dataframe[column].nunique() != row_count:
                continue
            try:
                self.original = dataframe.set_index(column)
            except (KeyError, TypeError, ValueError):
                # Best effort: this candidate could not be used, try the next.
                continue
            return None
        return None

    def iformat(self):
        """Lower-case the column names of ``self.original`` in place.

        Returns:
            The same DataFrame object, with renamed columns.
        """
        self.original.columns = self.original.columns.str.lower()
        return self.original

    def summarize(self):
        """Build a per-column overview of ``self.original``.

        Returns:
            DataFrame indexed by column name with three columns:
            ``null_values`` (count of missing entries), ``nunique`` (count of
            distinct non-null values) and ``dtype`` (dtype name as a string).
        """
        frame = self.original
        return pd.DataFrame({
            'null_values': frame.isna().sum(),
            'nunique': frame.nunique(),
            'dtype': frame.dtypes.map(lambda dtype: dtype.name),
        })

    def optimize(self):
        """Convert memory-heavy columns to ``category`` and chart the effect.

        A column is converted when its deep memory usage is above the mean
        deep usage of all columns. The index is left out of both sides of that
        comparison so columns are compared against columns only.

        Side effects:
            Mutates ``self.original``; sets ``self.old_memory`` and
            ``self.new_memory`` (bytes, deep, index included); shows a bar
            chart comparing the two.

        Returns:
            None (the result of ``fig.show()``).
        """
        column_memory = self.original.memory_usage(deep=True, index=False)
        threshold = column_memory.mean()

        # deep=True so object columns count their string payload, not just
        # the pointers; otherwise the savings shown below are understated.
        self.old_memory = self.original.memory_usage(deep=True).sum()

        # The bar is sized from the real column count, one tick per column.
        for column, used in tqdm(column_memory.items(), total=len(column_memory)):
            if used > threshold:
                self.original[column] = self.original[column].astype('category')

        self.new_memory = self.original.memory_usage(deep=True).sum()

        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=['Old', 'New'],
            y=[self.old_memory, self.new_memory],
            marker_color=['#f25c54', '#f7b267']))
        fig.update_yaxes(showgrid=False)
        fig.update_layout(
            height=500, width=400, autosize=False,
            title='<b>Memory</b>', template='plotly_white',
            font=dict(size=12, color='gray'),
            margin=dict(l=50, r=50, b=100, t=100, pad=10)
        )

        return fig.show()

    def split(self):
        """Partition ``self.original`` into numeric and non-numeric columns.

        Every column ends up in exactly one of the two frames:
        ``self.int_df`` holds the numeric dtypes, ``self.cat_df`` holds
        everything else (category, object, bool, datetime, ...).

        Returns:
            None (the result of ``print``); a short confirmation is printed.
        """
        # include/exclude on the same selector guarantees a true partition.
        self.int_df = self.original.select_dtypes(include=['number'])
        self.cat_df = self.original.select_dtypes(exclude=['number'])

        return print("""
         created:
            int_df -> dataframe
            cat_df -> dataframe
        """)


In [None]:
class Stat():
    """Descriptive-statistics helpers that return pandas Styler objects."""

    def describe(self, dataframe):
        """Summarize the distribution of the columns of ``dataframe``.

        Args:
            dataframe: DataFrame whose columns ``DataFrame.describe`` can
                summarize.

        Returns:
            Styler over ``dataframe.describe()`` without the ``count`` row,
            shaded with seaborn's "flare" colormap.
        """
        palette = sns.color_palette("flare", as_cmap=True)
        stats = dataframe.describe().drop('count', axis=0)

        return stats.style.background_gradient(cmap=palette)

    def dispersion(self, dataframe):
        """Tabulate skew and kurtosis for every column of ``dataframe``.

        Both statistics come from ``scipy.stats`` and are called with their
        default arguments on each column's raw values.

        Args:
            dataframe: DataFrame of numeric columns.

        Returns:
            Styler over a frame indexed by column name (index named
            ``int_columns``) with ``skew`` and ``kurtosis`` columns. Negative
            cells are coloured by ``color_negative_red`` and each column's
            maximum is marked by ``highlight_max``.
        """
        names = list(dataframe.columns)
        dispersion = pd.DataFrame(
            {
                'skew': [skew(dataframe[name].values) for name in names],
                'kurtosis': [kurtosis(dataframe[name].values) for name in names],
            },
            index=pd.Index(names, name='int_columns'),
        )

        styler = dispersion.style
        # Styler.applymap was renamed to Styler.map in pandas 2.1; prefer the
        # new name when it exists and fall back for older pandas versions.
        elementwise = styler.map if hasattr(styler, 'map') else styler.applymap

        return elementwise(color_negative_red).apply(highlight_max)

In [None]:
@dataclass
class Visualization():
    """Plot helpers that share one layout (font and margin) across figures."""
    margin: dict  # Forwarded unchanged to figure.update_layout(margin=...).
    font: dict    # Forwarded unchanged to figure.update_layout(font=...).
    # Executed once, when this class body runs: sets plotly's default template.
    pio.templates.default = 'plotly_white'

    def layout(self, figure):
        """Apply the shared font and margin to ``figure``.

        Args:
            figure: object exposing ``update_layout`` (a plotly figure).

        Returns:
            Whatever ``figure.update_layout`` returns.
        """
        return figure.update_layout(
            font=self.font,
            margin=self.margin
        )

    def line_plot(self, dataframe, x, y, scatter, plot_title, sub_title):
        """Draw one column of ``dataframe`` against another.

        Args:
            dataframe: source of the plotted data.
            x: column label used for the x axis.
            y: column label used for the y axis.
            scatter: truthy for markers only, falsy for lines plus markers.
            plot_title: bold first line of the title.
            sub_title: second line of the title.

        Returns:
            The figure after the shared layout has been applied.
        """
        line = go.Figure()

        line.add_trace(go.Scatter(
            x=dataframe[x],
            y=dataframe[y],
            mode='markers' if scatter else 'lines+markers'
        ))

        line.update_layout(
            title='<b>{}</b><br>{}'.format(plot_title, sub_title))

        return self.layout(line)

    def pair_plot(self, dataframe):
        """Return a seaborn pairplot of ``dataframe``."""
        return sns.pairplot(dataframe)

    def generate_kde(self, dataframe):
        """Store per-column values and labels for ``plot_kde``.

        Sets ``self.kde_data`` (one value array per column) and
        ``self.kde_label`` (the matching column names), then prints a
        confirmation message. Returns None.
        """
        self.kde_data = [dataframe[column].values for column in dataframe]
        self.kde_label = [dataframe[column].name for column in dataframe]
        print('KDE data preparation done.')

    def plot_kde(self):
        """Yield one KDE figure per column prepared by ``generate_kde``.

        Yields:
            A distplot figure (histogram hidden) with the shared layout
            applied, one for each stored data/label pair.

        Raises:
            AttributeError: on first iteration, if ``generate_kde`` has not
                been called on this instance.
        """
        if not (hasattr(self, 'kde_data') and hasattr(self, 'kde_label')):
            raise AttributeError(
                'KDE inputs are missing: call generate_kde(dataframe) '
                'before plot_kde().')

        for values, label in zip(self.kde_data, self.kde_label):
            kde = ff.create_distplot(
                [values], [label], colors=['#333F70'], show_hist=False)

            yield self.layout(kde)

In [None]:
def color_negative_red(val):
    """
    Map a scalar to a CSS colour rule: `'color: red'` when the value is
    below zero, `'color: black'` otherwise.
    """
    if val < 0:
        return 'color: red'
    return 'color: black'

def highlight_max(s):
    '''
    Return one CSS string per element of a Series: a yellow background for
    every element equal to the Series maximum, an empty string for the rest.
    '''
    marked = 'background-color: yellow'
    peak = s.max()
    styles = []
    for hit in s == peak:
        styles.append(marked if hit else '')
    return styles

In [None]:
# Create the dataset wrapper. Nothing is read from disk yet: the constructor
# only stores the path, and df.read() performs the actual load.
df = CommerceFreight(colab)

In [None]:
# Read and format the dataset: read() loads the CSV into df.original, and
# iformat() lower-cases the column names and returns the frame, which this
# cell displays as its output.
df.read()
df.iformat()

In [None]:
# Per-column overview: null count, distinct-value count and dtype name.
df.summarize()

In [None]:
# Convert columns with above-average memory usage to 'category' and show a
# bar chart of total memory before vs. after. This mutates df.original.
df.optimize()

In [None]:
# Populate df.int_df and df.cat_df from df.original (prints a confirmation).
df.split()

In [None]:
# Inspect the frame that split() stored on df.int_df.
df.int_df

In [None]:
# Inspect the frame that split() stored on df.cat_df.
df.cat_df

In [None]:
# Shared layout settings that Visualization.layout() applies to each figure.
viz = Visualization(
    margin={'l': 50, 'r': 50, 't': 100, 'b': 100, 'pad': 10},
    font={'size': 12},
)

In [None]:
# Store each df.int_df column's values and name on viz for plot_kde().
viz.generate_kde(df.int_df)

In [None]:
# plot_kde() is a generator: render each KDE figure as it is produced.
for kde_figure in viz.plot_kde():
    kde_figure.show()

In [None]:
# Skew and kurtosis for each column of df.int_df, shown as a styled table.
stat = Stat()

stat.dispersion(df.int_df)

In [None]:
# describe() statistics for df.int_df (count row dropped), gradient-shaded.
stat.describe(df.int_df)