In [1]:
# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly
from ydata_profiling import  ProfileReport
import PyQt5 as qt
from IPython.display import display, Markdown
# Render matplotlib figures inline in the Jupyter notebook.
# If the PyQt5 import above fails, install it first: pip install PyQt5
get_ipython().run_line_magic('matplotlib', 'inline')
# Evaluate the active backend name so the cell output confirms which backend is in use.
matplotlib.get_backend()

'module://matplotlib_inline.backend_inline'

In [3]:
# Load Data

In [4]:
# Read the cleaned maintenance workbook; the path is relative to the kernel's working directory.
df = pd.read_excel(io='maintenance_cleaned_extended.xlsx')

In [5]:
# Visualization Functions

In [6]:
## Bar, Scatter, Line charts

In [7]:
def myPlot(data,plotType,title):
    """Plot a Series as a bar, scatter or line chart, ordered by ascending value.

    Parameters
    ----------
    data : pandas.Series (or Series-like)
        Values to plot. The index (cast to str) supplies the x-axis
        categories and the values supply the y-axis.
    plotType : str
        One of 'bar', 'scatter' or 'line'.
    title : str
        Chart title; the suffix ' Analysis' is appended.

    Raises
    ------
    ValueError
        If ``plotType`` is not a supported chart type.
    """
    # Fail early with a clear message; previously an unknown type fell through
    # every branch and crashed later with UnboundLocalError on `fig`.
    if plotType not in ('bar', 'scatter', 'line'):
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected 'bar', 'scatter' or 'line'"
        )

    data = data.sort_values(ascending=True)
    xs = data.index.astype(str)  # string labels so the x-axis is categorical
    ys = data.values
    chart_title = title + ' Analysis'

    if plotType == 'bar':
        fig = px.bar(data_frame=data, x=xs, y=ys, color=ys, title=chart_title)
    elif plotType == 'scatter':
        fig = px.scatter(data_frame=data, x=xs, y=ys, color=ys, title=chart_title)
    else:
        # 'line': drawn without a colour mapping
        fig = px.line(data_frame=data, x=xs, y=ys, title=chart_title)

    fig.update_layout(title_x=0.45)
    fig.show()

In [8]:
def myPlot1(data,xs,ys,clr,plotType,title, sort_by=None, ascending=True):
    """Plot two DataFrame columns as a bar, scatter or line chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table.
    xs, ys : column label
        Columns used for the x and y axes; their labels also become the
        axis titles. The x column is cast to str before plotting.
    clr : column label
        Column used for the colour mapping. It is cast to str before being
        handed to plotly express.
    plotType : str
        One of 'bar', 'scatter' or 'line'.
    title : str
        Chart title; the suffix ' Analysis' is appended.
    sort_by : label or list of labels, optional
        Passed to ``DataFrame.sort_values(by=...)``. When None the data is
        plotted in its existing order.
    ascending : bool or list of bool, default True
        Passed to ``DataFrame.sort_values(ascending=...)``.

    Raises
    ------
    ValueError
        If ``plotType`` is not a supported chart type.
    """
    chart_builders = {'bar': px.bar, 'scatter': px.scatter, 'line': px.line}
    # Fail early with a clear message; previously an unknown type fell through
    # every branch and crashed later with UnboundLocalError on `fig`.
    if plotType not in chart_builders:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected 'bar', 'scatter' or 'line'"
        )

    if sort_by is None:
        data_sorted = data
    else:
        data_sorted = data.sort_values(by=sort_by, ascending=ascending)

    x_values = data_sorted[xs].astype(str)
    y_values = data_sorted[ys]
    color_values = data_sorted[clr].astype(str)

    fig = chart_builders[plotType](
        data_frame=data_sorted,
        x=x_values,
        y=y_values,
        color=color_values,
        title=title + ' Analysis',
    )
    # Centre the title and label the axes with the column labels the caller passed.
    fig.update_layout(title_x=0.5, xaxis_title=str(xs), yaxis_title=str(ys))
    fig.show()

In [9]:
def myBoxPlot(data,x,y,color,title):
    """Show a plotly express box plot of ``y`` against ``x``, coloured by ``color``.

    The title is used as given and centred; the axis titles are the string
    forms of ``x`` and ``y``.
    """
    box_figure = px.box(data, x=x, y=y, color=color, title=title)
    layout_options = {
        'title_x': 0.5,
        'xaxis_title': str(x),
        'yaxis_title': str(y),
    }
    box_figure.update_layout(**layout_options)
    box_figure.show()

In [10]:
## Sunburst chart

In [11]:
def mySunBurst(data, name, value, title):
    """Show a plotly express sunburst chart.

    ``name`` is the hierarchy passed as ``path`` (for example
    ['cost_category', 'damage type']), ``value`` is the column passed as
    ``values``, and ``title`` gets the suffix ' Analysis'.
    """
    sunburst_options = {
        'data_frame': data,
        'path': name,
        'values': value,
        'title': title + ' Analysis',
    }
    chart = px.sunburst(**sunburst_options)
    chart.update_layout(title_x=0.45)
    chart.show()

In [12]:
## Pie chart

In [13]:
def myPie(data,title_prefix, top_n=5):
    """Show a pie chart of a Series, one slice per index label.

    Parameters
    ----------
    data : pandas.Series (or Series-like)
        The index supplies the slice names and the values the slice sizes.
    title_prefix : str
        Subject of the chart; used in the middle of the title.
    top_n : int or None, default 5
        Number shown in the 'Top N' title prefix. This only affects the
        label: the data is plotted as given, so the caller is responsible
        for passing the matching number of rows. Pass None to omit the
        'Top N' prefix entirely.
    """
    name  = data.index
    value = data.values
    # Previously the title always said 'Top 5' regardless of the data passed in.
    top_label = '' if top_n is None else 'Top ' + str(top_n) + ' '
    fig = px.pie(data_frame=data,
                 names = name,
                 values = value,
                 title = top_label + title_prefix + ' Analysis'
                )
    fig.update_layout(title_x=0.5)

    fig.show()

In [14]:
## Combine DataFrames

In [84]:
def combine(data,first_field,first_field_count,field_grouped_on,resulting_field_value):
    """Summarise ``data`` per value of ``first_field``: summed total and row count.

    Returns a 3-tuple ``(first_field_count, resulting_field_value, summary)``
    where the first two items echo the column names passed in and ``summary``
    is a DataFrame with the columns ``first_field``, ``resulting_field_value``
    (sum of ``field_grouped_on``) and ``first_field_count`` (number of
    occurrences of each ``first_field`` value).
    """
    # Per-group sum of the measured column.
    totals = (
        data.groupby([first_field])[field_grouped_on]
        .sum()
        .reset_index(name=resulting_field_value)
    )
    # Per-group occurrence count, with explicit column names.
    occurrences = (
        data[first_field]
        .value_counts()
        .rename_axis(first_field)
        .reset_index(name=first_field_count)
    )
    summary = totals.merge(occurrences, on=first_field)
    return first_field_count, resulting_field_value, summary

In [16]:
## Bivariate Analysis

In [None]:
#### Total cost distribution across different years

In [51]:
# Sum cost per year, then show it as a sunburst and as a bar chart ordered by ascending total.
df_year_cost = (
    df.groupby(['yearReady'])['cost']
    .sum()
    .rename('total_year_cost')
    .reset_index()
)
mySunBurst(
    data=df_year_cost,
    name=['yearReady'],
    value='total_year_cost',
    title='Total cost distribution across different years',
)
myPlot1(
    df_year_cost,
    'yearReady',
    'total_year_cost',
    'total_year_cost',
    'bar',
    'Total cost distribution across different years',
    sort_by=['total_year_cost'],
    ascending=[True],
)

In [52]:
#### Total cost distribution across different car models

In [53]:
# Sum cost per car model, then show it as a sunburst and as a bar chart ordered by ascending total.
df_car_cost = df.groupby(['car'])['cost'].sum().reset_index(name='total_car_cost')
mySunBurst(
    df_car_cost,
    name=['car'],
    value='total_car_cost',
    title='Total cost distribution across different car models',
)
myPlot1(
    data=df_car_cost,
    xs='car',
    ys='total_car_cost',
    clr='total_car_cost',
    plotType='bar',
    # Title now matches the sunburst above (it previously said "cars").
    title='Total cost distribution across different car models',
    sort_by=['total_car_cost'],
    ascending=[True],
)

In [None]:
#### Total cost distribution across different locations

In [54]:
# Sum cost per location, then show it as a sunburst and as a bar chart ordered by ascending total.
df_location_cost = (
    df.groupby(['location'])['cost']
    .sum()
    .rename('total_location_cost')
    .reset_index()
)
mySunBurst(
    data=df_location_cost,
    name=['location'],
    value='total_location_cost',
    title='Total cost distribution across different locations',
)
myPlot1(
    df_location_cost,
    'location',
    'total_location_cost',
    'total_location_cost',
    'bar',
    'Total cost distribution across different locations',
    sort_by=['total_location_cost'],
    ascending=[True],
)

In [None]:
#### Total cost distribution across different corporates

In [55]:
# Sum cost per corporate, then show it as a sunburst and as a bar chart ordered by ascending total.
df_corporate_cost = df.groupby(['corporate'])['cost'].sum().reset_index(name='total_corporate_cost')
mySunBurst(
    df_corporate_cost,
    name=['corporate'],
    value='total_corporate_cost',
    title='Total cost distribution across different corporates',
)
myPlot1(
    data=df_corporate_cost,
    xs='corporate',
    ys='total_corporate_cost',
    clr='total_corporate_cost',
    plotType='bar',
    # Title now matches the sunburst above (it previously said "corporate").
    title='Total cost distribution across different corporates',
    sort_by=['total_corporate_cost'],
    ascending=[True],
)

In [None]:
#### Total cost distribution across different delivered by persons

In [56]:
# Sum cost per 'delivered by' person, then show it as a sunburst and as a bar chart ordered by ascending total.
df_deliveredBy_cost = (
    df.groupby(['delivered by'])['cost']
    .sum()
    .rename('total_deliveredBy_cost')
    .reset_index()
)
mySunBurst(
    data=df_deliveredBy_cost,
    name=['delivered by'],
    value='total_deliveredBy_cost',
    title='Total cost distribution across different delivered by persons',
)
myPlot1(
    df_deliveredBy_cost,
    'delivered by',
    'total_deliveredBy_cost',
    'total_deliveredBy_cost',
    'bar',
    'Total cost distribution across different delivered by persons',
    sort_by=['total_deliveredBy_cost'],
    ascending=[True],
)

In [None]:
#### Total cost distribution across different returned by persons

In [57]:
# Sum cost per 'returned by' person, then show it as a sunburst and as a bar chart ordered by ascending total.
df_returnedBy_cost = (
    df.groupby(['returned by'])['cost']
    .sum()
    .rename('total_returnedBy_cost')
    .reset_index()
)
mySunBurst(
    data=df_returnedBy_cost,
    name=['returned by'],
    value='total_returnedBy_cost',
    title='Total cost distribution across different returned by persons',
)
myPlot1(
    df_returnedBy_cost,
    'returned by',
    'total_returnedBy_cost',
    'total_returnedBy_cost',
    'bar',
    'Total cost distribution across different returned by persons',
    sort_by=['total_returnedBy_cost'],
    ascending=[True],
)

In [None]:
#### Total cost distribution across returners who differ from the deliverer ('returned by' != 'delivered by')

In [58]:
# Rows where the 'returned by' person differs from the 'delivered by' person.
# NOTE: despite the `deliveredByOthers_` prefix, these frames hold *returned by others*
# records; the variable names are kept because the next cell reads deliveredByOthers_combined.
# The mask is built once (it was previously computed twice with `== False`), and columns are
# accessed with [] so a missing column raises KeyError instead of df.get() silently yielding None.
returnedByOthers_mask = df['returned by'] != df['delivered by']
deliveredByOthers_RB = df.loc[returnedByOthers_mask, 'returned by'].reset_index(name='returnedByOthers')
deliveredByOthers_CS = df.loc[returnedByOthers_mask, 'cost'].reset_index(name='returnedByOthers_cost')
# Both frames come from the same mask and carry a fresh 0..n-1 index, so joining on the index
# pairs each returner with the cost of the same original row; the saved original-index
# columns ('index_caller' from the left frame, 'index' from the right) are then dropped.
deliveredByOthers_combined = (
    deliveredByOthers_RB
    .join(deliveredByOthers_CS, lsuffix='_caller')
    .drop(columns=['index_caller', 'index'])
)
deliveredByOthers_cost = (
    deliveredByOthers_combined
    .groupby(['returnedByOthers'])['returnedByOthers_cost']
    .sum()
    .reset_index(name='total_returnedByOthers_cost')
)
mySunBurst(
    deliveredByOthers_cost,
    name=['returnedByOthers'],
    value='total_returnedByOthers_cost',
    title='Total cost distribution across different returned by other persons',
)
myPlot1(
    data=deliveredByOthers_cost,
    xs='returnedByOthers',
    ys='total_returnedByOthers_cost',
    clr='total_returnedByOthers_cost',
    plotType='bar',
    title='Total cost distribution across different returned by other persons',
    sort_by=['total_returnedByOthers_cost'],
    ascending=[True],
)

In [None]:
#### Record count distribution across returners who differ from the deliverer ('returned by' != 'delivered by')

In [59]:
# Count the records per 'returnedByOthers' person.
# rename_axis('index') pins the name of the label column: after reset_index() pandas < 2.0
# calls it 'index', whereas pandas >= 2.0 would call it 'returnedByOthers' and the 'index'
# lookups in the plots below would fail.
deliveredByOthers_count = (
    deliveredByOthers_combined['returnedByOthers']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='total_returnedByOthers_count')
)
mySunBurst(
    deliveredByOthers_count,
    name=['index'],
    value='total_returnedByOthers_count',
    title='Count distribution across different returned by other persons',
)
myPlot1(
    data=deliveredByOthers_count,
    xs='index',
    ys='total_returnedByOthers_count',
    clr='total_returnedByOthers_count',
    plotType='bar',
    # Trailing space removed so the rendered title no longer contains a double space.
    title='Count distribution across different returned by other persons',
    sort_by=['total_returnedByOthers_count'],
    ascending=[True],
)

In [None]:
#### Total cost of each Damage type for each Cost Category

In [61]:
# Sum cost per (cost_category, damage type) pair and show it as a two-level sunburst.
df_cost_damage = (
    df.groupby(['cost_category', 'damage type'])['cost']
    .sum()
    .rename('total_damage_cost')
    .reset_index()
)
mySunBurst(
    data=df_cost_damage,
    name=['cost_category', 'damage type'],
    value='total_damage_cost',
    title='Total cost of each damage type for each cost category',
)

In [None]:
#### Total cost of each year for each month name

In [62]:
# Sum cost per (yearReady, monthNReady) pair and show it as a two-level sunburst.
df_year_month_cost = (
    df.groupby(['yearReady', 'monthNReady'])['cost']
    .sum()
    .rename('total_year_month_cost')
    .reset_index()
)
mySunBurst(
    data=df_year_month_cost,
    name=['yearReady', 'monthNReady'],
    value='total_year_month_cost',
    title='Total cost of each year for each month name',
)