In [1]:
# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly
from ydata_profiling import  ProfileReport
import PyQt5 as qt
from IPython.display import display, Markdown
# Enable inline plotting inside Jupyter
#pip install PyQt5 #Install it if not installed
get_ipython().run_line_magic('matplotlib', 'inline')
matplotlib.get_backend()

'module://matplotlib_inline.backend_inline'

In [3]:
# Load Data

In [62]:
# Read the cleaned maintenance workbook (path is relative to the notebook's working directory).
df = pd.read_excel(io='maintenance_cleaned_extended.xlsx')

In [5]:
# Visualization Functions

In [6]:
## Bar, Scatter, Line charts

In [79]:
def myPlot(data,plotType,title):
    """Plot a pandas Series as a bar, scatter or line chart, sorted by value.

    Parameters
    ----------
    data : pandas.Series
        Series to plot. Its index (cast to str) becomes the x-axis and its
        values become the y-axis.
    plotType : str
        One of 'bar', 'scatter' or 'line'.
    title : str
        Title prefix; ' Analysis' is appended to it.

    Raises
    ------
    ValueError
        If ``plotType`` is not one of the supported chart types.
    """
    supported_types = ('bar', 'scatter', 'line')
    # Validate first so an unknown type fails with a clear message instead of
    # an unbound ``fig`` further down.
    if plotType not in supported_types:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected one of {supported_types}"
        )

    # Series.sort_values has no ``by`` argument (the previous ``by=''`` could
    # never succeed); a Series is simply ordered by its own values.
    data = data.sort_values(ascending=True)
    xs = data.index.astype(str)
    ys = data.values

    plot_kwargs = dict(data_frame=data, x=xs, y=ys, title=title+' Analysis')
    if plotType != 'line':
        # Bar and scatter charts are coloured by value; the line chart is not.
        plot_kwargs['color'] = ys

    # px.bar / px.scatter / px.line share the chart-type name.
    fig = getattr(px, plotType)(**plot_kwargs)
    fig.update_layout(title_x=0.45)
    fig.show()

In [5]:
def myPlot1(data,xs,ys,clr,plotType,title, sort_by=None, ascending=True):
    """Plot columns of a DataFrame as a bar, scatter or line chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    xs, ys : column labels
        Columns used for the x-axis (cast to str) and the y-axis. Their
        labels are also used as the axis titles.
    clr : column label
        Column used for colouring (cast to str).
    plotType : str
        One of 'bar', 'scatter' or 'line'.
    title : str
        Title prefix; ' Analysis' is appended to it.
    sort_by : column label or list of labels, optional
        When given, ``data`` is sorted by these columns before plotting.
    ascending : bool or list of bool, default True
        Sort direction(s), forwarded to ``DataFrame.sort_values``.

    Raises
    ------
    ValueError
        If ``plotType`` is not one of the supported chart types.
    """
    supported_types = ('bar', 'scatter', 'line')
    # Fail early with a clear message; previously an unknown type fell through
    # every branch and crashed on the unbound ``fig``.
    if plotType not in supported_types:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected one of {supported_types}"
        )

    if sort_by is None:
        data_sorted = data
    else:
        data_sorted = data.sort_values(by=sort_by, ascending=ascending)

    # Keep the column labels for the axis titles before the names are reused.
    x_title = str(xs)
    y_title = str(ys)
    x_values = data_sorted[xs].astype(str)
    y_values = data_sorted[ys]
    colour_values = data_sorted[clr].astype(str)

    # px.bar / px.scatter / px.line share the chart-type name.
    fig = getattr(px, plotType)(
        data_frame=data_sorted,
        x=x_values,
        y=y_values,
        color=colour_values,
        title=title+' Analysis',
    )
    fig.update_layout(title_x=0.5, xaxis_title=x_title, yaxis_title=y_title)
    fig.show()

In [6]:
def myBoxPlot(data,x,y,color,title):
    """Show a Plotly box plot of ``y`` grouped by ``x`` and coloured by ``color``.

    The title is used as given and centred; the axis titles are the string
    forms of ``x`` and ``y``.
    """
    layout_options = {
        'title_x': 0.5,
        'xaxis_title': str(x),
        'yaxis_title': str(y),
    }
    fig = px.box(data, x=x, y=y, color=color, title=title)
    fig.update_layout(**layout_options)
    fig.show()

In [7]:
## Sunburst chart

In [8]:
def mySunBurst(data, name, value, title):
    """Show a Plotly sunburst chart built from ``data``.

    ``name`` is forwarded as the sunburst ``path`` (the hierarchy levels,
    e.g. ``['cost_category', 'damage type']``) and ``value`` as the column
    that sizes the sectors. ' Analysis' is appended to ``title``.
    """
    chart_title = title+' Analysis'
    fig = px.sunburst(data_frame=data, path=name, values=value, title=chart_title)
    fig.update_layout(title_x=0.45)
    fig.show()

In [9]:
## Pie chart

In [10]:
def myPie(data,title_prefix):
    """Show a Plotly pie chart with ``data.index`` as slice names and ``data.values`` as sizes.

    The title is always 'Top 5 <title_prefix> Analysis'; this function does
    not truncate ``data`` itself, so callers are expected to pass the rows
    they want shown.
    """
    chart_title = 'Top 5 '+ title_prefix +' Analysis'
    fig = px.pie(
        data_frame=data,
        names=data.index,
        values=data.values,
        title=chart_title,
    )
    fig.update_layout(title_x=0.5)
    fig.show()

In [11]:
## Combine DataFrames

In [12]:
def combine(data,first_field,first_field_count,field_grouped_on,resulting_field_value):
    """Summarise ``data`` per ``first_field`` value: summed metric plus row count.

    Returns a tuple ``(first_field_count, resulting_field_value, merged)``
    where ``merged`` has the columns ``first_field``,
    ``resulting_field_value`` (sum of ``field_grouped_on`` per group) and
    ``first_field_count`` (number of rows per group).
    """
    # Occurrences of each category, relabelled positionally to the requested names.
    occurrences = (
        data[first_field]
        .value_counts()
        .reset_index()
        .set_axis([first_field, first_field_count], axis=1)
    )

    # Sum of the grouped field per category.
    grouped = data.groupby([first_field])[field_grouped_on]
    totals = grouped.sum().reset_index(name=resulting_field_value)

    merged = totals.merge(occurrences, on=first_field)
    return first_field_count, resulting_field_value, merged

In [13]:
## Bivariate Analysis

In [14]:
### Service Duration Analysis

In [15]:
#### What is the Average Service Duration by Damage Type

In [36]:
# Mean service duration per damage type, rounded to one decimal and sorted ascending.
avg_service_duration_by_damage = (
    df.groupby('damage type')['service_duration']
    .mean()
    .round(1)
    .sort_values()
    .reset_index(name='AverageServiceDuration')
)
mySunBurst(
    avg_service_duration_by_damage,
    name=['damage type'],
    value='AverageServiceDuration',
    title='Average Service Duration by Damage Type',
)

In [None]:
#### Damage Type vs. Service Duration

In [40]:
#### Which damage types tend to require the longest service durations?

In [38]:
# Distribution of service duration within each damage type.
myBoxPlot(
    df,
    x='damage type',
    y='service_duration',
    color='damage type',
    title='Service Duration distributed within each damage type',
)

In [None]:
#### Service duration by car model

In [46]:
# Distribution of service duration for each value of the 'car' column.
myBoxPlot(
    df,
    x='car',
    y='service_duration',
    color='car',
    title='Service duration by car model',
)

In [None]:
#### Average Service duration by location

In [96]:
# Mean service duration per location, rounded to one decimal.
# Stored under its own name so it no longer overwrites the damage-type
# aggregate `avg_service_duration_by_damage` built in an earlier cell.
# No explicit sort is needed here: myPlot1 orders the rows by
# 'AverageServiceDuration' before plotting.
avg_service_duration_by_location = (
    df.groupby('location')['service_duration']
    .mean()
    .round(1)
    .reset_index(name='AverageServiceDuration')
)
myPlot1(
    avg_service_duration_by_location,
    'location',
    'AverageServiceDuration',
    'location',
    'bar',
    'Average service duration by location',
    sort_by='AverageServiceDuration',
    ascending=True,
)

In [None]:
#### Service duration trends by year - month

In [107]:
# Mean service duration for every (yearReady, monthReady) pair, rounded to one decimal.
service_duration_by_dateReady = (
    df.groupby(['yearReady', 'monthReady'])['service_duration']
    .mean()
    .round(1)
    .sort_values()
    .reset_index(name='ServiceDurationDateReady')
)
# One line per year (colour), with rows ordered by month along the x-axis.
myPlot1(
    service_duration_by_dateReady,
    'monthReady',
    'ServiceDurationDateReady',
    'yearReady',
    'line',
    'Average Service duration trends by year - month',
    sort_by='monthReady',
    ascending=True,
)

In [None]:
#### Correlation between service duration and cost

In [100]:
#### Is there a correlation between the service duration and the cost of repair?

In [44]:
# Scatter of repair cost against service duration, coloured by duration.
fig = px.scatter(
    df,
    x='service_duration',
    y='cost',
    color='service_duration',
    title='Correlation between service duration and cost',
)
fig.update_layout(title_x=0.5)  # centre the title
fig.show()

In [None]:
#### Correlation between service duration and kilometers driven during repair period

In [108]:
# Scatter of service duration against the 'KMs Diff' column, coloured by 'KMs Diff'.
fig = px.scatter(
    df,
    x='KMs Diff',
    y='service_duration',
    color='KMs Diff',
    title='Correlation between service duration and kilometers driven during repair period',
)
fig.update_layout(title_x=0.5)  # centre the title
fig.show()

In [None]:
#### Fuel Diff vs. Service Duration

In [None]:
#### Does the fuel difference (amount of fuel used) relate to the service duration?

In [45]:
# Scatter of service duration against the 'Fuel Diff' column, coloured by duration.
fig = px.scatter(
    df,
    x='Fuel Diff',
    y='service_duration',
    color='service_duration',
    title='Fuel Difference vs. Service Duration',
)
fig.update_layout(title_x=0.5)  # centre the title
fig.show()

In [None]:
#### What is the average service duration by damage type and cost_category

In [39]:
# Mean service duration for every (damage type, cost_category) pair,
# rounded to one decimal and sorted ascending.
avg_service_duration_by_damage_cost_category = (
    df.groupby(['damage type', 'cost_category'])['service_duration']
    .mean()
    .round(1)
    .sort_values()
    .reset_index(name='AverageServiceDuration')
)

# Two-level hierarchy: damage type -> cost_category.
# The title now names both levels, matching the two charts below
# (it previously repeated the damage-type-only title).
# NOTE(review): sunburst sectors are sized by the averages, so a parent
# sector presumably reflects the sum of its children's averages — confirm
# this is the intended reading.
mySunBurst(
    avg_service_duration_by_damage_cost_category,
    name=['damage type', 'cost_category'],
    value='AverageServiceDuration',
    title='Average Service Duration by Damage Type and Cost Category',
)

# Same aggregate as a bar chart, ordered by damage type then by average duration.
myPlot1(
    data=avg_service_duration_by_damage_cost_category,
    xs='damage type',
    ys='AverageServiceDuration',
    clr='cost_category',
    plotType='bar',
    title='Average Service Duration by Damage Type and Cost Category',
    sort_by=['damage type','AverageServiceDuration'],
    ascending=[True, True]
)

# Same aggregate as a scatter chart, ordered by cost category then damage type.
myPlot1(
    data=avg_service_duration_by_damage_cost_category,
    xs='damage type',
    ys='AverageServiceDuration',
    clr='cost_category',
    plotType='scatter',
    title='Average Service Duration by Damage Type and Cost Category',
    sort_by=['cost_category', 'damage type'],
    ascending=[True, True])