In [1]:
# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly
from ydata_profiling import  ProfileReport
import PyQt5 as qt
from IPython.display import display, Markdown
# Enable inline rendering of matplotlib figures inside the Jupyter notebook.
# PyQt5 is imported above; install it first if it is missing: pip install PyQt5
get_ipython().run_line_magic('matplotlib', 'inline')
# Last expression of the cell: displays the name of the active matplotlib backend.
matplotlib.get_backend()

'module://matplotlib_inline.backend_inline'

In [3]:
# Load Data

In [4]:
# Read the maintenance workbook into the DataFrame used by every analysis cell below.
df = pd.read_excel(io='maintenance_cleaned_extended.xlsx')

In [5]:
# Visualization Functions

In [6]:
## Bar, Scatter, Line charts

In [7]:
def myPlot(data,plotType,title):
    """Plot a value-sorted series as a bar, scatter, or line chart.

    The input is sorted by value in ascending order; its index (cast to str)
    is used for the x-axis and its values for the y-axis. The figure is shown
    immediately and nothing is returned.

    Parameters
    ----------
    data : pandas.Series
        Values to plot. ``sort_values`` is called without ``by``, so a Series
        is expected.
    plotType : str
        One of ``'bar'``, ``'scatter'`` or ``'line'``.
    title : str
        Chart title; ``' Analysis'`` is appended to it.

    Raises
    ------
    ValueError
        If ``plotType`` is not one of the supported chart types.
    """
    # Validate first: without this an unknown plotType left `fig` unassigned
    # and the function failed later with an opaque UnboundLocalError.
    supported = ('bar', 'scatter', 'line')
    if plotType not in supported:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected one of {supported}"
        )
    data = data.sort_values(ascending=True)
    xs = data.index.astype(str)
    ys = data.values
    if plotType == 'bar':
        fig = px.bar(data_frame=data, x = xs, y = ys,color=ys,title=title+' Analysis')
    elif plotType == 'scatter':
        fig = px.scatter(data_frame=data, x = xs, y = ys,color=ys,title=title+' Analysis')
    else:  # 'line' (no color mapping for line charts)
        fig = px.line(data_frame=data, x = xs, y = ys,title=title+' Analysis')
    fig.update_layout(title_x=0.45)
    fig.show()

In [8]:
def myPlot1(data, xs, ys, clr, plotType, title, sort_by=None, ascending=True):
    """Plot two DataFrame columns as a bar, scatter, or line chart.

    The figure is shown immediately and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the columns to plot.
    xs, ys : column labels
        Columns used for the x- and y-axis; their string form is also used
        as the axis titles.
    clr : column label or None
        Column used for color grouping (its values are cast to str). Any
        falsy value disables coloring.
    plotType : str
        One of ``'bar'``, ``'scatter'`` or ``'line'``.
    title : str
        Chart title; ``' Analysis'`` is appended to it.
    sort_by : label or list of labels, optional
        If given, ``data`` is sorted by it before plotting.
    ascending : bool, default True
        Sort direction, only used when ``sort_by`` is given.

    Raises
    ------
    ValueError
        If ``plotType`` is not one of the supported chart types.
    """
    # Validate first: without this an unknown plotType left `fig` unassigned
    # and the function failed later with an opaque UnboundLocalError.
    supported = ('bar', 'scatter', 'line')
    if plotType not in supported:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected one of {supported}"
        )
    if sort_by is not None:
        data_sorted = data.sort_values(by=sort_by, ascending=ascending)
    else:
        data_sorted = data
    # Capture the column labels for the axis titles before xs/ys are rebound
    # to the actual column data.
    xt = str(xs)
    yt = str(ys)
    xs = data_sorted[xs]
    ys = data_sorted[ys]
    clr = data_sorted[clr].astype(str) if clr else None
    if plotType == 'bar':
        fig = px.bar(x=xs, y=ys, color=clr, title=title + ' Analysis')
    elif plotType == 'scatter':
        fig = px.scatter(x=xs, y=ys, color=clr, title=title + ' Analysis')
    else:  # 'line'
        fig = px.line(x=xs, y=ys, color=clr, title=title + ' Analysis')
    fig.update_layout(title_x=0.5, xaxis_title=xt, yaxis_title=yt)
    fig.show()

In [9]:
def myPlot2(data, plotType, title):
    """Plot a series in its existing order as a bar, scatter, or line chart.

    The index of ``data`` (cast to str) is used for the x-axis and its values
    for the y-axis; no sorting is applied. The figure is shown immediately
    and nothing is returned.

    Parameters
    ----------
    data : object exposing ``.index`` and ``.values``
        Presumably a pandas Series - confirm against callers.
    plotType : str
        One of ``'bar'``, ``'scatter'`` or ``'line'``.
    title : str
        Chart title; ``' Analysis'`` is appended to it.

    Raises
    ------
    ValueError
        If ``plotType`` is not one of the supported chart types.
    """
    # Validate first: without this an unknown plotType left `fig` unassigned
    # and the function failed later with an opaque UnboundLocalError.
    supported = ('bar', 'scatter', 'line')
    if plotType not in supported:
        raise ValueError(
            f"Unsupported plotType {plotType!r}; expected one of {supported}"
        )
    xs = data.index.astype(str)  # Index (x-axis)
    ys = data.values             # Values (y-axis)
    if plotType == 'bar':
        fig = px.bar(x=xs, y=ys, color=ys, title=title + ' Analysis')
    elif plotType == 'scatter':
        fig = px.scatter(x=xs, y=ys, color=ys, title=title + ' Analysis')
    else:  # 'line' (no color mapping for line charts)
        fig = px.line(x=xs, y=ys, title=title + ' Analysis')
    # Center the title
    fig.update_layout(title_x=0.5)
    fig.show()

In [10]:
def myBoxPlot(data,x,y,color,title):
    """Show a Plotly Express box plot of ``y`` against ``x``.

    ``color`` is forwarded to ``px.box`` as the color grouping, ``title`` is
    used unchanged as the chart title (centered), and the axis titles are the
    string forms of ``x`` and ``y``. Nothing is returned.
    """
    axis_titles = {'xaxis_title': str(x), 'yaxis_title': str(y)}
    box_fig = px.box(data, x=x, y=y, color=color, title=title)
    box_fig.update_layout(title_x=0.5, **axis_titles)
    box_fig.show()

In [11]:
## Sunburst chart

In [12]:
def mySunBurst(data, name, value, title):
    """Show a Plotly Express sunburst chart.

    Parameters
    ----------
    data : table passed to ``px.sunburst`` as ``data_frame``.
    name : hierarchy passed as ``path``,
        e.g. ``['cost_category', 'damage type']``.
    value : passed as ``values`` (the size of each sector).
    title : chart title; ``' Analysis'`` is appended to it.
    """
    chart_title = title+' Analysis'
    sunburst_fig = px.sunburst(data_frame=data, path=name, values=value, title=chart_title)
    sunburst_fig.update_layout(title_x=0.45)
    sunburst_fig.show()

In [13]:
## Pie chart

In [14]:
def myPie(data,title_prefix):
    """Show a pie chart whose slices are the entries of ``data``.

    Slice names come from ``data.index`` and slice sizes from ``data.values``.
    The title is always 'Top 5 <title_prefix> Analysis'; the function itself
    does not limit ``data`` to five entries, so callers are expected to pass
    an already-trimmed series.
    """
    pie_fig = px.pie(
        data_frame=data,
        names=data.index,
        values=data.values,
        title='Top 5 '+ title_prefix +' Analysis',
    )
    pie_fig.update_layout(title_x=0.5)
    pie_fig.show()

In [15]:
## Combine DataFrames

In [16]:
def combine(data,first_field,first_field_count,field_grouped_on,resulting_field_value):
    """Build a per-category table holding both a sum and an occurrence count.

    For every distinct value of ``first_field`` the result has the sum of
    ``field_grouped_on`` (column ``resulting_field_value``) and the number of
    rows with that value (column ``first_field_count``).

    Returns
    -------
    tuple
        ``(first_field_count, resulting_field_value, merged_frame)`` - the two
        column names are echoed back alongside the merged DataFrame.
    """
    # Sum of the grouped field per category.
    totals = (
        data.groupby([first_field])[field_grouped_on]
        .sum()
        .reset_index(name=resulting_field_value)
    )
    # Number of rows per category, with explicit column names.
    occurrences = data[first_field].value_counts().reset_index()
    occurrences.columns = [first_field, first_field_count]
    merged = totals.merge(occurrences, on=first_field)
    return first_field_count, resulting_field_value, merged

In [17]:
## Bivariate Analysis

In [18]:
### Cost Category Insights

In [19]:
#### Distribution of damage types across different cost categories

In [67]:
# Count the services for every (damage type, cost_category) pair, least frequent first.
damageType_costCategory = (
    df.groupby(['damage type', 'cost_category'])
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count')
)
myPlot1(
    data=damageType_costCategory,
    xs='damage type',
    ys='Count',
    clr='cost_category',
    plotType='bar',
    title='Distribution of damage types across different cost categories',
    sort_by=None,
    ascending=True,
)
# Row of the pair with the highest count (idxmax returns its index label).
Maximum_row = damageType_costCategory.loc[
    damageType_costCategory['Count'].idxmax()
]
print('The damage type that has maximum service count is :', Maximum_row['damage type'])
print('The Cost Category that has maximum service count is :', Maximum_row['cost_category'])
print('The maximum service count is :', Maximum_row['Count'])

The damage type that has maximum service count is : غيار زيت
The Cost Category that has maximum service count is : 0001:0050
The maximum service count is : 496


In [21]:
#### Relationship between cost categories and car models

In [66]:
# Count the services for every (car, cost_category) pair, ordered by car for plotting.
car_costCategory = df.groupby(['car','cost_category']).size().reset_index(name='Count').sort_values(by=['car'])
myPlot1(car_costCategory,'car','Count','cost_category','bar','Relationship between cost categories and car models', sort_by=None, ascending=True)
# Same table as car_costCategory: take an independent copy instead of running
# the identical groupby a second time. The name is kept in case other cells use it.
car_costCategoryMax = car_costCategory.copy()
# Row of the pair with the highest count (idxmax returns its index label).
Maximum_row = car_costCategory.loc[car_costCategory['Count'].idxmax()]
print('The car that has maximum service count is :',Maximum_row['car'])
print('The Cost Category that has maximum service count is :',Maximum_row['cost_category'])
print('The maximum service count is :',Maximum_row['Count'])

The car that has maximum service count is : FORTUNER
The Cost Category that has maximum service count is : 0001:0050
The maximum service count is : 101


In [23]:
#### Comparison of service duration across different cost categories

In [68]:
# One box per cost category, showing the spread of service durations.
myBoxPlot(
    data=df,
    x='cost_category',
    y='service_duration',
    color='cost_category',
    title='Comparison of service duration across different cost categories',
)
# Single record with the longest service duration (idxmax returns its index label).
Maximum_row = df.loc[
    df['service_duration'].idxmax()
]
print('The cost category that has maximum service duration is :', Maximum_row['cost_category'])
print('The maximum service duration is :', Maximum_row['service_duration'])

The cost category that has maximum service duration is : 2000:3000
The maximum service duration is : 70


In [25]:
#### Cost category breakdown by location

In [69]:
# Count the services for every (location, cost_category) pair, ordered by location.
location_costCategory = (
    df.groupby(['location', 'cost_category'])
    .size()
    .reset_index(name='Count')
    .sort_values(by=['location'])
)
myPlot1(
    data=location_costCategory,
    xs='location',
    ys='Count',
    clr='cost_category',
    plotType='bar',
    title='Cost category breakdown by location',
    sort_by=None,
    ascending=True,
)
# Row of the pair with the highest count (idxmax returns its index label).
Maximum_row = location_costCategory.loc[
    location_costCategory['Count'].idxmax()
]
print('The location that has maximum service count is :', Maximum_row['location'])
print('The Cost Category that has maximum service count is :', Maximum_row['cost_category'])
print('The maximum service count is :', Maximum_row['Count'])

The location that has maximum service count is : الغزاوي
The Cost Category that has maximum service count is : 0001:0050
The maximum service count is : 385


In [27]:
#### Trends in cost categories over time

In [71]:
# Build a sortable 'YYYY_MM' label from the ready date.
# strftime is used instead of concatenating .dt.year/.dt.month strings: when any
# date is missing, those components become floats and the old labels degraded
# to values like '2015.0_9.0' and 'nan_nan'. With strftime a missing date yields
# a missing label, which groupby leaves out of the trend.
df['year_month'] = df['date ready'].dt.strftime('%Y_%m')
# Count the services for every (year_month, cost_category) pair in chronological order.
date_costCategory = df.groupby(['year_month','cost_category']).size().reset_index(name='Frequency').sort_values(by='year_month')
myPlot1(date_costCategory,'year_month','Frequency','cost_category','line','Trends in cost categories over time', sort_by=None, ascending=True)
# Row of the pair with the highest frequency (idxmax returns its index label).
Maximum_row = date_costCategory.loc[date_costCategory['Frequency'].idxmax()]
print('The year and month that has maximum cost category frequency is :',Maximum_row['year_month'])
print('The Cost Category that has maximum frequency is :',Maximum_row['cost_category'])
print('The maximum frequency is :',Maximum_row['Frequency'])

The year and month that has maximum cost category frequency is : 2015_09
The Cost Category that has maximum frequency is : 0001:0050
The maximum frequency is : 85
