In [1]:
import sklearn
# import ucimlrepo
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as pl

---
# Access Dataset

### Download Dataset
We will access the dataset via. the URL. However, UCI does provide a Python library which you can use if desired: https://archive.ics.uci.edu/dataset/10/automobile

In [2]:
# Download the data to "automobile.zip"
import urllib
_, response = urllib.request.urlretrieve("https://archive.ics.uci.edu/static/public/10/automobile.zip", "automobile.zip")

In [3]:
# Unzip the data
from zipfile import ZipFile
with ZipFile("automobile.zip", "r") as zObj:
    zObj.extractall(path="automobile")

In [4]:
# Display the downloaded items
from pathlib import Path
print("[ Downloaded Files ]")
for idx, p in enumerate(Path("automobile").glob("*")):
    print(f"{idx}: {p}")

[ Downloaded Files ]
0: automobile/app.css
1: automobile/imports-85.names
2: automobile/imports-85.data
3: automobile/misc
4: automobile/Index


In [5]:
# Access the local data
fp = "automobile/imports-85.data"

# Data headers
header = [
    'symboling',
    'normalized-losses',
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-type',
    'num-of-cylinders',
    'engine-size',
    'fuel-system',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price'
]

# Read the CSV file from the URL and assign column names
df = pd.read_csv(fp, names=header, na_values='?')

# Display the first few rows of the DataFrame
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


## Access data directly via. URL

You can optionally access the data directly via. the provided URL.

In [None]:
# import pandas as pd

# url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

# # Read the CSV file from the URL and assign column names
# df = pd.read_csv(url, names=header, na_values='?')

# # Display the first few rows of the DataFrame
# df

---
# Breakout Session 1
Univariate Analysis to understand the distributions and central tendencies of individual features.

### Objective:​

To delve into the characteristics and distributions of individual features within the dataset.​

### Agenda:​

Data Preprocessing: pre-process the dataset to ensure quality and consistency by handling missing values.​

Visualise Feature Distributions: Utilise histograms, box plots, and other suitable plots to visualise feature distributions and discuss their central tendencies (mean, median, mode).​

Descriptive Statistics: Generate summary statistics to provide insights into the numerical attributes of the dataset by highlighting key metrics such as mean, standard deviation, and quartiles.​

Key Insights and Preliminary Findings: Identify patterns, trends, and outliers within individual features and formulating preliminary findings.

### Deliverables:

Cleaned and pre-processed dataset ready for analysis.​

Descriptive statistics summary highlighting key numerical attributes.​

Visualizations showcasing feature distributions and central tendencies.​

Preliminary findings document outlining initial observations and potential areas for exploration.​

In [None]:
# Clean the dataset to handle missing values and ensure consistency (see additional instructions at the end of this task).
...

In [None]:
# Familiarise yourself with the different features available in the dataset.
...

In [None]:
# Explore individual features to understand their distributions and central tendencies.
...

In [None]:
# Visualise the distributions using histograms, box plots, or other suitable plots.
...

---
# Breakout Session 2
Bivariate Analysis to investigate relationships between pairs of features and their impact on car prices.​

### Objective: ​

To investigate the relationships between different features and their impact on car prices​

### Agenda:​

Data Exploration Techniques: Review scatter plots to visualise relationships between pairs of features. Utilise box plots to identify variations in car prices across different feature categories.​

Correlation Analysis: Conduct correlation analysis to quantify the strength and direction of relationships between numerical features and car prices. Discuss the significance of correlation coefficients in understanding feature impacts.​

Findings on Feature Impact on Car Prices: Analyse scatter plots and box plots to discern patterns and trends in feature-price relationships. Discuss the impact of specific feature pairs on car prices, including both positive and negative correlations.​

Insights and Recommendations: Derive insights into the most influential features affecting car prices. Formulate recommendations for feature highlighting and pricing strategies based on the findings.

### Deliverables:

Scatter plots and box plots illustrating feature relationships with car prices.​

Correlation matrix highlighting the strength of associations between features.​

Discussion of findings regarding the impact of feature pairs on car prices.​

Recommendations for feature highlighting and pricing strategies based on the analysis.​

In [None]:
# Investigate relationships between pairs of features and their impact on car prices.
...

In [None]:
# Use scatter plots, box plots, or correlation analysis to explore these relationships.
...

---
# Breakout Session 3
Customised Plotting to create visualisations tailored to the dataset for enhanced interpretability.

### Objective:

To create tailored visualisations that effectively communicate insights from the dataset and enhance interpretability.​

### Agenda:​

Selecting Plot Types and Customisation Techniques: Review various plot types (e.g., bar charts, line plots, heatmaps) suitable for different data types and relationships. Demonstrate customisation techniques to enhance aesthetics and clarity of visualizations.​

Interpreting Visualizations: Discuss key insights derived from the visualisations. Analysing patterns, trends, and outliers revealed by the customized visualizations.​

### Deliverables:

Customised visualizations tailored to the dataset, including bar charts, line plots, scatter plots, and heatmaps.​

Aesthetic enhancements such as color coding, labeling, and annotations for clarity.​

Interpretation of insights gleaned from the visualizations, with a focus on actionable takeaways.​

In [None]:
# Create customised visualizations tailored to the dataset to enhance interpretability.
...

In [None]:
# Select appropriate plot types and customise their aesthetics for clarity.
...

---
# Breakout Session 4
Dimensionality Reduction & Key Findings to apply PCA for dimensionality reduction and summarise key insights.​

### Objective:

To identify key factors driving price variance using Principal Component Analysis (PCA).​

### Agenda:​

PCA Implementation and Interpretation: Apply PCA on the dataset to identify principal components. Interpret of PCA results, including variance explained by each component and loadings of original features.​

Key Insights from PCA Analysis: Discuss key insights derived from PCA, including dominant factors influencing price variance. Identify of significant features contributing to each principal component.​

Recommendations for Pricing Strategies: Formulate segmented pricing strategies based on identified factors influencing price variance. Discuss pricing adjustments and promotions tailored to different customer segments.​

### Deliverables:


In [None]:
# Apply Principal Component Analysis (PCA) to reduce the dimensionality of the dataset.
...

In [None]:
# Identify key factors influencing price variance using PCA.
...