In [1]:
import pandas as pd
import numpy as np
import plotly as px

# Load the dataset
df = pd.read_csv('/kaggle/input/used-car-price-prediction/car_web_scraped_dataset.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,name,year,miles,color,condition,price
0,Kia Forte,2022,"41,406 miles","Gray exterior, Black interior","No accidents reported, 1 Owner","$15,988"
1,Chevrolet Silverado 1500,2021,"15,138 miles","White exterior, Black interior","1 accident reported, 1 Owner","$38,008"
2,Toyota RAV4,2022,"32,879 miles","Silver exterior, Unknown interior","No accidents reported, 1 Owner","$24,988"
3,Honda Civic,2020,"37,190 miles","Blue exterior, Black interior","No accidents reported, 1 Owner","$18,998"
4,Honda Civic,2020,"27,496 miles","Black exterior, Black interior","No accidents reported, 1 Owner","$19,498"


In [2]:
# Check for missing values in each column
missing_values = df.isnull().sum()

# Display the missing values
missing_values

name         0
year         0
miles        0
color        0
condition    0
price        0
dtype: int64

In [3]:
# Drop rows with missing values
df_cleaned = df.dropna()

# Display the shape of the cleaned dataset
df_cleaned.shape

(2840, 6)

In [4]:
# Display summary statistics of numerical columns
summary_stats = df_cleaned.describe()

# Display the summary statistics
summary_stats

Unnamed: 0,year
count,2840.0
mean,2018.833803
std,3.557585
min,2000.0
25%,2017.0
50%,2020.0
75%,2021.0
max,2024.0


**How to Visualize the Distribution of Car Years?**

In [5]:
import plotly.express as px

# Visualize the distribution of car years
fig = px.histogram(df_cleaned, x='year', nbins=20, title='Distribution of Car Years')
fig.show()


**How to Visualize the Relationship between Mileage and Price?**

In [6]:
# Visualize the relationship between mileage and price
fig = px.scatter(df_cleaned, x='miles', y='price', title='Mileage vs. Price')
fig.show()


**How to Explore the Distribution of Car Conditions?**

In [7]:
# Visualize the distribution of car conditions
fig = px.pie(df_cleaned, names='condition', title='Distribution of Car Conditions')
fig.show()


**How to Analyze the Distribution of Car Colors?**

In [8]:
# Visualize the distribution of car colors
fig = px.bar(df_cleaned['color'].value_counts(), x=df_cleaned['color'].value_counts().index, y=df_cleaned['color'].value_counts().values, title='Distribution of Car Colors')
fig.show()


**How to Explore the Relationship Between Car Year and Price?**

In [9]:
# Remove dollar signs and commas from 'price' column and convert it to numeric
df_cleaned['price'] = pd.to_numeric(df_cleaned['price'].replace('[\$,]', '', regex=True), errors='coerce')

# Visualize the relationship between car year and price
fig = px.scatter(df_cleaned, x='year', y='price', trendline='ols', title='Car Year vs. Price')
fig.show()

**How to Examine the Distribution of Car Mileage?**

In [10]:
# Visualize the distribution of car mileage
fig = px.box(df_cleaned, y='miles', title='Distribution of Car Mileage')
fig.show()


**What is the Average Price of Cars Based on Condition?**

In [11]:
# Visualize the average price of cars based on condition
fig = px.bar(df_cleaned, x='condition', y='price', title='Average Price of Cars Based on Condition')
fig.show()


**How to Explore the Overall Distribution of Car Prices?**

In [12]:
# Visualize the overall distribution of car prices
fig = px.histogram(df_cleaned, x='price', nbins=30, title='Distribution of Car Prices')
fig.show()


**How to Investigate the Average Price Across Different Car Colors?**

In [13]:
# Visualize the average price across different car colors
fig = px.bar(df_cleaned.groupby('color')['price'].mean().reset_index(), x='color', y='price', title='Average Price Across Different Car Colors')
fig.show()


**How to Explore the Relationship Between Car Mileage and Price Based on Condition?**

In [14]:
# Visualize the relationship between car mileage and price based on condition
fig = px.scatter(df_cleaned, x='miles', y='price', color='condition', title='Mileage vs. Price Based on Condition')
fig.show()


**How to Explore the Top 10 Most Common Car Models?**

In [15]:
# Extract the top 10 most common car models
top_models = df_cleaned['name'].value_counts().head(10)

# Visualize the top 10 most common car models
fig = px.bar(top_models, x=top_models.index, y=top_models.values, title='Top 10 Most Common Car Models')
fig.show()


**How to Visualize the Spread of Prices for Each Car Model?**

In [16]:
# Visualize the spread of prices for each car model
fig = px.box(df_cleaned, x='name', y='price', title='Spread of Prices for Each Car Model')
fig.show()


**How to Explore the Distribution of Car Prices Across Different Years?**

In [17]:
# Visualize the distribution of car prices across different years
fig = px.box(df_cleaned, x='year', y='price', title='Distribution of Car Prices Across Different Years')
fig.show()


**How to Analyze the Distribution of Mileage for Each Car Condition?**

In [18]:
# Visualize the distribution of mileage for each car condition
fig = px.box(df_cleaned, x='condition', y='miles', title='Distribution of Mileage for Each Car Condition')
fig.show()


**How to Investigate the Relationship Between Car Age and Price?**

In [19]:
# Calculate the car age
df_cleaned['age'] = 2024 - df_cleaned['year']

# Visualize the relationship between car age and price
fig = px.scatter(df_cleaned, x='age', y='price', trendline='ols', title='Car Age vs. Price')
fig.show()


**How to Examine the Distribution of Car Prices for Each Condition?**

In [20]:
# Visualize the distribution of car prices for each condition
fig = px.box(df_cleaned, x='condition', y='price', title='Distribution of Car Prices for Each Condition')
fig.show()


**How to Explore the Relationship Between Car Condition and Mileage?**

In [21]:
# Visualize the relationship between car condition and mileage
fig = px.scatter(df_cleaned, x='miles', y='condition', color='condition', title='Relationship Between Car Condition and Mileage')
fig.show()


**How to Explore the Relationship Between Car Mileage and Price Using a 3D**

In [22]:
# Visualize the relationship between car mileage and price using a 3D scatter plot
fig = px.scatter_3d(df_cleaned, x='miles', y='price', z='condition', color='condition', title='3D Scatter Plot: Mileage vs. Price vs. Condition')
fig.show()


**How to Explore the Relationship Between Car Year and Mileage**

In [23]:
# Visualize the relationship between car year and mileage with a scatter plot
fig = px.scatter(df_cleaned, x='year', y='miles', color='condition', title='Scatter Plot: Car Year vs. Mileage')
fig.show()


**How to Visualize the Trend in Car Prices Over the Years?**

In [24]:
# Visualize the trend in car prices over the years
fig = px.line(df_cleaned.groupby('year')['price'].mean().reset_index(), x='year', y='price', title='Trend in Car Prices Over the Years')
fig.show()


**How to Explore the Spread of Mileage for Different Car Colors**

In [25]:
# Visualize the spread of mileage for different car colors with a scatter plot
fig = px.scatter(df_cleaned, x='color', y='miles', color='color', title='Scatter Plot: Mileage vs. Car Color')
fig.show()


**How to Analyze the Distribution of Prices for Each Car Model?**

In [26]:
# Visualize the distribution of prices for each car model
fig = px.histogram(df_cleaned, x='price', color='name', title='Distribution of Prices for Each Car Model')
fig.show()


**How to Explore the Relationship Between Mileage and Price Based on Car Colors?**

In [27]:
# Visualize the relationship between mileage and price based on car colors
fig = px.scatter(df_cleaned, x='miles', y='price', color='color', title='Scatter Plot: Mileage vs. Price by Car Color')
fig.show()


**How to Investigate the Average Price for Each Car Condition**

In [28]:
# Visualize the average price for each car condition using a bar chart
fig = px.bar(df_cleaned.groupby('condition')['price'].mean().reset_index(), x='condition', y='price', title='Average Price for Each Car Condition')
fig.show()