In [1]:
def dataset(directory, id):
  image_url = f'https://drive.google.com/uc?id={id}'
  zip_path = '/content/dataset.zip'

  # Directory to extract the ZIP contents
  extract_to = f'/content/{directory}/'

  # Download the ZIP file
  gdown.download(image_url, zip_path, quiet=False)

  # Extract ZIP file
  if os.path.exists(zip_path):
      try:
          with zipfile.ZipFile(zip_path, 'r') as zip_ref:
              zip_ref.extractall(extract_to)
          print("Extraction was successful")
          os.remove(zip_path)
      except zipfile.BadZipFile:
          print("Failed to extract: the downloaded file is not a ZIP file or is corrupt")
  else:
      print("File does not exist, check the download URL and process.")

In [None]:
img_id = '1WN25VCfedbXBQWxCQrgbZHUrrvPmB6Ii' #Google drive id for Image Dataset

#Downloading dataset
dataset('dataset', img_id)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/dataset


In [None]:
import pandas as pd

pd.set_option('display.max_columns', None)

df = pd.read_csv('cars.csv')
df.head()

Unnamed: 0,ID,Name,Price,Brand,Model,Variant,Series,Year,Kilometers,Type,Gearbox,Fuel,Status,CC,Color,Seating Capacity
0,12849595,2004 Holden Rodeo LT (4X4) RA,11000,Holden,Rodeo,LT (4X4),RA,2004,270000,Crew Cab Pickup,Manual,Unleaded Petrol,Used,3494,White,5
1,12776591,2002 Toyota Corolla Ascent Seca ZZE122R,5000,Toyota,Corolla,Ascent Seca,ZZE122R,2002,987475,Hatchback,Manual,Unleaded Petrol,Used,1794,Black,5
2,12751248,2008 Toyota Hiace Slwb KDH221R MY07 Upgrade,15999,Toyota,Hiace,Slwb,KDH221R MY07 Upgrade,2008,634470,Van,Manual,Diesel,Used,2982,White,3
3,12815698,2009 Ford Falcon XT (lpg) FG,1600,Ford,Falcon,XT (lpg),FG,2009,598000,Sedan,Automatic,Liquid Petroleum Gas,Used,3984,White,5
4,12776183,2014 Toyota Camry Hybrid H AVV50R,13000,Toyota,Camry,Hybrid H,AVV50R,2014,583000,Sedan,Automatic,Unleaded Petrol/Electric,Used,2494,White,5


In [None]:
df.isna().sum()

ID                  0
Name                0
Price               0
Brand               0
Model               0
Variant             0
Series              0
Year                0
Kilometers          0
Type                0
Gearbox             0
Fuel                0
Status              0
CC                  0
Color               0
Seating Capacity    0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17024 entries, 0 to 17023
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                17024 non-null  int64 
 1   Name              17024 non-null  object
 2   Price             17024 non-null  int64 
 3   Brand             17024 non-null  object
 4   Model             17024 non-null  object
 5   Variant           17024 non-null  object
 6   Series            17024 non-null  object
 7   Year              17024 non-null  int64 
 8   Kilometers        17024 non-null  int64 
 9   Type              17024 non-null  object
 10  Gearbox           17024 non-null  object
 11  Fuel              17024 non-null  object
 12  Status            17024 non-null  object
 13  CC                17024 non-null  int64 
 14  Color             17024 non-null  object
 15  Seating Capacity  17024 non-null  int64 
dtypes: int64(6), object(10)
memory usage: 2.1+ MB


In [None]:
impCols = ['Price', 'Kilometers', 'CC', 'Seating Capacity']

df[impCols].describe()

Unnamed: 0,Price,Kilometers,CC,Seating Capacity
count,17024.0,17024.0,17024.0,17024.0
mean,36717.126703,103223.425458,2492.577714,5.115895
std,30270.016189,77896.698421,882.374677,1.122166
min,1000.0,1.0,875.0,2.0
25%,18800.0,44726.5,1987.0,5.0
50%,29990.0,88557.5,2354.0,5.0
75%,45988.0,148931.0,2982.0,5.0
max,999000.0,987475.0,7300.0,14.0


Standard Deviation Kilometers and Price is very large so from this we can conclude that they have considerable variability or dispersion in the data.
Whereas CC has relatively less Standard Deviation and Seating Capacity has the least of 1.12

In [None]:
import plotly.express as px

color_counts = df['Color'].value_counts().reset_index().head(10)
color_counts.columns = ['Color', 'Count']

# Create a bar chart for car colors
fig = px.bar(
    color_counts,
    x='Color',
    y='Count',
    title='Distribution of Car Colors',
    labels={'Color': 'Car Color', 'Count': 'Count'},
)

# Show the chart
fig.show()

In [None]:
status_counts = df['Status'].value_counts()

# Create a Pie chart using Plotly
fig = px.pie(
    status_counts,
    names=status_counts.index,
    values=status_counts.values,
    title="Distribution of Car Status"
)

fig.update_layout(
    width = 700,
    height = 600
)

# Show the chart
fig.show()

In [None]:
gearbox_counts = df['Gearbox'].value_counts()

# Create a bar plot using Plotly
fig = px.bar(
    x=gearbox_counts.index,
    y=gearbox_counts.values,
    title="Distribution of Gearbox Types",
    labels={'x': 'Gearbox Type', 'y': 'Count'}
)

fig.update_layout(
    width = 700,
    height = 600
)

# Show the chart
fig.show()

In [None]:
import plotly.graph_objs as go

# Calculate mean and median
mean_price = df['Price'].mean()
median_price = df['Price'].median()

# Create a histogram trace
histogram_trace = go.Histogram(x=df['Price'], nbinsx=20, name='Price Rate Distribution')

# Create mean and median lines
mean_line = go.Scatter(x=[mean_price, mean_price], y=[0, df.shape[0]], mode='lines', name=f'Mean: {mean_price:.2f}', line=dict(color='red', dash='dash'))
median_line = go.Scatter(x=[median_price, median_price], y=[0, df.shape[0]], mode='lines', name=f'Median: {median_price:.2f}', line=dict(color='green', dash='dash'))

# Create the layout
layout = go.Layout(title='Price Rate Distribution', xaxis=dict(title='Price Rate'), yaxis=dict(title='Frequency'))

# Create the figure
fig = go.Figure(data=[histogram_trace, mean_line, median_line], layout=layout,)

fig.update_layout(width = 800, height = 600)

# Show the plot
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

df_copy = df.copy()

# Group the data by 'Brand' and average the price
df_copy['Brand_Model'] = df_copy['Brand'] + ' ' + df_copy['Model']
top_brand_models = df_copy['Brand_Model'].value_counts().head(10).reset_index()
top_brand_models.columns = ['Brand_Model', 'Count']

# Group the data by 'Brand_Model' and count the number of models
fig = px.bar(top_brand_models, x='Brand_Model', y='Count', labels={'Brand_Model': 'Brand and Model', 'Count': 'Number of Models'},
             title='Number of Car Models by Top 10 Brand and Model Combinations')
fig.update_layout(
    width=800,  # Set the width
    height=600  # Set the height
)
fig.show()

In [None]:
fig = px.scatter(df, y='Kilometers', x='Price', labels={'Kilometers': 'Kilometers', 'Price': 'Price'},
                 title='Scatterplot between Price and Kilometers', color = 'Year', opacity = 0.5)
fig.update_layout(
    width = 800,
    height = 600
)
fig.show()

In [None]:
fig = px.box(
    df,
    x='Fuel',
    y='Price',
    title='Distribution of Prices by Fuel Type',
    labels={'Fuel': 'Fuel Type', 'Price': 'Price'},
    color='Fuel'
)

fig.update_layout(showlegend = False)

# Show the chart
fig.show()

In [None]:
seating_capacity_avg_price = df.groupby('Seating Capacity')['Price'].mean().reset_index()

# Create a bar plot for seating capacity vs. average price
fig = px.bar(
    seating_capacity_avg_price,
    x='Seating Capacity',
    y='Price',
    title='Average Car Price by Seating Capacity',
    labels={'Seating Capacity': 'Seating Capacity', 'Price': 'Average Price'},
)

# Show the chart
fig.show()

In [None]:
avg_price_df = df.groupby('Year')['Price'].mean().reset_index()
fig = px.line(avg_price_df, y='Price', x='Year',
                 title='Lineplot between Price and Year')
fig.update_layout(
    width = 800,
    height = 600
)
fig.show()

In [None]:
df_1989 = df_copy[df_copy['Year'] == 1989]
df_1989

Unnamed: 0,ID,Name,Price,Brand,Model,Variant,Series,Year,Kilometers,Type,Gearbox,Fuel,Status,CC,Color,Seating Capacity,Brand_Model
501,12756153,1989 Holden Commodore Berlina VN,129990,Holden,Commodore,Berlina,VN,1989,275000,Sedan,Automatic,Unleaded Petrol,Used,4987,Green,5,Holden Commodore
5226,12810355,1989 Toyota Landcruiser (4X4) HJ75RP,38913,Toyota,Landcruiser,(4X4),HJ75RP,1989,132918,Cab Chassis,Manual,Diesel,Used,3980,White,3,Toyota Landcruiser


In [None]:
import plotly.graph_objects as go

df_89 = df_1989[['Brand_Model', 'Price', 'Kilometers', 'CC', 'Seating Capacity']]
df_89 = df_89.set_index('Brand_Model')
for col in df_89:
  df_89[col] = df_89[col].apply(lambda x :x / max(df_89[col]))
angles = [i / float(len(df_89.columns)) * 360 for i in range(len(df_89.columns))]
values0 = df_89.iloc[0].tolist()
values0 += values0[:1]
values1 = df_89.iloc[1].tolist()
values1 += values1[:1]

trace0 = go.Scatterpolar(
    r=values0,
    theta=angles,
    fill='toself',
    name=df_89.index[0]
)
trace1 = go.Scatterpolar(
    r=values1,
    theta=angles,
    fill='toself',
    name=df_89.index[1]
)
layout = go.Layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1],
            showticklabels = False
        ),
    angularaxis=dict(
            tickvals=angles,
            ticktext=df_89.columns,
            direction='clockwise'
        )
    ),
    title=f'Car comparison of 1989'
)

fig = go.Figure(data=[trace0, trace1], layout=layout)
fig.update_layout(
    polar=dict(
        radialaxis=dict(visible=True)
    ),
    width=700,  # set width to 400 pixels
    height=500  # set height to 400 pixels
)
fig.show()

In [None]:
features = df[impCols]

# Calculate the correlation matrix
correlation_matrix = features.corr(numeric_only=True)

fig = px.imshow(correlation_matrix,
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='earth',
                zmin=-1, zmax=1)

# Customize the heatmap
fig.update_layout(
    title='Correlation Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=600,  # Adjust the width as needed
    height=600  # Adjust the height as needed
)

for i in range(len(correlation_matrix.columns)):
    for j in range(len(correlation_matrix.columns)):
        fig.add_annotation(
            x=correlation_matrix.columns[i],
            y=correlation_matrix.columns[j],
            text=str(round(correlation_matrix.iloc[j, i], 2)),
            showarrow=False
        )

# Show the plot
fig.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

data = df.copy()
obj_features = ['Name', 'Price', 'Brand', 'Model', 'Variant', 'Series', 'Type', 'Gearbox', 'Fuel', 'Status', 'Color']

label_encoder = LabelEncoder()
for feature in obj_features:
  data[feature] = label_encoder.fit_transform(data[feature])

In [None]:
import pandas as pd
from scipy.stats import pearsonr

# Define the variables you want to assess
numerical_vars = ['Year', 'Kilometers', 'CC']

# Initialize empty dictionaries to store correlation results
correlation_results = {}

# Perform correlation tests for each variable
for var in numerical_vars:
    correlation, p_value = pearsonr(data[var], data['Price'])
    correlation_results[var] = {'correlation': correlation, 'p-value': p_value}

# Set your significance level (alpha)
alpha = 0.05

# Check the p-values and make a decision
for var, result in correlation_results.items():
    print(f"Test for {var}:")
    print(f"Correlation: {result['correlation']}")
    print(f"P-Value: {result['p-value']}")
    if result['p-value'] < alpha:
        print(f"{var} has a significant influence on car price.")
    else:
        print(f"{var} does not have a significant influence on car price.")
    print("\n")


Test for Year:
Correlation: 0.6760308366678005
P-Value: 0.0
Year has a significant influence on car price.


Test for Kilometers:
Correlation: -0.5997535143625465
P-Value: 0.0
Kilometers has a significant influence on car price.


Test for CC:
Correlation: 0.28037397963369776
P-Value: 5.232446879108647e-305
CC has a significant influence on car price.




In [None]:
from scipy import stats

categorical_vars = ['Name', 'Brand', 'Model', 'Variant', 'Series', 'Type', 'Gearbox', 'Fuel', 'Status', 'Color']

for var in categorical_vars:
  # ANOVA-test for model differences
  var_groups = data.groupby(var)
  results_var = stats.f_oneway(*[group['Price'] for name, group in var_groups])

  # Print the results
  print(f"ANOVA-test for {var} Differences:")
  print("F-statistic:", results_var.statistic)
  print("p-value:", results_var.pvalue)
  if results_var.pvalue < 0.05:
    print(f"There is a significant association between {var} and Car price.")
  else:
    print(f"There is no significant association between {var} and Car price.")
  print("\n")

ANOVA-test for Name Differences:
F-statistic: 43.515409145170935
p-value: 0.0
There is a significant association between Name and Car price.


ANOVA-test for Brand Differences:
F-statistic: 70.60304680487805
p-value: 0.0
There is a significant association between Brand and Car price.


ANOVA-test for Model Differences:
F-statistic: 34.464087994550084
p-value: 0.0
There is a significant association between Model and Car price.


ANOVA-test for Variant Differences:
F-statistic: 21.400096635753933
p-value: 0.0
There is a significant association between Variant and Car price.


ANOVA-test for Series Differences:
F-statistic: 44.40344867504379
p-value: 0.0
There is a significant association between Series and Car price.


ANOVA-test for Type Differences:
F-statistic: 82.99991389021136
p-value: 0.0
There is a significant association between Type and Car price.


ANOVA-test for Gearbox Differences:
F-statistic: 544.7015542565921
p-value: 1.295721244026918e-118
There is a significant associati