In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw car-price CSV.
# NOTE(review): this is an absolute, machine-specific Windows path — consider a
# path relative to the notebook so the cell runs on other machines.
DATA_PATH = r"C:\Users\User\Downloads\IDS-SEM-PROJECT\car_price_prediction.csv"

# Read the raw data and show the frame as the cell output.
df = pd.read_csv(DATA_PATH)
df

In [None]:
# Render every float with two decimals in displayed tables, then show the
# summary statistics of the numeric columns.
pd.options.display.float_format = '{:.2f}'.format
df.describe()

In [None]:
# Normalise the column names and fix the dtypes of ID, Levy and Mileage.

# Column names: spaces become underscores. `regex=False` makes the literal
# replacement explicit instead of relying on the pandas-version-dependent default.
df.columns = df.columns.str.replace(' ', '_', regex=False)

# ID is cast to text (str).
df['ID'] = df['ID'].astype(str)

# Levy: every '-' is turned into '0' before the integer cast.
# NOTE(review): presumably '-' marks a missing levy in the raw file — confirm
# against the data source, since this treats it as a levy of 0.
df['Levy'] = (
    df['Levy']
    .astype(str)
    .str.replace('-', '0', regex=False)
    .astype(np.int64)
)

df['Mileage'] = df['Mileage'].astype(np.int64)

# Show the renamed columns as the cell output.
df.columns

In [506]:
# Statistical calculations: one row per quantity in `indices`, one column per
# descriptive statistic.
indices = ['Price', 'Levy', 'Cylinders', 'Airbags']

# Select the sub-frame once instead of re-slicing `df` for every statistic.
selected = df[indices]

# Every entry is an index-labelled Series, so the columns are aligned by label
# (the original 'Min' entry was a bare array matched only by position).
stats_df = pd.DataFrame({
    'Mean': selected.mean(),
    'Median': selected.median(),
    'Mode': selected.mode().iloc[0],  # first mode when several values tie
    'Std.Dev.': selected.std(),
    'Variance': selected.var(),
    'Count': selected.count(),
    '0.25': selected.quantile(0.25),
    '0.50': selected.quantile(0.50),
    '0.75': selected.quantile(0.75),
    'Max': selected.max(),
    'Min': selected.min(),
})

In [507]:
# Pairwise correlation matrix of price, levy, cylinders and airbags.
correlation_columns = ['Price', 'Levy', 'Cylinders', 'Airbags']
correlation = df.loc[:, correlation_columns].corr()

In [508]:
# Missing-value analysis: for every column of `df`, the total row count, the
# number of nulls and the percentage of nulls.
row_total = len(df)
null_counts = df.isnull().sum()

missing_value_df = pd.DataFrame({
    'Count': row_total,
    'Missing_valus': null_counts,
    'Percent_missing_values': null_counts / row_total * 100,
})

In [509]:
# Data types and cardinality: for every column of `df`, its dtype, the total
# row count and the number of distinct values.
unique_counts_df = df.dtypes.to_frame('Data_Type').assign(
    Counts=len(df),
    Unique_Counts=df.nunique(),
)

In [None]:
# Group aggregations: summary statistics of Price for each grouping column.

# Statistics computed for every grouping; kept in one place so all summary
# tables share the same columns.
PRICE_STATS = ['mean', 'median', 'std', 'max', 'min', 'count']


def _price_stats_by(column):
    """Return the PRICE_STATS of `Price` for each distinct value of `column`.

    Reads the notebook-level `df` at call time. The result is a DataFrame
    indexed by the values of `column`, with one column per statistic.
    """
    return df.groupby(column)['Price'].agg(PRICE_STATS)


# The model table previously lacked 'count'; it now carries the same columns
# as every other table (the existing columns are unchanged).
model_price_group = _price_stats_by('Model')
category_price_group = _price_stats_by('Category')
fuel_price_group = _price_stats_by('Fuel_type')
doors_price_group = _price_stats_by('Doors')
airbags_price_group = _price_stats_by('Airbags')
manufacturer_price_group = _price_stats_by('Manufacturer')
year_price_group = _price_stats_by('Prod._year')
leather_price_group = _price_stats_by('Leather_interior')
mileage_price_group = _price_stats_by('Mileage')
color_price_group = _price_stats_by('Color')
drivewheels_price_group = _price_stats_by('Drive_wheels')
gearbox_price_group = _price_stats_by('Gear_box_type')
cylinder_price_group = _price_stats_by('Cylinders')
wheel_price_group = _price_stats_by('Wheel')
enginevolume_price_group = _price_stats_by('Engine_volume')

# Show one of the tables as the cell output.
drivewheels_price_group

#### Outlier Detection and Solution
"The data set contains one price that is far higher than both the overall mean and the mean of its category, so we'll replace it with the median price."


In [511]:
# Replace extreme prices with the median price.

# Prices above this value are overwritten by the median.
PRICE_OUTLIER_THRESHOLD = 1000000

# The median is taken before the replacement, so it still includes the
# extreme value(s).
med = df['Price'].median()
df.loc[df['Price'] > PRICE_OUTLIER_THRESHOLD, 'Price'] = med

# Show the 20 highest prices as the cell output so the replacement can be
# checked. (As the first line of the cell this expression was discarded and
# never displayed.)
df.sort_values('Price', ascending=False).head(20)

In [512]:
#plt.boxplot(df['Price'])

In [513]:
# 95th percentile of Price; the histogram cell splits the prices at this value.
upperlimit = df['Price'].quantile(q=0.95)

“95% of cars are priced at or below 49,495 (the 95th percentile of Price). The histograms focus on this range to clearly visualize the distribution of typical cars. Cars priced above 49,495 are rare luxury vehicles and are treated as outliers for analysis and ML modeling purposes.”

In [None]:
# Two histograms of Price, split at `upperlimit` (the 95th percentile):
# left = prices at or below the limit, right = prices above it.
fig, ax = plt.subplots(1, 2, figsize=(14, 5))

typical_prices = df.loc[df['Price'] <= upperlimit, 'Price']
luxury_prices = df.loc[df['Price'] > upperlimit, 'Price']

# The titles are built from `upperlimit` so they always state the real cut-off
# (they previously hard-coded 5000, which is not the value used for the split).
ax[0].hist(typical_prices, bins=10, edgecolor='black', color='skyblue')
ax[0].set_title(f'Price distribution of typical cars (<= {upperlimit:,.0f})')
ax[0].set_xlabel('Price')
ax[0].set_ylabel('Count')

ax[1].hist(luxury_prices, bins=10, edgecolor='black', color='skyblue')
ax[1].set_title(f'Price distribution of luxury cars (> {upperlimit:,.0f})')
ax[1].set_xlabel('Price')
ax[1].set_ylabel('Count')

plt.tight_layout()

In [None]:
# Bar chart of the mean price for each car category.
fig, ax1 = plt.subplots(figsize=(15, 3))

category_means = category_price_group['mean']
ax1.bar(category_means.index, category_means, color='skyblue', width=0.5)

ax1.set_title('Distribution of Mean Prices about Category')
ax1.set_xlabel("Car's Category")
ax1.set_ylabel('Mean Price')

In [None]:
# 2x2 grid of bar charts: mean price by fuel type, number of airbags,
# gear box type and drive wheels.
fig, ax2 = plt.subplots(2, 2, figsize=(14, 10))
ax2 = ax2.flatten()

# One entry per panel: (grouped price table, label, bar colour).
# The label is used both in the title and as the x-axis label, so the
# fuel-type panel is no longer mislabelled "Car's Category".
mean_price_panels = [
    (fuel_price_group, 'Fuel Type', 'grey'),
    (airbags_price_group, 'No. of Airbags', 'skyblue'),
    (gearbox_price_group, 'Gear Boxes', 'lightgreen'),
    (drivewheels_price_group, 'Drive Wheels', 'orange'),
]

for panel_ax, (panel_group, panel_label, panel_color) in zip(ax2, mean_price_panels):
    panel_ax.bar(panel_group.index, panel_group['mean'], color=panel_color, width=0.3)
    panel_ax.set_title(f'Distribution of Mean Prices about {panel_label}')
    panel_ax.set_xlabel(panel_label)
    panel_ax.set_ylabel('Mean Price')

plt.tight_layout()

In [None]:
# Scatter plot of the mean price for each distinct mileage value.
# `ax3` stays a flat array of four axes; only the first one is drawn on.
fig, ax3 = plt.subplots(2, 2, figsize=(14, 10))
ax3 = ax3.flatten()

ax3[0].scatter(x=mileage_price_group.index, y=mileage_price_group['mean'], c='green', marker='o')
ax3[0].set_title('Distribution of Mean Prices about Mileage')
ax3[0].set_xlabel('Mileage')
ax3[0].set_ylabel('Mean Price')

# Hide the three panels that carry no data so the figure does not show empty
# axes; re-enable one with `ax3[i].set_visible(True)` when a plot is added.
for unused_ax in ax3[1:]:
    unused_ax.set_visible(False)

plt.tight_layout()