# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt 

# Read the data

In [None]:
# Location of the car dataset inside the Kaggle input mount.
csv_path = '/kaggle/input/cardataset/data.csv'
df = pd.read_csv(csv_path)

 # To display the 1st 5 rows

In [None]:
# head() returns the first five rows by default.
df.head()

 # To display the bottom 5 rows

In [None]:
# tail() returns the last five rows by default.
df.tail()

# Total details of dataframe

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

# Statistical summary

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

# Dropping the columns and displaying 1st 5 rows

In [None]:
# Columns that are not used in the rest of the analysis.
unused_columns = ['Engine Fuel Type', 'Number of Doors', 'Market Category']
df = df.drop(columns=unused_columns)
df.head()

# Renaming the columns

In [None]:
# Original column name -> shorter name used from here on.
short_names = {
    "Engine HP": "HP",
    "Engine Cylinders": "Cylinders",
    "Transmission Type": "Transmission",
    "Driven_Wheels": "Drive Mode",
    "highway MPG": "MPG-H",
    "city mpg": "MPG-C",
    "MSRP": "Price",
}
df = df.rename(columns=short_names)
df.head()

# Getting the no of rows & columns from dataframe

In [None]:
# (number of rows, number of columns) of the current frame.
df.shape

# Checking for duplicates

In [None]:
# Boolean flag per row: True when the row repeats an earlier one.
is_repeat = df.duplicated()
duplicate_rows_df = df[is_repeat]
# The printed value is the (rows, columns) shape of the duplicated subset.
print("number of duplicate rows: ", duplicate_rows_df.shape)

# Deleting duplicates

In [None]:
# Keep the first occurrence of every repeated row and discard the rest.
df = df.drop_duplicates(keep='first')
df.head()

In [None]:
# (rows, columns) after the duplicate rows were dropped.
df.shape

# Checking for null values

In [None]:
# Count of missing values in each column (isna is the same check as isnull).
print(df.isna().sum())

# Drop null values and count the no of rows

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(how='any')
# Non-null count per column; all columns should now report the same number.
df.count()

In [None]:
# Re-check the per-column missing-value counts after dropna.
print(df.isna().sum())

# Detecting outliers

> Boxplot of price

In [None]:
# Distribution of Price; points beyond the whiskers are outlier candidates.
sns.boxplot(data=df, x='Price')

> Boxplot of HP

In [None]:
# Distribution of HP; points beyond the whiskers are outlier candidates.
sns.boxplot(data=df, x='HP')

> Boxplot of Cylinders

In [None]:
# Distribution of Cylinders; points beyond the whiskers are outlier candidates.
sns.boxplot(data=df, x='Cylinders')

# Removing outliers

In [None]:
# Per-column first and third quartiles.
# numeric_only=True limits the computation to numeric columns; the frame still
# holds text columns (e.g. Make, Model) and pandas >= 2.0 raises on them
# because numeric_only now defaults to False.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
# Interquartile range per numeric column, used to build the outlier fences.
IQR = Q3 - Q1
print(IQR)

In [None]:
# Fences at 1.5 * IQR below Q1 and above Q3, one pair per numeric column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Compare only the columns that have quartiles. Comparing the whole frame
# against a Series that lacks the text columns raises "Operands are not
# aligned" in pandas >= 2.0 (older versions silently treated them as False).
numeric_part = df[Q1.index]
is_outlier = ((numeric_part < lower_fence) | (numeric_part > upper_fence)).any(axis=1)

# Keep the rows where no numeric column falls outside its fences.
df = df[~is_outlier]
df.shape

# Checking if outliers are removed or not

> Boxplot of price

In [None]:
# Price distribution of the frame as it stands at this point in the notebook.
sns.boxplot(data=df, x='Price')

> Boxplot of HP

In [None]:
# HP distribution of the frame as it stands at this point in the notebook.
sns.boxplot(data=df, x='HP')

> Boxplot of Cylinders

In [None]:
# Cylinders distribution of the frame as it stands at this point in the notebook.
sns.boxplot(data=df, x='Cylinders')

# Top 10 most represented car brands

In [None]:
# Rows per brand, most frequent first (value_counts sorts descending).
make_counts = df['Make'].value_counts()

# Convert the counts into a percentage of all rows.
counts = make_counts * 100 / make_counts.sum()

# The ten most frequent brands and their shares.
popular_labels = counts.index[:10]
top_shares = counts.iloc[:10]

# Horizontal bar chart of the shares.
plt.figure(figsize=(15,9))
plt.barh(popular_labels, width=top_shares)
plt.title('Top 10 Car brands')
plt.show()

# Avg price of top car brands

In [None]:
# Brands whose mean price is reported.
selected_makes = [
    'Chevrolet', 'Ford', 'Volkswagen', 'Toyota', 'Dodge',
    'Nissan', 'GMC', 'Honda', 'Mazda',
]

# Keep only rows of the selected brands, then average Price per brand.
in_selection = df['Make'].isin(selected_makes)
prices = df.loc[in_selection, ['Make', 'Price']].groupby('Make').mean()
print(prices)

# Finding correlation

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# text columns, and pandas >= 2.0 no longer drops them silently in corr().
df.select_dtypes(include='number').corr()

# HeatMap

In [None]:
# Correlation matrix of the numeric columns only; text columns would make
# corr() raise on pandas >= 2.0.
car_corr = df.select_dtypes(include='number').corr()

# Draw the annotated heatmap on an explicitly created axes.
f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(car_corr, cmap='jet', annot=True, ax=ax)
ax.set_title("Correlation between features",
             weight='bold',
             fontsize=20)
plt.show()

There is a positive correlation between:

1. Engine HP and Engine Cylinders
2. Engine HP and MSRP
3. Engine Cylinders and MSRP
4. MPG-H and MPG-C

There is a strong negative correlation between:

1. MPG-H and Cylinders
2. MPG-C and Cylinders

# Plotting graphs

> Graph between Year and price

In [None]:
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
# Each bar shows the mean Price for one Year.
sns.barplot(x=df['Year'], y=df['Price'])

> Graph between Cylinders and price

In [None]:
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
# Each bar shows the mean Price for one cylinder count.
sns.barplot(x=df['Cylinders'], y=df['Price'])

> Graph between MPG-H and price

In [None]:
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
# Each bar shows the mean Price for one highway-MPG value.
sns.barplot(x=df['MPG-H'], y=df['Price'])

> Graph between MPG-C and price

In [None]:
plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 takes `data` as the only
# positional argument, so the positional form raises a TypeError there.
# Each bar shows the mean Price for one city-MPG value.
sns.barplot(x=df['MPG-C'], y=df['Price'])

# Findings from the graphs

1. Car prices increased after the year 2000.
2. The more cylinders a car has, the higher its price.
3. Price is highest at MPG-H 13 and lowest at MPG-H 15.
4. Price is highest at MPG-C 10 and lowest at MPG-C 31.

# Encoding data

In [None]:
# Encode on an independent copy. A plain `dcode = df` only creates a second
# name for the same object, so the encoding would overwrite the text columns
# of `df` (and write into a filtered slice, which triggers pandas'
# SettingWithCopyWarning).
dcode = df.copy()

# Text columns to replace with integer category codes.
categorical_columns = [
    "Make",
    "Model",
    "Transmission",
    "Drive Mode",
    "Vehicle Size",
    "Vehicle Style",
]

for column in categorical_columns:
    # astype("category") builds the category set; .cat.codes gives the integer
    # code of each row's category.
    dcode[column] = dcode[column].astype("category").cat.codes

# Pre-processing data

In [None]:
from sklearn import preprocessing

# Encoded columns used as model inputs; Price is the regression target.
feature_columns = ['Make', 'Model', 'Transmission', 'Drive Mode','Vehicle Size','Vehicle Style']
X = dcode[feature_columns].to_numpy()
y = dcode['Price'].to_numpy()

# Standardise each feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full matrix here, i.e. on rows that
# a later train/test split will hold out - consider fitting on the training
# part only.
X = preprocessing.StandardScaler().fit_transform(X)

# Splitting data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=44
)

# Report the (features, target) shapes of the training part, then the test part.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

# Linear Regression

In [None]:
from sklearn import linear_model

# Ordinary least squares fitted on the training split.
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is the same object as `lm`.
model = lm

# Predicted prices for the held-out rows.
predictions = lm.predict(X_test)

> Score Prediction

In [None]:
from sklearn.linear_model import LinearRegression

# Fit on the training split and predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# R^2 on the test split. The score must be computed against the true targets
# (y_test); scoring against y_pred compares the predictions with themselves
# and always returns 1.0.
model.score(X_test, y_test)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Left in place: once this cell runs these names are module-level, so other
# cells may rely on them.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Price is a continuous target, so a regression tree is used. A classifier
# would treat every distinct price as its own class, which makes accuracy and
# the confusion matrix meaningless.
d_m = DecisionTreeRegressor(random_state=0)
d_m.fit(X_train, y_train)

# Predicted prices for the held-out rows.
y_pred = d_m.predict(X_test)

# Coefficient of determination on the test split.
print("\nR squared : ", d_m.score(X_test, y_test))

# MSE, RMSE, MAE, R-sq

In [None]:
import math
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics of the most recent predictions (y_pred) against the true test targets.
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the MSE computed above.
rmse = math.sqrt(mse)

print('MSE: %.2f' % mse)
print('Coefficient of determination or R Squared : %.2f' % r_squared)
print('MAE :%.2f' % mae)
print('RMSE : %.2f' % rmse)