In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Cleaning
Data cleansing or data cleaning is the process of detecting and correcting (or removing) corrupt or inaccurate records from a record set, table, or database and refers to identifying incomplete, incorrect, inaccurate or irrelevant parts of the data and then replacing, modifying, or deleting the dirty or coarse data.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

#importing libraries to use the library functions

In [None]:
df=pd.read_csv("../input/cardataset/data.csv")
#df contains the file information  which is in csv format

In [None]:
df

In [None]:
df.columns
#listing the column names of the dataset/dataframe

In [None]:
df.dtypes
#checking the datatypes of different columns of the dataframe

In [None]:
df.info()
#getting the information of dataframe such as no. of entries,data columns,non-null count,data types,etc

In [None]:
df.shape
#shape of the dataframe ie no. of rows and columns

In [None]:
df.describe()
#checking for statistical summary such as count,mean,etc. of numeric columns

In [None]:
# MSRP is the dependent/target variable, so rows whose MSRP is 0 are removed.
zero_msrp_index = df.loc[df['MSRP'] == 0].index
df.drop(index=zero_msrp_index, inplace=True)

In [None]:
df.shape

In [None]:
# Remove the 'Market Category' column; it is not used anywhere later in this notebook.
# NOTE(review): the original rationale was that MSRP does not depend on it — not verified here.
df.drop(columns=['Market Category'], inplace=True)

In [None]:
df.shape

In [None]:
# Shorter column names, as required by the steps of the problem statement.
column_renames = {
    'Engine HP': 'HP',
    'Engine Cylinders': 'Cylinders',
    'Transmission Type': 'Transmission',
    'Driven_Wheels': 'Drive Mode',
    'highway MPG': 'MPG-H',
    'city mpg': 'MPG-C',
    'MSRP': 'Price',
}
df = df.rename(columns=column_renames)

In [None]:
df

In [None]:
df.duplicated().sum()
#checking for any duplicates in the data

In [None]:
# Remove duplicated rows but keep the first occurrence of each.
# keep=False would delete *every* copy of a duplicated row (losing the record entirely) and
# would remove more rows than the df.duplicated().sum() count shown in the previous cell.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
df

In [None]:
df.isnull().sum()
#checking for any null values in the data

In [None]:
# Drop every row that has a null value in at least one column.
df.dropna(axis='index', how='any', inplace=True)

In [None]:
df.isnull().sum()
#verifying that no null values remain

## EDA with Data Visualization
Exploratory Data Analysis refers to the critical process of performing initial investigations on data so as to discover patterns, spot anomalies, test hypotheses, and check assumptions with the help of summary statistics and graphical representations.

Data visualization is the graphical representation of data in order to interactively and efficiently convey insights to clients, customers, and stakeholders in general.

#### Box plot for outliers
In descriptive statistics, a box plot is a method for graphically depicting groups of numerical data through their quartiles.
Box plots may also have lines extending vertically from the boxes (whiskers) indicating variability outside the upper and lower quartiles, hence the terms box-and-whisker plot and box-and-whisker diagram. Outliers may be plotted as individual points.



In [None]:
sns.boxplot(data=df,orient='h',palette='Set2')
#checking for any outliers in the data

In [None]:
# Discard the extreme data points: every row priced at 500000 or more.
extreme_price_index = df.loc[df['Price'] >= 500000].index
df.drop(index=extreme_price_index, inplace=True)

In [None]:
df

In [None]:
sns.boxplot(x=df['Price'])

In [None]:
sns.boxplot(x=df['Cylinders'])

In [None]:
sns.boxplot(x=df['HP'])

In [None]:
# First/third quartile and inter-quartile range (IQR) of every numeric column.
# The frame still holds non-numeric columns (e.g. 'Make'); selecting numeric dtypes first
# keeps this working on pandas >= 2.0, where DataFrame.quantile no longer skips them silently.
numeric_df = df.select_dtypes(include='number')
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Drop every row that has at least one numeric value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Only the columns the quartiles were computed for are compared: comparing the whole frame
# (text columns included) against the bounds is misaligned and raises on pandas >= 2.0.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
numeric_values = df[IQR.index]
is_outlier = ((numeric_values < lower_bound) | (numeric_values > upper_bound)).any(axis=1)
df = df[~is_outlier]
df.shape

In [None]:
# Percentage of rows belonging to each brand, most frequent brand first.
make_counts = df['Make'].value_counts()
counts = make_counts * 100 / make_counts.sum()

In [None]:
# Horizontal bar chart of the 10 most represented brands (percentage of rows per brand).
popular_labels = counts.index[:10]

# Every bar is grey except the first (most frequent) brand, which is highlighted.
colors = ['lightslategray' for _ in popular_labels]
colors[0] = 'crimson'

top_brands_bar = go.Bar(
    x=counts[:10],
    y=popular_labels,
    marker_color=colors,
    orientation='h',
)
fig = go.Figure(data=[top_brands_bar])
fig.update_layout(
    title_text='Most represented Car Brands in the Dataset',
    xaxis_title="Percentage",
    yaxis_title="Car Brand",
)

In [None]:
# Mean price per brand, restricted to the 10 most represented brands.
in_top_brands = df['Make'].isin(popular_labels)
prices = df.loc[in_top_brands, ['Make', 'Price']].groupby('Make').mean()
print(prices)

#### Boxplot
A boxplot summarizes how the values in a data set are distributed. It splits the data into four quarters using three quartiles, and shows the minimum, first quartile, median, third quartile and maximum of the data set.

In [None]:
# Box plot of the individual car prices for each of the 10 most represented brands.
display_p=df[['Make','Year','Price']].loc[(df['Make'].isin(popular_labels))]

fig=px.box(display_p,x="Make",y="Price")
# The boxes show the full price distribution per brand (quartiles and outliers), not an
# average, so the title and y-axis are labelled accordingly.
fig.update_layout(title_text='Price distribution of the 10 most represented Car Brands',xaxis_title="Make",yaxis_title="Price")

#### Correlation matrix
Correlation coefficients quantify the association between variables or features of a dataset. These statistics are of high importance for science and technology, and Python has great tools that you can use to calculate them. SciPy, NumPy, and Pandas correlation methods are fast, comprehensive, and well-documented.

The correlation matrix can be used to estimate the linear historical relationship between the returns of multiple assets. You can use the built-in `.corr()` method on a pandas DataFrame to easily calculate the correlation matrix. Correlation ranges from -1 to 1.

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains non-numeric
# columns at this point, and DataFrame.corr() raises on those with pandas >= 2.0.
df.select_dtypes(include='number').corr()

#### Heatmap
A heatmap is a graphical representation of data in which data values are represented as colors. That is, it uses color in order to communicate a value to the reader. This is a great tool to assist the audience towards the areas that matter the most when you have a large volume of data.

In [None]:
# Heatmap of the correlation between the numeric features.
# Non-numeric columns are excluded first: DataFrame.corr() raises on them with pandas >= 2.0.
df_corr=df.select_dtypes(include='number').corr()
f,ax=plt.subplots(figsize=(12,7))
# Draw on the axes created above rather than relying on the implicit "current axes".
sns.heatmap(df_corr,cmap='viridis',annot=True,ax=ax)
ax.set_title("Correlation between features",weight='bold',fontsize=18)
plt.show()

### From the above heatmap ,we can conclude that :
#### >> Price greatly depends upon features Horse Power(HP) and Year
#### >> Also, the features HP and Cylinders are positively dependent on each other.
i.e. if the number of cylinders increases, HP also increases.

#### >> And features MPG-H,MPG-C are negatively dependent on Cylinders
i.e. if the number of cylinders increases, MPG-H and MPG-C decrease.

#### Scatterplot
Scatter plots are used to plot data points on a horizontal and a vertical axis in an attempt to show how much one variable is affected by another. Each row in the data table is represented by a marker whose position depends on its values in the columns set on the X and Y axes.

In [None]:
# Scatter plot of price against horsepower.
fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(x=df['HP'], y=df['Price'])
ax.set_xlabel(xlabel='HP')
ax.set_ylabel(ylabel='Price')
plt.show()

In [None]:
# Scatter plot of price against horsepower.
# NOTE(review): this cell draws exactly the same HP-vs-Price plot as the previous cell;
# a different pair of columns was possibly intended — confirm with the author.
hp_values, price_values = df['HP'], df['Price']
fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(hp_values, price_values)
ax.set_xlabel('HP')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Scatter plot of the number of cylinders against horsepower.
fig, ax = plt.subplots(figsize=(12, 7))
ax.scatter(x=df['HP'], y=df['Cylinders'])
ax.set_xlabel(xlabel='HP')
ax.set_ylabel(ylabel='Cylinders')
plt.show()

In [None]:
#creating new column 'Price Range' for easy visualization
def getrange(Price):
    """Return the label of the price band that contains ``Price``.

    Bands are 25000 wide and half-open, ``[low, high)``:
    ``'0 - 25000'``, ``'25000 - 50000'``, ``'50000 - 75000'`` and ``'75000 - 100000'``.
    Prices of 100000 or more get the open-ended label ``'100000+'``; previously they fell
    through every branch and silently became ``None``.

    Parameters
    ----------
    Price : int or float
        Price of a single car.

    Returns
    -------
    str or None
        The band label, or ``None`` when ``Price`` is negative or NaN.
    """
    if Price >= 100000:
        return '100000+'
    for low in (0, 25000, 50000, 75000):
        high = low + 25000
        if low <= Price < high:
            return f'{low} - {high}'
    # Negative or NaN prices belong to no band.
    return None
       
# Label every row with its price band, then count the rows per band.
df['Price Range'] = df['Price'].map(getrange)

df['Price Range'].value_counts()

In [None]:
#distribution of number of cars over the years
# 28 consecutive model years starting at 1990; a year with no rows gets a count of 0.
x_dic = list(range(1990, 1990 + 28))
dic = {year: sum(df['Year'] == year) for year in x_dic}
y_dic = [dic[year] for year in x_dic]

# Plot
year_bars = go.Bar(x=x_dic, y=y_dic)
fig = go.Figure([year_bars])

fig.update_layout(
    title="Car year distribution",
    xaxis_title="Year",
    yaxis_title="Count Cars sold",
)


fig.show()

#### Bar plot
A bar chart or bar graph is a chart or graph that presents categorical data with rectangular bars with heights or lengths proportional to the values that they represent. The bars can be plotted vertically or horizontally. A bar graph shows comparisons among discrete categories.

In [None]:
plt.rcParams['figure.figsize'] = (15, 9)

# Stacked bars: share of each engine fuel type within every price range (each bar sums to 1).
x = pd.crosstab(df['Price Range'], df['Engine Fuel Type'])
row_totals = x.sum(axis=1).astype(float)
fuel_type_share = x.div(row_totals, axis='index')
color = plt.cm.copper(np.linspace(0, 1, 9))  # 9 shades sampled from the copper colormap
fuel_type_share.plot(kind='bar', stacked=True, color=color)
plt.title("Price vs Engine Fuel Type", fontweight=30, fontsize=20)
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (15, 9)
# Grouped bars: share of each drive mode within every price range.
x = pd.crosstab(df['Price Range'], df['Drive Mode'])
row_totals = x.sum(axis=1).astype(float)
drive_mode_share = x.div(row_totals, axis='index')
drive_mode_share.plot(kind='bar', stacked=False)
plt.title('Price vs Drive Mode', fontweight=30, fontsize=20)
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (15, 9)
# Grouped bars: share of each vehicle size within every price range.
x = pd.crosstab(df['Price Range'], df['Vehicle Size'])
row_totals = x.sum(axis=1).astype(float)
vehicle_size_share = x.div(row_totals, axis='index')
vehicle_size_share.plot(kind='bar', stacked=False)
plt.title('Price vs Size', fontweight=30, fontsize=20)
plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (15, 9)

# Stacked bars: share of each vehicle style within every price range (each bar sums to 1).
x = pd.crosstab(df['Price Range'], df['Vehicle Style'])
row_totals = x.sum(axis=1).astype(float)
vehicle_style_share = x.div(row_totals, axis='index')
vehicle_style_share.plot(kind='bar', stacked=True)
plt.title("Price vs Vehicle Style", fontweight=30, fontsize=20)
plt.show()

#### Pie Chart
A pie chart is a type of data visualization that is used to illustrate numerical proportions in data. The python library ‘matplotlib’ provides many useful tools for creating beautiful visualizations, including pie charts. The chart below is drawn with plotly instead, which makes it interactive.

In [None]:
# Donut chart of how many cars use each transmission type.
data_pie = df['Transmission'].value_counts()

transmission_pie = go.Pie(
    labels=data_pie.index,
    values=data_pie.tolist(),
    textinfo='label+percent',
    insidetextorientation='radial',
)
fig = go.Figure(data=[transmission_pie])

# hole=.3 turns the pie into a donut.
fig.update_traces(hole=.3, hoverinfo="label+percent+name")

In [None]:
df.head()

In [None]:
df.shape

## Modelling and Prediction
#### Label Encoding
Label Encoding refers to converting the labels into numeric form so as to convert it into the machine-readable form. Machine learning algorithms can then decide in a better way on how those labels must be operated. It is an important pre-processing step for the structured dataset in supervised learning.

In [None]:
# Cast the text columns to pandas' 'category' dtype before they are label-encoded.
columns_to_convert = [
    'Make',
    'Model',
    'Engine Fuel Type',
    'Transmission',
    'Drive Mode',
    'Vehicle Size',
    'Vehicle Style',
    'Price Range',
]
as_category = df[columns_to_convert].astype('category')
df[columns_to_convert] = as_category

In [None]:
df.dtypes

In [None]:
from sklearn import preprocessing

# One LabelEncoder instance is reused: fit_transform re-fits it from scratch for every column.
label_encoder = preprocessing.LabelEncoder()

# Replace each categorical column with its integer codes.
categorical_columns = [
    'Make',
    'Model',
    'Engine Fuel Type',
    'Transmission',
    'Drive Mode',
    'Vehicle Size',
    'Vehicle Style',
    'Price Range',
]
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

In [None]:
df.head()

In [None]:
# Separate the independent variables (x) from the dependent variable (y = Price).
feature_columns = ['Popularity', 'Year', 'HP', 'Cylinders', 'MPG-H', 'MPG-C']

x = df[feature_columns].to_numpy()
y = df['Price'].to_numpy()

for array in (x, y):
    print(array.shape)

In [None]:
# Standardise the features and the target (zero mean, unit variance).
# NOTE(review): both scalers are fitted on the full data set before the train/test split
# in the next code cell, so test-set statistics leak into training — consider fitting on the
# training part only. Because y is scaled too, the error metrics reported later are in
# standardised units, not in price units.
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
sc_y = StandardScaler()

# StandardScaler needs 2-D input, so the target becomes a column vector of shape (n, 1).
price_column = y.reshape(-1, 1)
x = sc_x.fit_transform(x)
y = sc_y.fit_transform(price_column)

#### Splitting the dataset
Typically, we separate a data set into a training set and a testing set: most of the data is used for training, and a smaller portion of the data is used for testing. Here scikit-learn's `train_test_split` randomly samples the data to help ensure that the testing and training sets are similar.

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split reproducible.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

#### Predictive Modelling
Predictive modeling is a powerful way to add intelligence to your application. It enables applications to predict outcomes against new data. The act of incorporating predictive analytics into your applications involves two major phases: model training and model deployment.

#### Linear Regression
Linear regression is a basic and commonly used type of predictive analysis.The overall idea of regression is to examine two things:

(1) does a set of predictor variables do a good job in predicting an outcome (dependent) variable?

(2) Which variables in particular are significant predictors of the outcome variable, and in what way do they–indicated by the magnitude and sign of the beta estimates–impact the outcome variable?

These regression estimates are used to explain the relationship between one dependent variable and one or more independent variables.

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator itself, so creation and fitting can be chained.
lr_model = LinearRegression().fit(x_train, y_train)

# For regressors .score() is the R^2 coefficient; it is reported under the label "Accuracy".
train_score = lr_model.score(x_train, y_train)
test_score = lr_model.score(x_test, y_test)
print("Training Accuracy :", train_score)
print("Testing Accuracy :", test_score)

In [None]:
# Predict the (standardised) price of the test rows and preview the first five predictions.
y_pred = lr_model.predict(x_test)
y_pred[:5]

In [None]:
# Predicted against true (standardised) prices on the test set for linear regression.
plt.scatter(x=y_test, y=y_pred)
plt.xlabel(xlabel="True Values")
plt.ylabel(ylabel="Predicted Values")

In [None]:
# Distribution of the linear-regression residuals (true minus predicted) on the test set.
residuals = y_test - y_pred
sns.distplot(residuals, bins=50)

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import math

# Error metrics of the linear-regression predictions on the test set.
# The MSE is computed once and reused for its square root.
mse = mean_squared_error(y_test, y_pred)

print("R2_Score : ", r2_score(y_test, y_pred))
print("Mean Squared Error : ", mse)
print("MAE : ", mean_absolute_error(y_test, y_pred))
print("RSME : ", math.sqrt(mse))

In [None]:
# Start the model-comparison table with the linear-regression row.
result_columns = ['Model', 'Training Accuracy %', 'Testing Accuracy %','r2 score','MSE','MAE','RSME']
lr_row = [
    "Linear Regression",
    lr_model.score(x_train, y_train),
    lr_model.score(x_test, y_test),
    r2_score(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    mean_absolute_error(y_test, y_pred),
    math.sqrt(mean_squared_error(y_test, y_pred)),
]
results_df = pd.DataFrame(data=[lr_row], columns=result_columns)

results_df

#### Support Vector Machine
“Support Vector Machine” (SVM) is a supervised machine learning algorithm which can be used for both classification and regression challenges. However, it is mostly used in classification problems. In the SVM algorithm, we plot each data item as a point in n-dimensional space (where n is the number of features you have), with the value of each feature being the value of a particular coordinate.

In [None]:
from sklearn.svm import SVR
svr_model=SVR(kernel = 'rbf')
# y_train is a column vector of shape (n, 1) after scaling, while SVR expects a flat (n,)
# target; passing the column vector triggers a DataConversionWarning, so flatten it here.
svr_model.fit(x_train,y_train.ravel())

# calculating the accuracies
# (for regressors .score() is the R^2 coefficient, reported here under the label "Accuracy")
print("Training Accuracy :",svr_model.score(x_train,y_train))
print("Testing Accuracy :",svr_model.score(x_test,y_test))

In [None]:
# Predict the (standardised) price of the test rows with the SVR model; preview the first five.
y_pred = svr_model.predict(x_test)
y_pred[:5]

In [None]:
# Predicted against true (standardised) prices on the test set for the SVR model.
plt.scatter(x=y_test, y=y_pred)
plt.xlabel(xlabel="True Values")
plt.ylabel(ylabel="Predicted Values")

In [None]:
# Distribution of the SVR residuals (true minus predicted) on the test set.
# y_test is a column vector (n, 1) while SVR.predict returns a flat (n,) array; subtracting
# them directly broadcasts to an (n, n) matrix of all pairwise differences instead of the
# n residuals, so both are flattened first.
residuals = y_test.ravel() - y_pred.ravel()
sns.distplot(residuals,bins=50)

In [None]:
# Error metrics of the SVR predictions on the test set (MSE computed once and reused).
mse = mean_squared_error(y_test, y_pred)

print("R2_Score : ", r2_score(y_test, y_pred))
print("Mean Squared Error : ", mse)
print("MAE : ", mean_absolute_error(y_test, y_pred))
print("RSME : ", math.sqrt(mse))

In [None]:
# Add the Support Vector Machine row to the model-comparison table.
results_df_2 = pd.DataFrame(data=[["Support Vector Machine", svr_model.score(x_train,y_train),svr_model.score(x_test,y_test),r2_score(y_test,y_pred),mean_squared_error(y_test,y_pred),mean_absolute_error(y_test,y_pred),math.sqrt(mean_squared_error(y_test,y_pred))]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %','r2 score','MSE','MAE','RSME'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# equivalent that works on every pandas version.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

#### Random Forest
Random forest is like a bootstrapping algorithm with a Decision tree (CART) model. Say we have 1000 observations in the complete population with 10 variables. Random forest tries to build multiple CART models with different samples and different initial variables. For instance, it will take a random sample of 100 observations and 5 randomly chosen initial variables to build a CART model. It will repeat the process (say) 10 times and then make a final prediction on each observation. The final prediction is a function of each prediction; it can simply be the mean of the individual predictions.

In [None]:
from sklearn.ensemble import RandomForestRegressor

rfc_model=RandomForestRegressor(n_estimators=300,random_state=0)
# y_train is a column vector of shape (n, 1) after scaling; the forest expects a flat (n,)
# target and emits a DataConversionWarning otherwise, so flatten it for fitting.
rfc_model.fit(x_train,y_train.ravel())

# calculating the accuracies
# (for regressors .score() is the R^2 coefficient, reported here under the label "Accuracy")
print("Training Accuracy :",rfc_model.score(x_train,y_train))
print("Testing Accuracy :",rfc_model.score(x_test,y_test))

In [None]:
# Predict the (standardised) price of the test rows with the random forest; preview the first five.
y_pred = rfc_model.predict(x_test)
y_pred[:5]

In [None]:
# Predicted against true (standardised) prices on the test set for the random forest.
plt.scatter(x=y_test, y=y_pred)
plt.xlabel(xlabel="True Values")
plt.ylabel(ylabel="Predicted Values")

In [None]:
# Distribution of the random-forest residuals (true minus predicted) on the test set.
# y_test is a column vector (n, 1) while the forest's predict returns a flat (n,) array;
# subtracting them directly broadcasts to an (n, n) matrix of all pairwise differences
# instead of the n residuals, so both are flattened first.
residuals = y_test.ravel() - y_pred.ravel()
sns.distplot(residuals,bins=50)

In [None]:
# Error metrics of the random-forest predictions on the test set (MSE computed once and reused).
mse = mean_squared_error(y_test, y_pred)

print("R2_Score : ", r2_score(y_test, y_pred))
print("Mean Squared Error : ", mse)
print("MAE : ", mean_absolute_error(y_test, y_pred))
print("RSME : ", math.sqrt(mse))

In [None]:
# Add the Random Forest row to the model-comparison table.
results_df_2 = pd.DataFrame(data=[["Random Forest", rfc_model.score(x_train,y_train),rfc_model.score(x_test,y_test),r2_score(y_test,y_pred),mean_squared_error(y_test,y_pred),mean_absolute_error(y_test,y_pred),math.sqrt(mean_squared_error(y_test,y_pred))]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %','r2 score','MSE','MAE','RSME'])
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# equivalent that works on every pandas version.
results_df = pd.concat([results_df, results_df_2], ignore_index=True)
results_df

### So, from the above table, the most suitable algorithm for the given dataset is "Random Forest" with an accuracy (test R² score) of "93%".