In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file under the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Essential Libraries

In [None]:
# import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
from plotly import io

In [None]:
# Locations of the car-price table and its accompanying data dictionary.
car_prices_csv = '/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'
data_dictionary_xlsx = '/kaggle/input/car-price-prediction/Data Dictionary - carprices.xlsx'

# Load both files into DataFrames.
df = pd.read_csv(car_prices_csv)
data_dict = pd.read_excel(data_dictionary_xlsx)

In [None]:
# Preview the first rows of the car-price data loaded from CarPrice_Assignment.csv.
df.head()

In [None]:
# Preview the first rows of the data-dictionary spreadsheet.
data_dict.head()

In [None]:
# Inspect three of the auto-named ('Unnamed: N') columns of the data dictionary.
# NOTE(review): presumably these hold the variable names/descriptions — confirm against the spreadsheet.
data_dict[['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 11']]

In [None]:
# (rows, columns) of the raw car-price frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Summary statistics; with default arguments `describe()` reports on the numeric columns of a mixed-dtype frame.
df.describe()

In [None]:
# Dropping car_ID: it is only a row identifier and carries no information
# for predicting car prices (business knowledge).
df = df.drop(columns=['car_ID'])

In [None]:
# Checking outliers: box plot of price, with the car name shown on hover
# so that individual outlying points can be identified.
price_series = df['price']
box_labels = {'variable': 'Checking for Outliers in Price'}
px.box(
    data_frame=price_series,
    labels=box_labels,
    color_discrete_sequence=['seagreen'],
    hover_name=df['CarName'],
)

### The outliers consist only of cars from BMW, Porsche, Jaguar & Buick.
### Although we could remove them, we will instead scale all the values to have the same mean and standard deviation.

# Feature Extraction of CarName Column

In [None]:
# Keep only the manufacturer, i.e. the first space-separated token of each car name.
# `str.split(..., expand=True)` returns a multi-column DataFrame, which cannot be
# assigned to a single column (pandas raises "Columns must be same length as key"),
# so split into lists and take element 0 as a Series instead.
df['CarName'] = df['CarName'].str.split(' ').str[0]

In [None]:
# Preview the frame after rewriting the CarName column.
df.head()

In [None]:
# List the distinct CarName values to spot misspelled company names.
df['CarName'].unique()

In [None]:
# Map misspelled or inconsistently-cased company names to their canonical spelling.
brand_fixes = {
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'toyouta': 'toyota',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
}
df['CarName'] = df['CarName'].replace(brand_fixes)

In [None]:
# Re-check the distinct CarName values after the replacements.
df['CarName'].unique()

In [None]:
# The data dictionary describes symboling as a categorical variable,
# so store it as strings rather than integers.
df = df.astype({'symboling': str})

# Exploratory Data Analysis (EDA)

In [None]:
# Heatmap with Plotly
# Correlation is only defined for numeric columns. Select them explicitly so the
# string-valued columns (e.g. CarName, symboling) do not make `corr()` raise on
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
numeric_corr = df.select_dtypes(include='number').corr()
fig = px.imshow(numeric_corr)
fig.update_layout(autosize=False, width=700, height=700)

In [None]:
# Heatmap with Seaborn
# Restrict to numeric columns before computing correlations: on pandas >= 2.0
# `DataFrame.corr()` raises if the frame still contains string columns.
plt.figure(figsize = (15, 8))
sns.heatmap(df.select_dtypes(include='number').corr(), annot = True, cmap = 'coolwarm')

In [None]:
# Number of rows per car company.
car_count = df['CarName'].value_counts()

# Build an explicit long-form frame with the display names as column names.
# Relying on plotly's implicit 'index'/'value' labels breaks when the name of the
# value_counts() result changes between pandas versions.
car_count_df = (
    car_count
    .rename_axis('Car Companies')
    .reset_index(name='Total no of Cars Sold')
)
# Separate column for the colour scale so the colour bar is titled 'Popularity'.
car_count_df['Popularity'] = car_count_df['Total no of Cars Sold']

px.bar(
    data_frame=car_count_df,
    x='Car Companies',
    y='Total no of Cars Sold',
    color='Popularity',
)

In [None]:
# Distribution of price, with a box plot drawn in the margin above the histogram.
px.histogram(df, x='price', marginal='box')

In [None]:
plt.figure(figsize = (15, 8))
# `sns.distplot` is deprecated. `histplot` with a KDE overlay on a density scale
# draws the equivalent histogram plus density curve.
sns.histplot(df['price'], kde=True, stat='density')

In [None]:
# Pairwise scatter plots of every column that is not of object (string) dtype.
non_object_columns = df.select_dtypes(exclude=['object'])
sns.pairplot(non_object_columns)

# Observations:

1. carlength, carwidth, curbweight, enginesize & horsepower seem to have a positive correlation with price.
2. citympg & highwaympg both have a negative correlation with price.

In [None]:
# Keep the target (price) plus the categorical and numeric columns used for modelling.
selected_columns = [
    'price',
    # categorical columns
    'enginetype', 'fueltype', 'aspiration', 'carbody', 'cylindernumber', 'drivewheel',
    # numeric columns
    'wheelbase', 'curbweight', 'enginesize', 'boreratio', 'horsepower',
    'citympg', 'highwaympg', 'carlength', 'carwidth',
]
df = df[selected_columns]

In [None]:
# Preview the reduced frame (price plus the selected feature columns).
df.head()

# Data Preparation

## Encoding Categorical Data

In [None]:
# One-hot encode the categorical columns; drop_first drops the first level of each
# category so the remaining dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Preview the frame after dummy encoding.
df.head()

In [None]:
# Show the columns at positions 9 through 20 (the stop index 21 is exclusive).
# NOTE(review): presumably chosen to inspect some of the dummy columns; the positions
# depend on the column order produced by get_dummies — confirm.
df.iloc[:, 9:21]

In [None]:
# (rows, columns) after dummy encoding.
df.shape

# Splitting into Training & Testing Datasets

In [None]:
#X = df.drop('price', axis = 1)
#y = df['price']

In [None]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global generator; the split itself is made reproducible
# by the explicit random_state below.
np.random.seed(0)

# Hold out 30% of the rows for testing.
split_parts = train_test_split(df, random_state=42, test_size=0.3)
df_train, df_test = split_parts

In [None]:
# Shape of the training split. X_train is only created further down, so referencing
# it here fails with NameError on a fresh kernel; inspect df_train instead.
df_train.shape

In [None]:
# Shape of the test split. X_test is only created further down, so referencing
# it here fails with NameError on a fresh kernel; inspect df_test instead.
df_test.shape

In [None]:
# Shape of the training target. y_train is only created further down, so read the
# price column directly from df_train to avoid a NameError on a fresh kernel.
df_train['price'].shape

In [None]:
# Shape of the test target. y_test is only created further down, so read the
# price column directly from df_test to avoid a NameError on a fresh kernel.
df_test['price'].shape

# Standard-scaling all numeric columns except the dummy variables (scaling binary indicator columns is not meaningful)

In [None]:
# Continuous columns to standardize; the dummy (0/1) columns are left untouched.
# Note that the target column, price, is included.
scaled_features = [
    'price',
    'wheelbase',
    'curbweight',
    'enginesize',
    'boreratio',
    'horsepower',
    'citympg',
    'highwaympg',
    'carlength',
    'carwidth',
]

In [None]:
from sklearn.preprocessing import StandardScaler

# train_test_split returns frames derived from `df`; assigning columns into them can
# trigger pandas' SettingWithCopyWarning. Work on explicit copies so the assignments
# below unambiguously modify df_train / df_test only.
df_train = df_train.copy()
df_test = df_test.copy()

# Fit the scaler on the training split only, then apply the same transformation
# to the test split so no test-set statistics leak into the scaling.
sc = StandardScaler()
df_train[scaled_features] = sc.fit_transform(df_train[scaled_features])
df_test[scaled_features] = sc.transform(df_test[scaled_features])

In [None]:
# Preview the scaled training split; show only the first rows rather than the whole frame.
df_train.head()

In [None]:
# Separate the training target (price) from the training features.
y_train = df_train['price']
X_train = df_train.drop(columns=['price'])

In [None]:
# Separate the test target (price) from the test features.
y_test = df_test['price']
X_test = df_test.drop(columns=['price'])

In [None]:
# Preview the training target; price is one of the standardized columns, so these are scaled values.
y_train.head()

In [None]:
# Report the shape of each feature/target split.
shape_report = {
    "Shape of X_train: ": X_train,
    "Shape of y_train: ": y_train,
    "Shape of X_test: ": X_test,
    "Shape of y_test: ": y_test,
}
for caption, part in shape_report.items():
    print(caption, part.shape)

# Model Building

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor with default hyper-parameters,
# fitted on the training features and the (standardized) training price.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out test features; the model was trained on the
# standardized price, so predictions are on that same scaled scale.
y_pred = regressor.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Price was standardized together with the features, so the absolute and squared
# errors below are expressed in scaled units, not in the original price units.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

print("R2 Score: ", r2)
print("Mean Absolute Error: ", mae)
print("Mean Square Error: ", mse)
print("Root Mean Square Error: ", rmse)