## Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the data

In [None]:
import os

# Print the full path of every file found below the Kaggle input directory.
input_root = '/kaggle/input'
file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(input_root)
    for name in names
)
for path in file_paths:
    print(path)

In [None]:
# Load the car price dataset from the Kaggle input directory and preview the first rows.
dataframe = pd.read_csv('/kaggle/input/car-price-prediction/CarPrice_Assignment.csv')
dataframe.head()

## Data Preparation

### Taking a look at the columns

In [None]:
# Column names, non-null counts and dtypes.
dataframe.info()

In [None]:
# Number of distinct values per column.
dataframe.nunique()

In [None]:
# Summary statistics of the numeric columns.
dataframe.describe()

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns.
dataframe.describe(include='object')

In [None]:
# Remove the car_ID column in place (presumably a plain row identifier - confirm against the dataset description).
dataframe.drop('car_ID', axis=1, inplace=True)

In [None]:
# Keep only the first whitespace-separated word of CarName (presumably the manufacturer/brand).
dataframe['CarName'] = dataframe['CarName'].apply(lambda x: x.split()[0])

In [None]:
# Preview the frame after shortening CarName.
dataframe.head()

In [None]:
# Frequency of each distinct CarName value; useful for spotting misspelled names.
dataframe['CarName'].value_counts()

In [None]:
# Canonical brand name -> alternative or misspelled variants to be normalised.
_brand_variants = {
    'toyota': ('toyouta',),
    'nissan': ('Nissan',),
    'mazda': ('maxda',),
    'porsche': ('porcshce',),
    'volkswagen': ('vokswagen', 'vw'),
}

# Lookup table from each variant spelling to its canonical brand name.
correct_mapping = {
    variant: brand
    for brand, variants in _brand_variants.items()
    for variant in variants
}

In [None]:
# Replace every known variant spelling by its canonical name; other values pass through unchanged.
dataframe['CarName'] = dataframe['CarName'].apply(
    lambda brand: correct_mapping.get(brand, brand)
)

In [None]:
# Re-check the CarName frequencies after applying the spelling corrections.
dataframe['CarName'].value_counts()

In [None]:
# Remove CarName in place; the cleaned values are therefore not available as a feature in later cells.
dataframe.drop('CarName', axis=1, inplace=True)

In [None]:
# Preview the prepared frame (car_ID and CarName removed).
dataframe.head()

## Data Analysis

### Finding Correlation for the Numeric Columns

In [None]:
# Absolute pairwise correlation between the numeric columns.
# numeric_only=True is needed because the frame still holds object-dtype
# (categorical) columns at this point; without it pandas >= 2.0 raises
# instead of silently skipping them.
corr = abs(dataframe.corr(numeric_only=True))

In [None]:
# Annotated heatmap of the full absolute-correlation matrix.
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True)
plt.show()

In [None]:
# Heatmap of the last column of corr (the 'price' column) with the final
# row - price against itself - left out.
price_correlations = pd.DataFrame(corr.iloc[:-1, -1])
plt.figure(figsize=(10, 10))
sns.heatmap(price_correlations, annot=True)
plt.show()

In [None]:
# Select the numeric features whose absolute correlation with price exceeds 0.5.
temp = pd.DataFrame(corr.iloc[:-1, -1])
strongly_correlated = temp[temp['price'] > 0.5]
print(strongly_correlated)
# Only the feature names are needed from here on.
numeric_cols_to_consider = strongly_correlated.index.tolist()

In [None]:
# numeric_cols_to_consider = numeric_cols_to_consider + ['carheight', 'peakrpm']

### Finding Important Columns among the Categorical (Non-numeric) Columns

In [None]:
# Keep only the object-dtype columns (the categorical features) and preview them.
df = dataframe.loc[:,dataframe.dtypes=='object']
df.head()

In [None]:
# Object-dtype column names followed by the target, so 'price' is the last column of temp.
cols = df.columns.tolist() + ['price']
temp = dataframe.loc[:, cols]
temp.head()

In [None]:
# One price box plot per categorical column, laid out row by row on a 3x3 grid.
fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(20, 20))
category_names = temp.columns.tolist()[:-1]  # every column except the trailing 'price'
for i, col in enumerate(category_names):
    grid_row, grid_col = divmod(i, 3)
    sns.boxplot(x=col, y='price', data=temp, ax=ax[grid_row][grid_col])
plt.show()

In [None]:
# One-hot encode every column of temp except the trailing 'price'.
columns_to_encode = temp.columns.tolist()[:-1]
temp = pd.get_dummies(temp, columns=columns_to_encode)

In [None]:
# Absolute correlation of each one-hot column with price; show those above 0.5.
price_column = abs(temp.corr())['price']
temp_corr = pd.DataFrame(price_column)
temp_corr[temp_corr['price'] > 0.5]

In [None]:
# Names of all object-dtype columns; every categorical column is kept here
# (no filtering by the correlations computed above is applied at this step).
category_cols_to_consider = dataframe.loc[:,dataframe.dtypes=='object'].columns.tolist()

## Implementing Machine Learning Model

### Trying Linear Regression

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# numeric_cols_to_consider = corr.iloc[:-1,-1].index.tolist()

In [None]:
# Columns used for modelling: selected numeric features, all categorical features, then the target.
cols = numeric_cols_to_consider + category_cols_to_consider + ['price']
cols

In [None]:
# Restrict the data to the modelling columns and one-hot encode the categorical
# ones as integer 0/1 indicator columns, then inspect the resulting schema.
df = dataframe.loc[:,cols]
df = pd.get_dummies(df, columns=category_cols_to_consider, dtype=int)
df.info()

In [None]:
# Absolute correlation of every column of the encoded frame with price.
heatmap_df = pd.DataFrame(abs(df.corr()['price']))
# Minimum absolute correlation with price a feature must exceed to be kept.
criteria = 0.5
# Preview the rows that pass the threshold. The comparison is strict (`>`) so
# the preview shows exactly the rows that the feature selection, which also
# uses `> criteria`, will pick up (plus the 'price' row itself).
heatmap_df[heatmap_df['price'] > criteria]

In [None]:
# Alternative kept for experimentation: use every feature instead of the filtered set.
# important_cols = [x for x in df.columns.tolist() if x!='price']
# Features whose absolute correlation with price exceeds the threshold, target excluded.
rows_above_threshold = heatmap_df[heatmap_df['price'] > criteria]
important_cols = [name for name in rows_above_threshold.index.tolist() if name != 'price']

In [None]:
# Feature (independent) column names and the target (dependent) column name.
train_independent_columns = important_cols
train_dependent_column = 'price'

In [None]:
# X: feature matrix (DataFrame); Y: target values (the 'price' Series).
X = df.loc[:,train_independent_columns]
Y = df.loc[:,train_dependent_column]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2, random_state=12)

In [None]:
# Ordinary least squares linear regression with scikit-learn's default settings.
regression_model = linear_model.LinearRegression()

#### Model Learning and Testing

In [None]:
# Fit the linear model on the training split.
regression_model.fit(x_train, y_train)

In [None]:
# Coefficient of determination (R^2) on the training split.
regression_model.score(x_train, y_train)

In [None]:
# Coefficient of determination (R^2) on the held-out test split.
regression_model.score(x_test, y_test)

In [None]:
# Price predictions for the test split, used for the MAPE computation below.
predicted_y = regression_model.predict(x_test)

In [None]:
# Mean absolute percentage error of the linear model on the test split,
# expressed in percent and rounded to two decimals.
absolute_pct_errors = abs(y_test - predicted_y) / y_test
MAPE = round((sum(absolute_pct_errors) / y_test.size) * 100, 2)
print(f"MAPE = {MAPE}%")

### XGBoost Machine Learning Model

In [None]:
from xgboost import XGBRegressor

In [None]:
# Seed NumPy's global random generator so the hyper-parameter candidates drawn
# below (and any later pandas sampling that relies on the global generator)
# are identical on every run, which makes the search results reproducible.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# Number of random hyper-parameter candidates to draw.
size = 200
# One value per candidate for each XGBRegressor argument explored by the search.
base_score = np.random.uniform(0.5, 1, size)
learning_rate = np.random.uniform(0.3, 1, size)
n_estimators = np.random.randint(100, 500, size)  # integers in [100, 500)

In [None]:
# One row per hyper-parameter candidate: base_score, learning_rate, n_estimators.
dataa = pd.DataFrame({"base_score":base_score, "learning_rate":learning_rate, "n_estimators":n_estimators})
dataa.head()

In [None]:
# Shuffle the candidate rows (frac=1 returns every row in random order; the original index labels are kept).
dataa = dataa.sample(frac=1)
dataa.head()

In [None]:
# Random search: train one XGBRegressor per candidate row of `dataa` and keep
# track of the candidate with the lowest test-set MAPE.
# NOTE(review): candidates are ranked on the test split, so the reported MAPE
# is optimistically biased; a separate validation split would be needed for an
# unbiased estimate.
lowest_MAPE = 20    # only candidates with a MAPE of at most 20% are reported
best_params = None  # parameters of the best candidate; stays None if none qualifies
for index in dataa.index:
    # Plain Python scalars, stored under their own name so the candidate arrays
    # `base_score`, `learning_rate` and `n_estimators` are not overwritten.
    params = {
        'base_score': float(dataa.loc[index, 'base_score']),
        'learning_rate': float(dataa.loc[index, 'learning_rate']),
        'n_estimators': int(dataa.loc[index, 'n_estimators']),
    }

    model = XGBRegressor(**params)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # Mean absolute percentage error on the test split, in percent.
    MAPE = round((sum(abs(y_test - y_pred) / y_test) / y_test.size) * 100, 2)
    if MAPE <= lowest_MAPE:
        lowest_MAPE = MAPE
        best_params = params
        print(params['base_score'], params['learning_rate'], params['n_estimators'])
        print("MAPE = {0}%\n".format(MAPE))


print("Minimum MAPE is {0}%".format(lowest_MAPE))

In [None]:
# Hyper-parameters hard-coded for the final model
# (presumably copied from an earlier random-search run - confirm).
tuned_params = {
    'base_score': 0.8714602023456752,
    'learning_rate': 0.8179946161849307,
    'n_estimators': 157,
}
model = XGBRegressor(**tuned_params)
model.fit(x_train, y_train)
# Score of the fitted model on the held-out test split.
model.score(x_test, y_test)

### Rough Work

In [None]:
# Scratch frame holding price next to a combined size feature (car length + car width).
temp = pd.DataFrame({
    'price': dataframe['price'],
    'sumofdimension': dataframe['carlength'] + dataframe['carwidth'],
})
temp

In [None]:
# Correlation between price and the combined length + width feature.
# Looked up by label: integer keys on a string-labelled Series (`series[0]`)
# are a deprecated positional fallback in recent pandas versions.
temp.corr().loc['price', 'sumofdimension']