In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Mercedes CSV from the Kaggle used-car dataset into a DataFrame.
mercedesData = pd.read_csv("/kaggle/input/used-car-dataset-ford-and-mercedes/merc.csv")

# Exploratory Data Analysis (EDA)

## Overview of The Dataset

In [None]:
# Preview the first rows of the raw data.
mercedesData.head()

The **describe** method gives detailed summary statistics for the numerical features, and it is also a quick way to spot outliers. For instance, the minimum year of the cars is 1970 and the maximum is 2020. The price difference between such old and new cars is very large and may lead to wrong predictions, so we should remove the old cars.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
mercedesData.describe()

In [None]:
# Column names, non-null counts and dtypes of the frame.
mercedesData.info()

In [None]:
# dtype of each column.
mercedesData.dtypes

This dataset has no missing values, so after the data analysis we can move straight on to developing the model without handling missing data.

In [None]:
# Number of missing values in each column.
mercedesData.isna().sum()

The dataframe below contains the categorical features. To obtain meaningful predictions, we should also use the categorical data to train our model.

To do this, I will convert the categorical data to numerical data with dummy variables, and then combine the dummy variables with the main dataframe.

In [None]:
# Keep only the object-dtype columns.
categorical_datas = mercedesData.select_dtypes(include="object")

In [None]:
# Preview the object-dtype columns.
categorical_datas.head()

In [None]:
# Number of rows for each distinct value of the "transmission" column.
categorical_datas["transmission"].value_counts()

In [None]:
# Number of rows for each distinct value of the "fuelType" column.
categorical_datas["fuelType"].value_counts()

In [None]:
# Number of rows for each distinct value of the "model" column.
categorical_datas["model"].value_counts()

In [None]:
# Keep only the rows whose year is 1990 or later.
mercedesData = mercedesData.loc[mercedesData["year"] >= 1990]

**I have removed the old cars (year before 1990) from the dataset to prevent outliers.**

## Visualize The Dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Every column that is not object-dtype.
numericalDatas = mercedesData.select_dtypes(exclude="object")

In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
# pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure() call would only leave a stray empty figure behind.
sns.pairplot(numericalDatas)
plt.show()

A heatmap is one of the most important tools for seeing the relationships between the features of the data.

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(15,10))
plt.title("Relationship Between The Features of The Mercedes Data")
# Correlation is only defined for numeric data. Recent pandas versions raise
# when corr() meets object columns, so drop them explicitly first.
numeric_corr = mercedesData.select_dtypes(exclude=["object"]).corr()
sns.heatmap(numeric_corr,annot=True,fmt=".5f",linewidths=1,linecolor="gray")
plt.show()

### Price vs Year

In [None]:
# Two-column view used for the price-by-year aggregation.
pricevsYear = mercedesData[["price", "year"]]

In [None]:
# Mean price per year, as a Series indexed by year.
# Computed from mercedesData rather than from pricevsYear itself, so the cell
# gives the same result when re-run (the previous form replaced the DataFrame
# it read from with a Series and then failed on a second run).
pricevsYear = mercedesData.groupby("year")["price"].mean()

In [None]:
# Bar chart of the mean price for each year.
plt.figure(figsize=(15,10))
sns.barplot(
    x=pricevsYear.index,
    y=pricevsYear.values,
    palette=sns.cubehelix_palette(),
)

# Axis labels and title.
plt.xlabel("Year",fontsize=20)
plt.ylabel("Price",fontsize=20)
plt.title("Price vs Year",fontsize=30)

# Tick label styling: rotate the year labels so they do not overlap.
plt.xticks(rotation=45,fontsize=20)
plt.yticks(fontsize=20)
plt.show()

### Engine Size vs Price

In [None]:
# Mean of the engineSize column.
mercedesData.engineSize.mean()

In [None]:
# Compare the mean price of cars with engineSize below 2 against the rest.
# The second group uses >= so that rows with engineSize exactly 2 are included;
# with strict < and > on both sides they fell into neither group.
under2 = mercedesData[mercedesData.engineSize < 2].price.mean()
over2 = mercedesData[mercedesData.engineSize >= 2].price.mean()

d = {
    "under2" : under2,
    "over2" : over2
}

# Series indexed by group label, so the labels become the bar categories.
data = pd.Series(d)


sns.barplot(x = data.index,y=data.values)
plt.title("Car's Prices According to Kinds of Engine Sizes")
plt.show()

## Mileage vs Engine-Sizes

In [None]:
# Mean mileage per engine-size bin: [0, 2), [2, 4) and [4, 6].
# This section and its chart are about mileage, so the mileage column is
# averaged here rather than price. The bin edges are inclusive on the lower
# side so rows with engineSize exactly 2 or 4 are not dropped.
engine_size = mercedesData.engineSize
zeroTo2 = mercedesData[engine_size < 2].mileage.mean()
toTwo4 = mercedesData[(engine_size >= 2) & (engine_size < 4)].mileage.mean()
fourTo6 = mercedesData[(engine_size >= 4) & (engine_size <= 6)].mileage.mean()

In [None]:
# Map each engine-size bin label to its aggregated value.
d1 = dict(zip(["0-2", "2-4", "4-6"], [zeroTo2, toTwo4, fourTo6]))

In [None]:
# Turn the per-bin values into a Series so the labels become the bar categories.
data1 = pd.Series(d1)

# One bar per engine-size bin.
sns.barplot(
    x=data1.index,
    y=data1.values,
)
plt.title("Car's Milages According to Kinds of Engine Sizes")
plt.show()

In [None]:
# Summary statistics of the mileage column.
mercedesData.mileage.describe()

### Year vs Price

In [None]:
# Mean price per year, computed directly from the full frame.
yearvsPrice = mercedesData.groupby(["year"]).price.mean()

plt.figure(figsize=(15,10))
# NOTE(review): the tick styling is applied before the bars are drawn —
# confirm the rotation still shows on the final tick labels.
plt.xticks(rotation=45,fontsize=10)
plt.ylabel("Prices")
plt.xlabel("Years")
# One bar per year; bar height is the mean price of that year.
sns.barplot(x= yearvsPrice.index,y=yearvsPrice.values)
plt.show()

# Data Preparation

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Work on a copy so the encoding steps below do not touch mercedesData.
clone = mercedesData.copy()

In [None]:
# numFeatures = clone.select_dtypes(exclude=["object"])
# catFeatures = clone.select_dtypes(include=["object"])


# preprocessor = make_column_transformer(
#     (StandardScaler(), numFeatures),
#     (OneHotEncoder(), catFeatures),
# )


In [None]:
# One-hot encode the three categorical columns; all other columns pass through unchanged.
df = pd.get_dummies(
    clone,
    columns=["model", "transmission", "fuelType"],
)
df.head()

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# X = df.drop("price",axis=1)
# y = df["price"]

In [None]:
# Preview the one-hot encoded frame.
df.head()

In [None]:
# 70/30 split of the encoded frame, seeded so the split is reproducible.
df_train, df_test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=100)

# Developing Models

## Model-1 -> Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE 
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error

I will scale the training and test data to get better results.

### scaling training data

In [None]:
# rescale the features
# Rescale the numeric features of the training split to the [0, 1] range.
scaler = MinMaxScaler()

# Names of the int64/float64 columns of the encoded frame.
# NOTE(review): the list is built from `df`, which still contains "price", so
# the target is presumably scaled together with the features — confirm that
# this is intended, because it puts the linear model's error on a different
# scale from the other models.
numCols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

# df_train was produced by train_test_split as a selection of df; assigning
# into it directly can trigger SettingWithCopyWarning, so take an explicit copy.
df_train = df_train.copy()

# Fit the scaler on the training data only and transform it.
df_train[numCols] = scaler.fit_transform(df_train[numCols])

df_train.head()


### scale test data

In [None]:
# Scale the test split with the scaler that was fitted on the training split.
# Calling fit_transform here would re-fit the scaler on the test data, which
# leaks test statistics and puts train and test on different scales.
df_test = df_test.copy()  # explicit copy avoids SettingWithCopyWarning on assignment
df_test[numCols] = scaler.transform(df_test[numCols])
df_test.head()

### Divide the data into X_train, y_train, X_test, y_test

In [None]:
# Separate the target from the predictors without mutating df_train, so this
# cell can be re-run (pop() removed the column in place and raised KeyError
# on a second run).
y_train = df_train["price"]
X_train = df_train.drop(columns="price")

In [None]:
# Separate the target from the predictors without mutating df_test, so this
# cell can be re-run (pop() removed the column in place and raised KeyError
# on a second run).
y_test = df_test["price"]
X_test = df_test.drop(columns="price")

In [None]:
# Linear regression wrapped in recursive feature elimination (RFE), keeping
# the 10 highest-ranked features.
# RFE fits its own clone of the estimator, so fitting linReg separately
# beforehand has no effect on the result and is omitted.
linReg = LinearRegression()

rfe = RFE(linReg, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

# Predict with the reduced-feature model.
y_pred = rfe.predict(X_test)

# The value printed is an error (lower is better), so it is labelled as such.
# NOTE(review): y_test comes from the min-max scaled frame, so this error is
# presumably in scaled units rather than in price units — confirm.
print("Mean absolute error : {}".format(mean_absolute_error(y_test,y_pred)))


### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validated R^2 of a plain linear regression on the training data.
lm = LinearRegression()
scores = cross_val_score(
    lm,
    X_train,
    y_train,
    cv=5,
    scoring='r2',
)
scores

## Model-2 -> Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Separate target from predictors
# Target and predictors taken from the encoded frame.
X = df.drop(columns=['price'])
y = df['price']

# Seeded 80/20 train/validation split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [None]:
# Fit a seeded 100-tree random forest and report its MAE on the validation split.
ranFor = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
preds = ranFor.predict(X_valid)
score = mean_absolute_error(y_valid, preds)
print('mean_absolute_error:', score)

## Model-3 -> XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted trees with early stopping on the validation split.
# early_stopping_rounds is given to the constructor: passing it to fit() was
# deprecated and later removed from the xgboost scikit-learn API.
my_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    early_stopping_rounds=5,
)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)
# Score on the validation split. The same split also drove early stopping,
# so this figure is somewhat optimistic.
my_model.score(X_valid,y_valid)

In [None]:
# Validation predictions of the boosted model and their mean squared error.
predictions_1 = my_model.predict(X_valid)
print(f"MSE : {MSE(y_valid, predictions_1)}")

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the boosted model on the validation split
# (arguments in the conventional y_true, y_pred order; the value is the same either way).
mae_1 = mean_absolute_error(y_valid, predictions_1)
print("Mean Absolute Error:" , mae_1)


## Model - 4 -> Neural Network

- I will develop a neural network model with Keras.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from tensorflow import keras

In [None]:
# Fully connected regression network: three Dense(1024, relu) blocks, each
# followed by dropout and batch normalisation, and a single linear output unit.
input_shape = [X_train.shape[1]]

model = keras.Sequential([
    # Explicit input layer instead of passing input_shape to the first layer.
    keras.Input(shape=tuple(input_shape)),
    layers.BatchNormalization(),
    layers.Dense(1024,activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1024,activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1024,activation="relu"),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(1),
])

# MAE is both the training loss and the reported metric.
model.compile(
    optimizer='sgd',
    loss='mae',
    metrics=['mae'],
)

# Stop once the monitored loss has not improved by at least min_delta for
# `patience` epochs, then roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    patience=20,
    min_delta=0.001,
    restore_best_weights=True,
)

# Cast the feature frames to float32 before handing them to Keras. A frame
# that mixes bool dummy columns with numeric columns converts to an object
# array, which TensorFlow cannot turn into a tensor.
X_train_nn = X_train.astype("float32")
X_valid_nn = X_valid.astype("float32")

history = model.fit(
    X_train_nn,y_train,
    validation_data=(X_valid_nn,y_valid),
    batch_size=512,
    epochs = 600,
    callbacks=[early_stopping],
    verbose=0
)



In [None]:
# Per-epoch training history (one row per epoch, one column per logged quantity).
history_df = pd.DataFrame(history.history)
history_df.head()

In [None]:
# Training and validation loss per epoch.
history_df[["loss", "val_loss"]].plot()
plt.show()

In [None]:
# Best (lowest) validation MAE reached during training. The maximum of the
# training MAE is just the worst epoch, whereas this value is measured on the
# same validation split the other models are scored on.
history_df["val_mae"].min()