In [None]:
!pip install tensorflow-transform

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

import tensorflow as tf
import tensorflow_transform as tft

from tensorflow.keras import layers
from tensorflow import keras

import xgboost as xgb

# Remove the column cap so wide frames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Load the three CSV files of the CarDekho vehicle dataset.
car_data_one = pd.read_csv('../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv')
car_data_two = pd.read_csv('../input/vehicle-dataset-from-cardekho/Car details v3.csv')
car_data_three = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

# The column set of the first file is the common schema for all three frames.
columns = car_data_one.columns

# Ordinal codes for the textual ownership labels.
di = {"First Owner":0 , "Second Owner":1,"Third Owner": 2, "Fourth & Above Owner": 3}

# replace() returns a new frame, so nothing is mutated in place on a
# column-sliced view. Labels missing from `di` (e.g. "Test Drive Car") pass
# through unchanged and are filtered out below.
car_data_one = car_data_one.replace({"owner": di})
car_data_two = car_data_two[columns].replace({"owner": di})

# The third file uses different header names; lower-case them and rename to
# the common schema before selecting the shared columns.
# NOTE(review): `owner` in this file is not mapped through `di`, so it is
# presumably numeric already; also confirm that `selling_price` here uses the
# same unit as the other two files before mixing them.
car_data_three.columns = car_data_three.columns.str.lower()
car_data_three = car_data_three.rename(
    columns={"car_name": "name", "kms_driven": "km_driven", "fuel_type": "fuel"}
)[columns]

# ignore_index avoids duplicated index labels coming from the three sources.
ds = pd.concat([car_data_one, car_data_two, car_data_three], ignore_index=True)
ds = ds[ds['owner'] != "Test Drive Car"].reset_index(drop=True)

# After the filter only numeric codes remain, but the column is still
# object-typed because it used to hold strings; make it numeric so that
# correlation and scaling treat it as a regular feature.
ds['owner'] = pd.to_numeric(ds['owner'])

In [None]:
# Show the last rows of the concatenated frame.
ds.tail()

In [None]:
def _pie_of_counts(axis, frame, column, title, explode):
    """Draw a pie chart of the value counts of ``frame[column]`` on ``axis``.

    ``explode`` holds the preferred wedge offsets in descending-frequency
    order. It is padded with zeros or trimmed so that its length always
    matches the number of categories actually present; otherwise
    ``Axes.pie`` raises when a category appears or disappears.
    """
    counts = frame[column].value_counts()
    offsets = (list(explode) + [0] * len(counts))[:len(counts)]
    axis.pie(x=counts.tolist(), labels=counts.index.tolist(),
             autopct="%1.2f%%", shadow=True, explode=offsets)
    axis.set_title(title, fontdict={'fontsize': 14})


fig, ax = plt.subplots(2,2, figsize = (12,12))
((ax1, ax2), (ax3, ax4)) = ax

# One pie per categorical column of the merged data set.
_pie_of_counts(ax1, ds, 'fuel', "Fuel Type:", [0, 0.2, 0.2, 0.01, 0.01])
_pie_of_counts(ax2, ds, 'transmission', "Transmission:", [0, 0.2])
_pie_of_counts(ax3, ds, 'seller_type', "Seller Type:", [0, 0.2, 0.1])
_pie_of_counts(ax4, ds, 'owner', "Past Owner:", [0, 0.2, 0.2, 0.2])

plt.show()

In [None]:
# Count missing values per column.
ds.isna().sum()

In [None]:
# Build the modelling frame in one chain: one-hot encode the categorical
# columns, derive the car age from the model year (relative to 2021), then
# discard the raw year and the free-text name.
full_data = (
    pd.get_dummies(ds, columns=['fuel', 'seller_type', 'transmission'])
    .assign(age=lambda frame: 2021 - frame['year'])
    .drop(columns=['year', 'name'])
)

In [None]:
# Show the first rows of the encoded feature frame.
full_data.head()

In [None]:
# Diverging colour map shared by the correlation heatmaps.
cmap = sns.diverging_palette(h_neg=30, h_pos=230, s=90, l=20, as_cmap=True)

# Annotated heatmap of all pairwise correlations in the feature frame.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(full_data.corr(), annot=True, cmap=cmap, ax=ax)
sns.set(font_scale=1)

In [None]:
# Correlation of every column with the target, strongest positive first.
corr_matrix = full_data.corr()
correlations = corr_matrix['selling_price'].sort_values(ascending=False)

# Keep only columns whose absolute correlation with the price exceeds 0.2.
high_corr = correlations.abs() > 0.2
strong_columns = correlations.loc[high_corr].index

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(full_data[strong_columns].corr(), annot=True, cmap=cmap, ax=ax)
sns.set(font_scale=1)

In [None]:
# Pairwise scatter plots of mileage, price and age with KDE curves on the diagonal.
sns.pairplot(full_data[['km_driven', 'selling_price', 'age']], diag_kind='kde')

In [None]:
# Same three columns, this time with a regression fit drawn in each panel.
sns.pairplot(full_data[['selling_price', 'km_driven', 'age']], kind='reg')

In [None]:
# Bar chart of selling price per car age.
fig = plt.figure(figsize=(10, 5))
age_price_ax = sns.barplot(data=full_data, x='age', y='selling_price')
age_price_ax.set_title('Selling Price range by Car Age')

In [None]:
# Column names of the modelling frame, target included. Not referenced by the
# code below; kept for reference.
cols = ['selling_price', 'km_driven', 'owner', 'fuel_CNG', 'fuel_Diesel',
       'fuel_Electric', 'fuel_LPG', 'fuel_Petrol', 'seller_type_Dealer',
       'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Automatic', 'transmission_Manual', 'age']

# Fixed seed so every rerun evaluates the models on the same held-out rows.
SPLIT_SEED = 42

Y = full_data['selling_price']
X = full_data.drop(columns=['selling_price'])

# Split BEFORE scaling so that the scalers never see the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=SPLIT_SEED)

sc_x = StandardScaler()
sc_y = StandardScaler()

# Fit the feature scaler on the training rows only, then apply it everywhere.
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)
X = sc_x.transform(X)

# StandardScaler needs a 2-D input. A pandas Series cannot be indexed with
# `[:, np.newaxis]` on current pandas, so reshape the underlying numpy array.
y_train = sc_y.fit_transform(y_train.to_numpy().reshape(-1, 1)).ravel()
y_test = sc_y.transform(y_test.to_numpy().reshape(-1, 1)).ravel()
Y = sc_y.transform(Y.to_numpy().reshape(-1, 1)).ravel()

# Keras and xgboost are fed float32 arrays.
X = np.asarray(X).astype('float32')
Y = np.asarray(Y).astype('float32')
X_train = np.asarray(X_train).astype('float32')
X_test = np.asarray(X_test).astype('float32')
y_train = np.asarray(y_train).astype('float32')
y_test = np.asarray(y_test).astype('float32')

In [None]:
# Maps a model label to its test-set error; filled in by the cells below.
test_results = {}

In [None]:
def plot_loss(history):
  """Plot the training and validation loss curves of a Keras fit.

  `history` is the object returned by `model.fit`; its `history` dict must
  contain the 'loss' and 'val_loss' series. The y-axis is clipped to [0, 1].
  """
  for series_name in ('loss', 'val_loss'):
    plt.plot(history.history[series_name], label=series_name)
  plt.ylim([0, 1])
  plt.xlabel('Epoch')
  plt.ylabel('Error [Price]')
  plt.legend()
  plt.grid(True)

In [None]:
# Single dense unit without activation: a linear regression trained by
# gradient descent.
linear_model = tf.keras.Sequential()
linear_model.add(layers.Dense(units=1))

linear_optimizer = tf.optimizers.Adam(learning_rate=0.1)
linear_model.compile(loss='mean_squared_error', optimizer=linear_optimizer)

# 20% of the training rows are held back for validation during fitting.
history = linear_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=0,
)

In [None]:
# Loss curves of the most recent fit stored in `history`.
plot_loss(history)

In [None]:
# Store the value returned by evaluate() on the held-out split
# (the model was compiled with a mean-squared-error loss).
test_results['linear_model_tf'] = linear_model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Fully connected regressor: ReLU hidden layers of the widths listed below,
# followed by a single linear output unit.
hidden_widths = (256, 256, 256, 128)
dnn_layers = [layers.Dense(width, activation='relu') for width in hidden_widths]
dnn_layers.append(layers.Dense(1))

model = keras.Sequential(dnn_layers)

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='mean_squared_error',
)

history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_split=0.2,
    verbose=0,
)

model.summary()

In [None]:
# Loss curves of the most recent fit stored in `history`.
plot_loss(history)

In [None]:
# Store the value returned by evaluate() on the held-out split
# (the model was compiled with a mean-squared-error loss).
test_results['dnn_model'] = model.evaluate(X_test, y_test, verbose=0)

In [None]:
# Predicted vs. true target on the test split. Both are in the standardized
# price units produced by the target scaler, so values are centred near zero
# and can be negative.
test_predictions = model.predict(X_test).flatten()

a = plt.axes(aspect='equal')
plt.scatter(y_test, test_predictions)
plt.xlabel('True Values [standardized price]')
plt.ylabel('Predictions [standardized price]')

# Derive the axis limits from the data instead of a fixed [0, 12] window,
# which would hide every point with a negative coordinate.
lims = [
    float(min(np.min(y_test), np.min(test_predictions))),
    float(max(np.max(y_test), np.max(test_predictions))),
]
plt.xlim(lims)
plt.ylim(lims)

# Identity line: perfect predictions would fall on it.
_ = plt.plot(lims, lims)

In [None]:
# Ordinary least squares baseline from scikit-learn, scored with test MSE.
slr = LinearRegression().fit(X_train, y_train)
slr_predictions = slr.predict(X_test)
test_results['linear_model_sl'] = mean_squared_error(y_test, slr_predictions)

In [None]:
# RANSAC wrapper around a linear regression, scored with test MSE.
ransac_settings = dict(
    max_trials=100,
    min_samples=50,
    residual_threshold=5.0,
    random_state=0,
)
ransac = RANSACRegressor(LinearRegression(), **ransac_settings)
ransac.fit(X_train, y_train)

ransac_predictions = ransac.predict(X_test)
test_results['ransac'] = mean_squared_error(y_test, ransac_predictions)

In [None]:
# Linear regression on degree-3 polynomial feature expansions.
qubic = PolynomialFeatures(degree=3)
lr= LinearRegression()

# Fit the feature expansion on the training data only ...
X_train_q = qubic.fit_transform(X_train)
lr.fit(X_train_q, y_train)

# ... and reuse the fitted transformer for the test data (transform, not
# fit_transform: a preprocessing step must never be re-fitted on test rows).
X_test_q = qubic.transform(X_test)
test_results['linear_model_sl_qubic'] = mean_squared_error(y_test, lr.predict(X_test_q))

In [None]:
# Random forest regressor, scored with test MSE.
# The criterion argument is omitted on purpose: the default split criterion
# is the squared error, and the old spelling 'mse' is rejected by current
# scikit-learn releases. n_jobs=-1 builds the 1000 trees on all cores; with a
# fixed random_state the fitted forest does not depend on the worker count.
forest = RandomForestRegressor(n_estimators=1000,
                              random_state=1,
                              n_jobs=-1)

forest.fit(X_train, y_train)
test_results['forest'] = mean_squared_error(y_test, forest.predict(X_test))

In [None]:
#PCA
# Eigen-decomposition of the feature covariance matrix. The matrix is
# symmetric, so use eigh: it guarantees real eigenvalues and eigenvectors,
# whereas the general-purpose eig can return complex values for a
# rank-deficient matrix (the one-hot columns are linearly dependent).
covariance_matrix = np.cov(X_train.T)
eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix)

# Pair each eigenvalue magnitude with its eigenvector, largest first.
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
              for i in range(len(eigen_vals))]
eigen_pairs.sort(key=lambda k: k[0], reverse=True)

# Projection matrix whose two columns are the leading eigenvectors.
w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
              eigen_pairs[1][1][:, np.newaxis]))

In [None]:
# Explained-variance ratio per principal component and its running total.
# The covariance matrix is symmetric, so any imaginary part in the
# eigenvalues is numerical noise; drop it before plotting.
real_eigen_vals = np.real(np.asarray(eigen_vals))
tot = real_eigen_vals.sum()
var_exp = [(i / tot) for i in sorted(real_eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Component indices follow the actual number of eigenvalues instead of a
# hard-coded count, so the plot keeps working if the feature set changes.
component_index = range(1, len(var_exp) + 1)

plt.bar(component_index, var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(component_index, cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Project the training features onto the two leading principal components.
X_train_pca = X_train.dot(w)

print('Before PCA')
print(X_train.shape)

print('After PCA')
print(X_train_pca.shape)

# The target is continuous, so colour every projected point by its value.
# (Looping over np.unique(y_train) with three colours, as one would for class
# labels, only draws the rows that match the three smallest target values.)
points = plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1],
                     c=y_train, cmap='viridis', s=10, alpha=0.6)
plt.colorbar(points, label='Standardized selling price')

plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.tight_layout()
plt.show()

In [None]:
# Same architecture as the full-feature DNN, trained on the two PCA components.
pca_hidden_widths = (256, 256, 256, 128)
pca_layers = [layers.Dense(width, activation='relu') for width in pca_hidden_widths]
pca_layers.append(layers.Dense(1))

model_pca = keras.Sequential(pca_layers)

model_pca.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='mean_squared_error',
)

# NOTE(review): this runs only 4 epochs of 3 steps each, whereas the other
# Keras models train for 100 full epochs - looks like a leftover debug
# setting; confirm before comparing this model's score with the others.
history = model_pca.fit(
    X_train_pca,
    y_train,
    epochs=4,
    steps_per_epoch=3,
    validation_split=0.2,
    verbose=0,
)

model_pca.summary()

In [None]:
# Project the test features with the SAME matrix `w` that produced
# X_train_pca. Fitting a separate PCA on the test set would yield different
# axes (and possibly flipped signs), so the network would be scored on inputs
# from a different coordinate system than the one it was trained on.
X_test_pca = X_test.dot(w)
test_results['dnn_model_pca'] = model_pca.evaluate(X_test_pca, y_test, verbose=0)

In [None]:
# Wrap both splits in xgboost's DMatrix container.
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

# Linear booster with a squared-error objective.
param = dict(booster="gblinear", objective="reg:squarederror")

xgb_r = xgb.train(param, train_dmatrix, num_boost_round=10)

In [None]:
# Predict on the held-out DMatrix and record the mean squared error.
pred = xgb_r.predict(test_dmatrix)
test_results['xgboost_linear'] = mean_squared_error(y_test, pred)

In [None]:
# One row per model, a single column holding its recorded test error.
pd.Series(test_results, name='Mean squared error').to_frame()