In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the raw Ford listings into a DataFrame.
# NOTE(review): the relative `../input/...` path looks like a Kaggle dataset mount —
# adjust it when running the notebook elsewhere.
Data=pd.read_csv('../input/used-car-dataset-ford-and-mercedes/ford.csv')

In [None]:
Data.head()

In [None]:
Data.info()

In [None]:
Data.isna().sum()

## Data Analysis

In [None]:
# Cardinality of the `model` column (NaN counted as a value), then the distinct values.
print('Model unique values: ',Data['model'].nunique(dropna=False))
print(Data['model'].unique())

In [None]:
# Number of listings per car model; the wide figure keeps the many labels legible.
plt.figure(figsize=(25,8))
# Select the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a positionally passed Series no longer becomes the x variable.
sns.countplot(x='model',data=Data)
plt.show()

In [None]:
# Number of listings per registration year.
plt.figure(figsize=(25,8))
# Keyword form is required because seaborn's first positional argument is `data`.
sns.countplot(x='year',data=Data)
plt.show()

In [None]:
# Cardinality of the `transmission` column (NaN counted as a value), then the distinct values.
print('Transmission unique: ',Data['transmission'].nunique(dropna=False))
print(Data['transmission'].unique())

In [None]:
# Number of listings per transmission type.
# Keyword form is required because seaborn's first positional argument is `data`.
sns.countplot(x='transmission',data=Data)
plt.show()

In [None]:
# Cardinality of the `fuelType` column (NaN counted as a value), then the distinct values.
print('Fuelt type unique values: ',Data['fuelType'].nunique(dropna=False))
print(Data['fuelType'].unique())

In [None]:
# Number of listings per fuel type.
# Keyword form is required because seaborn's first positional argument is `data`.
sns.countplot(x='fuelType',data=Data)
plt.show()

In [None]:
Data['engineSize'].value_counts()

In [None]:
# Number of listings per engine size.
plt.figure(figsize=(13,8))
# Keyword form is required because seaborn's first positional argument is `data`.
sns.countplot(x='engineSize',data=Data)
plt.show()

In [None]:
# Number of listings per tax value.
plt.figure(figsize=(15,8))
# Keyword form is required because seaborn's first positional argument is `data`.
sns.countplot(x='tax',data=Data)
plt.show()

In [None]:
Data['price'].describe()

## Find values

In [None]:
Data['fuelType'][Data['fuelType']=='Other']

In [None]:
Data['year'][Data['year']<2000]

## Data Preprocessing

In [None]:
# Remove the rows inspected in the previous section: fuel type 'Other' and years before 2000.
# A single boolean mask replaces the concatenated index lists; the original index labels are kept.
New_Data=Data.loc[~((Data['fuelType']=='Other')|(Data['year']<2000))].copy()

In [None]:
New_Data.info()

In [None]:
data_model_dummies=pd.get_dummies(New_Data['model'])

In [None]:
data_model_dummies

In [None]:
transmission_dummies=pd.get_dummies(New_Data['transmission'])

In [None]:
transmission_dummies

In [None]:
fueltype_dummies=pd.get_dummies(New_Data['fuelType'])

In [None]:
fueltype_dummies

In [None]:
New_Data=New_Data.drop(columns=(['fuelType','transmission','model']))  

In [None]:
New_Data.head()

In [None]:
# Rebuild the table column-wise, inserting each dummy group between positional slices
# of the remaining columns. The iloc slices depend on the column order left after the
# three categorical columns were dropped.
# NOTE(review): presumably iloc[:, :2] is year+price and iloc[:, 2:3] is mileage — confirm with New_Data.head().
# NOTE(review): the cell reassigns New_Data from itself, so running it twice would concatenate
# the dummy columns a second time; re-run from the drop step if it must be repeated.
New_Data=pd.concat(([data_model_dummies,New_Data.iloc[:,:2],transmission_dummies,New_Data.iloc[:,2:3],fueltype_dummies,New_Data.iloc[:,3:]]),axis=1)

In [None]:
New_Data

In [None]:
# Feature matrix: every column except the target. `drop` already returns a new frame.
x=New_Data.drop(columns='price')

In [None]:
x.head()

In [None]:
# Target vector: an independent copy of the price column (New_Data itself is left untouched).
y=New_Data['price'].copy()

In [None]:
y.head()

## Train - Validation - Test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_array=x.values
y_array=y.values

In [None]:
x_array

In [None]:
y_array

### Validation

In [None]:
# First split: hold out 10% of all rows as the validation set; the remaining 90%
# is stored in x_train/y_train and is split again in the next section.
x_train,x_val,y_train,y_val=train_test_split(x_array,y_array,test_size=0.10,random_state=42) 

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_val.shape

In [None]:
y_val.shape 

### Test

In [None]:
# Second split: carve 20% of the remaining 90% out as the test set.
# Overall proportions become 72% train / 18% test / 10% validation.
# x_train/y_train are overwritten here, so this cell must not be re-run on its own.
x_train,x_test,y_train,y_test=train_test_split(x_train,y_train,test_size=0.20,random_state=42)

In [None]:
x_train.shape

In [None]:
y_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

## Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise features using statistics learned from the training split only.
obje_ss=StandardScaler()

# Fit once on the training data ...
x_train_ss=obje_ss.fit_transform(x_train)
# ... and reuse the same mean/variance for validation and test. Re-fitting on each
# split would scale them differently from the data the models were trained on and
# leak hold-out statistics into the evaluation.
x_val_ss=obje_ss.transform(x_val)
x_test_ss=obje_ss.transform(x_test)

In [None]:
x_train

In [None]:
x_train_ss

## Models

### LinearRegression

In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model_le=LinearRegression().fit(x_train,y_train)
model_le

In [None]:
y_pred=model_le.predict(x_val)

In [None]:
# Both figures are R² scores expressed as percentages: `score` on the training split,
# `r2_score` on the validation predictions computed in the previous cell.
print('Train success rate : %',model_le.score(x_train,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

In [None]:
# Show validation predictions that came out negative: linear regression is not bounded
# below, so it can predict impossible (negative) prices for some rows.
y_pred[y_pred<0]

### SupportVectorRegression

In [None]:
from sklearn.svm import SVR

In [None]:
# Linear-kernel support vector regressor trained on the standardised features.
# `degree` only applies to the 'poly' kernel and is ignored for 'linear', so the
# misleading argument is omitted; the fitted model is unchanged.
model_svr=SVR(kernel='linear').fit(x_train_ss,y_train)
model_svr

In [None]:
y_pred=model_svr.predict(x_val_ss) 

In [None]:
print('Train success rate : %',model_svr.score(x_train_ss,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

### PolynomialRegression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand the raw features with all degree-2 polynomial and interaction terms.
obje_poly=PolynomialFeatures(degree=2)

# Fit the transformer once on the training split, then apply that same fitted
# transformer to validation and test so all three share one feature layout.
x_train_poly=obje_poly.fit_transform(x_train)
x_val_poly=obje_poly.transform(x_val)
x_test_poly=obje_poly.transform(x_test)

In [None]:
model_poly=LinearRegression(n_jobs=-1).fit(x_train_poly,y_train)
model_poly

In [None]:
y_pred=model_poly.predict(x_val_poly)

In [None]:
print('Train success rate : %',model_poly.score(x_train_poly,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
model_rfr=RandomForestRegressor(n_estimators=100,n_jobs=-1,random_state=42).fit(x_train_ss,y_train)
model_rfr

In [None]:
y_pred=model_rfr.predict(x_val_ss)

In [None]:
print('Train success rate : %',model_rfr.score(x_train_ss,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

### AdaBoostRegressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
model_ada=AdaBoostRegressor(n_estimators=10,random_state=42).fit(x_train_ss,y_train)
model_ada

In [None]:
y_pred=model_ada.predict(x_val_ss) 

In [None]:
print('Train success rate : %',model_ada.score(x_train_ss,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

### XGBoostRegressor

In [None]:
from xgboost import XGBRegressor

In [None]:
model_xgb=XGBRegressor(max_depth=3,learning_rate=0.1,n_estimators=100,verbosity=0,random_state=42).fit(x_train,y_train)
model_xgb

In [None]:
y_pred=model_xgb.predict(x_val) 

In [None]:
print('Train success rate : %',model_xgb.score(x_train,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

### LightGBM

In [None]:
from lightgbm import LGBMRegressor

In [None]:
model_lgb=LGBMRegressor(max_depth=2,learning_rate=0.5,n_estimators=100,random_state=42).fit(x_train,y_train)
model_lgb

In [None]:
y_pred=model_lgb.predict(x_val)

In [None]:
print('Train success rate : %',model_lgb.score(x_train,y_train)*100)
print('Validation success rate : %',r2_score(y_val,y_pred)*100)

## DNN

In [None]:
import tensorflow
# Input normalisation layer for the network.
# NOTE(review): newer TensorFlow releases expose this class as
# `tensorflow.keras.layers.Normalization`; switch if the experimental path is unavailable.
layer=tensorflow.keras.layers.experimental.preprocessing.Normalization()
# Learn mean/variance from the training split only. Adapting on the full `x_array`
# would fold validation and test rows into the normalisation statistics.
layer.adapt(x_train)

In [None]:
# Fully connected regressor: normalisation -> Dense(64, relu) -> Dense(64, relu) -> Dense(1).
model_dnn=tensorflow.keras.Sequential()
model_dnn.add(layer)
# Take the input width from the training matrix instead of hard-coding 34, so the model
# definition stays correct if the number of dummy columns changes.
model_dnn.add(tensorflow.keras.layers.Dense(64,activation='relu',input_dim=x_train.shape[1]))
model_dnn.add(tensorflow.keras.layers.Dense(64,activation='relu'))
# Single linear output unit for the predicted price.
model_dnn.add(tensorflow.keras.layers.Dense(1))
# Optimise mean absolute error with Adam.
model_dnn.compile(optimizer='Adam',loss='mean_absolute_error')

In [None]:
history=model_dnn.fit(x_train,y_train,epochs=400,verbose=1)  

In [None]:
print(history.history['loss'][-1])

In [None]:
# Training-loss curve over the epochs recorded in `history`.
plt.plot(history.history['loss'], label='loss')
# Fixed window chosen by the author; widen it if the curve falls outside 600-1000.
plt.ylim([600, 1000])
plt.xlim([0, 400])
plt.xlabel('Epoch')
# The loss is the mean absolute error of the price target, so label the axis accordingly
# (the previous unit label was unrelated to this dataset).
plt.ylabel('Error [price]')
plt.legend()
plt.grid(True)

In [None]:
print('%',r2_score(y_val,model_dnn.predict(x_val))*100)

## Test

In [None]:
def _print_test_r2(models,features):
  """Print the test-set R² (as a percentage) for each fitted model.

  models   -- iterable of fitted estimators exposing `predict`.
  features -- test feature matrix to predict on; scored against the global `y_test`.
  """
  for fitted in models:
    score=r2_score(y_test,fitted.predict(features))*100
    print(f'{fitted.__class__} | Test success rate : % {score}')

def test_models(model_name):
  """Report test R² for models that were trained on the unscaled features (`x_test`)."""
  _print_test_r2(model_name,x_test)

def test_models_ss(model_name):
  """Report test R² for models that were trained on the standardised features (`x_test_ss`)."""
  _print_test_r2(model_name,x_test_ss)

In [None]:
# Models fitted on the raw (unscaled) training features.
model_names=[model_xgb,model_lgb,model_dnn]
# Models fitted on the standardised training features.
model_names_ss=[model_rfr,model_svr,model_ada]

In [None]:
test_models(model_names)

In [None]:
test_models_ss(model_names_ss) 

## Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Estimators to cross-validate. This rebinds `model_names`, replacing the list used in
# the Test section; the same list is later evaluated on both raw and scaled features.
model_names=[model_rfr,model_ada,model_xgb,model_lgb]

In [None]:
def test_models_cross(model_name):
  """Print the mean 5-fold cross-validation score (as a percentage) on the raw training features."""
  for estimator in model_name:
    fold_scores=cross_val_score(estimator,X=x_train,y=y_train,cv=5,n_jobs=-1)
    print(f'{estimator.__class__} | Test(Cross validation) success rate : % {fold_scores.mean()*100}')

def test_models_ss_cross(model_name):
  """Print the mean 5-fold cross-validation score (as a percentage) on the standardised training features."""
  for estimator in model_name:
    fold_scores=cross_val_score(estimator,X=x_train_ss,y=y_train,cv=5,n_jobs=-1)
    print(f'{estimator.__class__} | Test(Cross validation-ss) success rate : % {fold_scores.mean()*100}')

In [None]:
test_models_cross(model_names)

In [None]:
test_models_ss_cross(model_names)

In [None]:
print('Support vector regression cross-val success rate:',cross_val_score(model_svr,X=x_train_ss,y=y_train,cv=3,n_jobs=-1).mean()*100)

In [None]:
# The network is not re-fitted across folds here; this is the R² on the hold-out
# validation split, so the label says "validation" rather than "cross-val".
print('DNN regression validation success rate:',r2_score(y_val,model_dnn.predict(x_val))*100)