
*   preprocess

*   train

*   eval

*   cross validation


# imports


In [1]:
import sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.decomposition import PCA
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler


import seaborn as sns
# Seaborn defaults for the plots in this notebook: "whitegrid" style, "Set2" palette.
sns.set_style("whitegrid")
# sns.color_palette("Set2")
sns.set_palette("Set2")

# sns.set_theme(style="ticks", color_codes=True)

# load dataset

In [1]:
# Read the cars CSV from the Kaggle input directory into `dataset`.

dataset = pd.read_csv("/kaggle/input/usedcarscatalog/cars.csv")
dataset  # bare last expression so the notebook renders the loaded frame

# preprocessing

- data cleaning
- categorical preprocessing
- numerical preprocessing
- bool processing

In [1]:
# dataset

## cleaning

In [1]:
# Work on a copy (the "encoded" dataset) so the raw `dataset` frame stays untouched,
# then discard every row whose engine_capacity is missing.

enc_dataset = dataset.copy()

missing_capacity = enc_dataset["engine_capacity"].isna()
to_rem = enc_dataset.loc[missing_capacity].index

enc_dataset = enc_dataset.drop(index=to_rem)

In [1]:
# Summary of enc_dataset (dtypes, non-null counts) to confirm the cleaning step.
enc_dataset.info()

## data distribution visualize


In [1]:
# Plot the distribution of every column of enc_dataset.
#  - object/bool columns: count plot restricted to the first ten distinct values
#    returned by unique()
#  - every other column: 20-bin histogram
for column in enc_dataset.columns:

  if enc_dataset[column].dtype=="object" or enc_dataset[column].dtype=="bool":

    # slicing is a no-op when there are ten or fewer distinct values
    unqs = enc_dataset[column].unique()[:10]

    # catplot is a figure-level function: it builds its own figure, so no
    # plt.figure() here (creating one beforehand only leaves an empty canvas
    # in the cell output).
    sns.catplot(x=column, kind="count", data=enc_dataset, order = unqs)
    plt.xticks(rotation=45)

  else:
    # the histogram draws on the current axes, so it needs its own figure
    plt.figure(figsize = (10,8))
    plt.title(column)
    enc_dataset[column].hist(bins = 20)

## Encoding

In [1]:
# Label-encode every object/bool column of enc_dataset in place.
# lb_encoders keeps one fitted encoder per column, in the same order as catg_cols.

catg_cols = [name for name, kind in enc_dataset.dtypes.items() if kind == "object" or kind == "bool"]
lb_encoders = []
for name in catg_cols:
  encoder = LabelEncoder()
  enc_dataset[name] = encoder.fit_transform(enc_dataset[name])
  lb_encoders.append(encoder)

enc_dataset

## dimensionality reduction

In [1]:
# Pairwise correlation of the non-object, non-bool columns, rearranged so that
# price_usd no longer appears as a row and sits in the last column.

cols = [name for name, kind in enc_dataset.dtypes.items() if kind != "object" and kind != "bool"]
corr = enc_dataset[cols].corr()
last = corr["price_usd"]
corr = corr.drop(index=["price_usd"], columns=["price_usd"])
corr["price_usd"] = last  # aligned on the index, so the dropped row stays out
corr

In [1]:
# Heatmap of the correlation table, colour scale centred on zero.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, center=0, ax=ax)

The heatmap shows that some features should be removed, since their correlation with the target (`price_usd`) is very low.

In [1]:
# Features whose correlation with price_usd lies strictly between -0.1 and 0.1.

price_corr = corr["price_usd"]
low_corr_features = price_corr[price_corr.abs() < 0.1].index.tolist()
low_corr_features

In [1]:
# Remove the weakly correlated features. errors="ignore" keeps the cell safe to
# re-run: once the columns are gone, a second drop would otherwise raise KeyError.
enc_dataset.drop(columns = low_corr_features, errors = "ignore", inplace = True)

In [1]:
# Columns currently present in enc_dataset.
enc_dataset.columns

In [1]:
# Recompute the correlation table for the non-object, non-bool columns of
# enc_dataset as it stands now.

cols = [name for name, kind in enc_dataset.dtypes.items() if not (kind == "object" or kind == "bool")]
corr = enc_dataset[cols].corr()
corr

## checking outliers

In [1]:
# One box plot per non-object, non-bool column to eyeball outliers.
for column in enc_dataset.columns:
  kind = enc_dataset[column].dtype
  if kind == "object" or kind == "bool":
    continue

  plt.figure()
  plt.title(column)
  sns.boxplot(data=enc_dataset[column])
''' outliers in 
 - odometer val
 - year produced
 - engine_capacity
 '''

In [1]:
# # odometer
# enc_dataset.odometer_value.value_counts()

# '''remove this feature bcz:
#    - many outliers
#    - v low corr'''
  
# enc_dataset.drop(columns=['odometer_value'], inplace = True)

In [1]:
# # year produced
# pd.set_option('display.max_columns', 100)
# enc_dataset.year_produced.value_counts().reset_index()

# # remove values with count < 2

In [1]:
# # dropping years with count < 5
# print("shape before {}".format(enc_dataset.shape))

# temp  = enc_dataset.year_produced.value_counts().reset_index()
# temp = temp[temp.year_produced<5]
# year_rem = temp.values[:,0]

# enc_dataset.drop(enc_dataset[enc_dataset.year_produced.isin(year_rem)].index, inplace = True)
# print("shape after {}".format(enc_dataset.shape))

In [1]:
# # dropping engine_capacity with count < 2
# print("shape before {}".format(enc_dataset.shape))

# temp  = enc_dataset.engine_capacity.value_counts().reset_index()
# temp = temp[temp.engine_capacity<2]
# value_rem = temp.values[:,0]

# enc_dataset.drop(enc_dataset[enc_dataset.engine_capacity.isin(value_rem)].index, inplace = True)
# print("shape after {}".format(enc_dataset.shape))

## normalization

In [1]:
# Min-max scaling of every column of enc_dataset; the scaler is fitted on the
# whole frame, and the result is wrapped back into a DataFrame with the same
# column names.

minmax_scalar = MinMaxScaler()
mm_scaled = pd.DataFrame(
    minmax_scalar.fit_transform(enc_dataset),
    columns=enc_dataset.columns,
)
mm_scaled

In [1]:
# standard scaling

st_scalar = StandardScaler()
st_scaled = st_scalar.fit_transform(enc_dataset)
st_scaled  = pd.DataFrame ( data = st_scaled, columns = enc_dataset.columns)
st_scaled

# evaluation metrics

- model score (R², the coefficient of determination)
- RMSE (root mean squared error)

In [1]:
def eval(model  ,testx, testy):
  """Score a fitted model on held-out data.

  NOTE: the name shadows the builtin ``eval``; it is kept because other cells
  call this function by that name.

  Parameters
  ----------
  model : fitted estimator exposing ``score`` and ``predict``.
  testx : features handed to ``model.score`` / ``model.predict``.
  testy : true target values matching ``testx``.

  Returns
  -------
  list
      ``[r2, rmse]``: the value of ``model.score(testx, testy)`` (the
      coefficient of determination for scikit-learn regressors) and the root
      mean squared error of the predictions.
  """
  r2 = model.score(testx, testy)
  y_pred = model.predict(testx)
  # Take the root explicitly: the `squared=False` keyword of mean_squared_error
  # was deprecated in scikit-learn 1.4 and removed in 1.6, while plain MSE plus
  # np.sqrt works on every version (identical result for a single target).
  rmse = np.sqrt(mean_squared_error(testy, y_pred))
  return [r2, rmse]


In [1]:
def eval_graphs(models):
  """Tabulate the evaluation results of several models and plot them.

  Parameters
  ----------
  models : dict
      Maps a model name to a three-item list ``[model, r2, rmse]``.

  Returns
  -------
  pandas.DataFrame
      One row per model name with the columns "model", "R^2" and "RMSE".
  """
  df = pd.DataFrame.from_dict(models, orient='index', columns = ["model", "R^2", "RMSE"])

  # One explicit figure per metric. Passing ax= makes DataFrame.plot draw on that
  # figure; without it pandas opens a figure of its own and a preceding
  # plt.figure() is left behind as a blank plot.
  for metric, color in (("R^2", "#34eb9b"), ("RMSE", "#eb3489")):
    fig, ax = plt.subplots()
    df.plot.bar(y = metric, color = color, ax = ax)
    ax.set_title(metric)
    ax.set_ylabel(metric)

  return df


In [1]:
# >>> cross_val_score(regressor, X, y, cv=10)
# # 

# train test 

- simple split
- cross validation


In [1]:
# # for st scaled
# predictors = st_scaled.drop("price_usd",axis=1)
# target = st_scaled["price_usd"]
# X_train,X_test,Y_train,Y_test =train_test_split(predictors,target,test_size=0.25,random_state=0)
# print(X_train.shape)
# print(X_test.shape)
# print(Y_train.shape)
# print(Y_test.shape)

In [1]:
## min-max scaled data: price_usd is the target, every other column a predictor;
## 75/25 train/test split with a fixed random_state

target = mm_scaled["price_usd"]
predictors = mm_scaled.drop(columns=["price_usd"])
X_train, X_test, Y_train, Y_test = train_test_split(
    predictors, target, test_size=0.25, random_state=0
)
for part in (X_train, X_test, Y_train, Y_test):
  print(part.shape)



In [1]:
# train test , x y split

# predictors = enc_dataset.drop("price_usd",axis=1)
# target = enc_dataset["price_usd"]
# X_train,X_test,Y_train,Y_test =train_test_split(predictors,target,test_size=0.25,random_state=0)
# print(X_train.shape)
# print(X_test.shape)
# print(Y_train.shape)
# print(Y_test.shape)




# models

- linear regression
- KNN regression
- SVM regression (SVR)
- decision tree regression
- ensemble regression
- MLP regression


In [1]:
# Candidate regressors, keyed by a short name. Each value is a one-item list
# holding the estimator; the scores are stored next to it after training.
# The tree ensembles get a fixed random_state so that repeated runs of the
# notebook give the same scores (they are randomised otherwise).
models = {   
           "LR" : [ LinearRegression(n_jobs = 5) ] ,
           "KNNR" : [ KNeighborsRegressor(n_jobs=5, n_neighbors=8) ],
           "SVR" : [ SVR(kernel="rbf") ],
           "RFR" : [ RandomForestRegressor(n_estimators = 200, random_state=0) ],
           "GBR" : [ GradientBoostingRegressor(learning_rate=0.7, n_estimators=200, random_state=0) ],
           "ABR" : [ AdaBoostRegressor(n_estimators=200, learning_rate=0.01, random_state=0) ],
           "MLP" : [ MLPRegressor(random_state=1, max_iter=500,learning_rate_init=0.01) ],
          }

In [1]:
# Fit every model on the training split, evaluate it on the test split and
# store [r2, rmse] right after the estimator in its models entry.
for model in list(models.keys()):

  print("**************************** {} ****************************".format(model))
  
  models[model][0] = models[model][0].fit(X_train,Y_train)

  res = eval(models[model][0], X_test, Y_test)
  print("r2 : ",res[0],"\nrmse :",res[1])

  # Overwrite the score slots instead of extending the list: re-running this
  # cell must not grow an entry beyond [model, r2, rmse], the three columns
  # eval_graphs builds its table from.
  models[model][1:] = res

In [1]:
# Inspect the models dict (estimators together with the scores stored for them).
models

In [1]:
# Results table plus one bar chart each for R^2 and RMSE.
eval_graphs(models)

Results: random forest (RFR) R² = 0.9, gradient boosting (GBR) R² = 0.87.