In [None]:
import seaborn as sns
# ^^^ pyforest auto-imports - don't write above this line
import plotly.express as px
import pandas as pd
import numpy as np

In [None]:
# Read the raw listings; the first CSV column is used as the row index.
csv_path = '/kaggle/input/usa-cers-dataset/USA_cars_datasets.csv'
data = pd.read_csv(csv_path, index_col=0)


|Feature|Type|Description|
|--------|---------|--------------|
|Price|Integer|The sale price of the vehicle in the ad|
|Years|Integer|The vehicle registration year|
|Brand|String|The brand of the car|
|Model|String|Model of the vehicle|
|Color|String|Color of the vehicle|
|State/City|String|The location in which the car is available for purchase|
|Mileage|Float|Miles traveled by the vehicle|
|Vin|String|The vehicle identification number, a collection of 17 characters (digits and capital letters)|
|Title Status|String|Binary classification: clean title vehicles and salvage insurance|
|Lot|Integer|A lot number is an identification number assigned to a particular quantity or lot of material from a single manufacturer. For cars, a lot number is combined with a serial number to form the Vehicle Identification Number.|
|Condition|String|Time remaining on the listing (e.g. minutes, hours or days left, or "Listing Expired")|

[Data description link](https://www.kaggle.com/doaaalsenani/usa-cers-dataset)

In [None]:
# Display the (rows, columns) size of the loaded frame.
data.shape

In [None]:
# Dump the value frequencies of every column, separated by a ruler line.
separator = '= - ='*20
for column_name in data.columns:
    print(column_name)
    print()
    print(data[column_name].value_counts())
    print(separator)
    print()

## Pre-Process

In [None]:
# Show a single row to inspect the column names before pre-processing.
data.head(1)

In [None]:
# Remove the 'vin' and 'lot' columns.
data = data.drop(columns=['vin', 'lot'])

In [None]:
# Use a more descriptive name for the mileage column.
data = data.rename(columns={'mileage': 'miles_driven'})

In [None]:
# Encode title_status as 1 (clean vehicle) / 0 (salvage insurance).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `data['title_status']` acts on a temporary Series under pandas
# Copy-on-Write and would leave `data` unchanged.
# infer_objects() turns a fully-replaced object column into an integer column
# on pandas versions where replace() no longer downcasts by itself.
data['title_status'] = (
    data['title_status']
    .replace({'clean vehicle': 1, 'salvage insurance': 0})
    .infer_objects()
)

In [None]:
# Replace the registration year by the vehicle age relative to 2020.
# pop() removes 'year' from the frame and hands back its values in one step.
data['vehicle_age'] = 2020 - data.pop('year')

In [None]:
# Convert the remaining listing time in 'condition' (minutes / hours / days
# left, or 'Listing Expired') into a number of minutes.

_MINUTES_PER_UNIT = {'minute': 1, 'hour': 60, 'day': 60 * 24}


def _condition_to_minutes(value):
    """Return the number of minutes described by one 'condition' value.

    'Listing Expired' maps to 0. Otherwise the leading integer is scaled by
    the unit that follows it (minute(s), hour(s) or day(s)). A value without
    a recognised unit is taken to be minutes already, which makes it safe to
    re-run this cell on an already converted column.
    """
    text = str(value)
    if 'Listing Expired' in text:
        return 0
    tokens = text.replace('left', '').split()
    amount = int(tokens[0])
    for unit, factor in _MINUTES_PER_UNIT.items():
        if any(token.startswith(unit) for token in tokens[1:]):
            return amount * factor
    return amount


data['condition'] = data['condition'].apply(_condition_to_minutes).astype('int')

In [None]:
# Inspect the listings located in Canada (the raw value has a leading space).
data.loc[data['country'] == ' canada']

##### Canada has only 7 rows, and 5 of the 7 have the same price (30,000) even though their miles_driven differ by about +/- 10,000 and most of the other attributes are equal. So ***removing the column country***. We still have the ***state*** column, so I think that makes sense.

In [None]:
# Remove the 'country' column.
data = data.drop(columns='country')

In [None]:
# Collapse colours that occur fewer than 10 times into a single 'other' level.
# The counts are computed once, and the result is assigned back to the column
# because replace(..., inplace=True) on `data['color']` acts on a temporary
# Series under pandas Copy-on-Write and would leave `data` unchanged.

color_counts = data['color'].value_counts()
colors_less_counts = color_counts[color_counts < 10].index
data['color'] = data['color'].replace(list(colors_less_counts), 'other')

In [None]:
# Allow up to 200 rows in displayed outputs.
pd.options.display.max_rows = 200

In [None]:
# For every model name, count how many rows belong to each brand.
data.groupby('model')['brand'].value_counts()

In [None]:
# Merge spelling variants of the same model label ('doors'/'d' -> 'door',
# 'vans' -> 'van'). The result is assigned back to the column because
# replace(..., inplace=True) on `data['model']` acts on a temporary Series
# under pandas Copy-on-Write and would leave `data` unchanged.
data['model'] = data['model'].replace({'doors': 'door', 'd': 'door', 'vans': 'van'})

### Some cars have a price of zero, and some are priced below 100.

In [None]:
# Frequency of the 10 lowest distinct prices.
data['price'].value_counts().sort_index().head(10)

In [None]:
# Listings whose price is exactly 0 or 25.
data[data['price'].isin([0, 25])]

##### From the above table we can see that most of these cars have title_status = 0 (salvage insurance). **So the price is left unchanged.**

##### We can also find some anomalies, such as indexes 309, 322, 349 and 545, whose price = 0, miles driven = 0 and age > 10.

In [None]:
# Remove the four anomalous rows by their index labels.
anomalous_rows = [309, 322, 349, 545]
data = data.drop(index=anomalous_rows)

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Independent (deep) copy used for outlier detection; copy() is deep by default.
data_copy = data.copy()

In [None]:
# Split the column names into numeric and non-numeric groups.
# Selecting on 'number' (instead of "everything that is not object") keeps
# bool, category or dedicated string dtypes out of the numeric group.
num_cols = data_copy.select_dtypes(include='number').columns
cat_cols = data_copy.select_dtypes(exclude='number').columns

### OutLier detection

[Isolation forest paper](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf)

In [None]:
from sklearn.ensemble import IsolationForest

# Fit on the numerical columns only and count the rows labelled -1.
clf = IsolationForest(random_state=1)
preds = clf.fit_predict(data_copy[num_cols])
int((preds == -1).sum())

##### Using only the numerical columns we find 386 outliers. -1 marks an outlier and 1 marks an inlier.

In [None]:
from scipy.stats import zscore

In [None]:
# One-hot encode the categorical columns and standardise every column.
# dtype=float keeps the dummy columns numeric: recent pandas versions emit
# bool dummies, which would turn the mixed frame into an object array
# before zscore can work on it.
data_copy_scaled = zscore(pd.get_dummies(data_copy, dtype=float))

In [None]:
# Same detector, this time fitted on the scaled, one-hot encoded data.
clf = IsolationForest(random_state=1)
preds = clf.fit_predict(data_copy_scaled)
int((preds == -1).sum())

Using the full data set (one-hot encoded and scaled), Isolation Forest does not flag any outliers.

## EDA

In [None]:
# Price distribution per title status, with the 0/1 codes shown as labels.
title_labels = data['title_status'].replace({0: 'salvage_insurance', 1: 'clean_vehicle'})
fig = px.box(data, x=title_labels, y='price', template='plotly_dark')
fig.show()

In [None]:
# Price distribution per brand, coloured by title status.
fig = px.box(
    data,
    x='brand',
    y='price',
    color='title_status',
    template='plotly_dark',
)
fig.show()

In [None]:
# Price distribution per colour, coloured by title status.
fig = px.box(
    data,
    x='color',
    y='price',
    color='title_status',
    template='plotly_dark',
)
fig.show()

In [None]:
# Price distribution per state.
fig = px.box(
    data,
    x='state',
    y='price',
    template='plotly_dark',
)
fig.show()

In [None]:
# Price against vehicle age; marker size follows the price, colour the brand.
fig = px.scatter(
    data,
    x="vehicle_age",
    y="price",
    color="brand",
    size='price',
    hover_data=['model'],
    template='plotly_dark',
)
fig.show()

In [None]:
# Price variation of the different models within a brand.
# Tip: double-click a brand in the legend to isolate it.

fig = px.scatter(
    data,
    x="model",
    y="price",
    color="brand",
    size='vehicle_age',
    hover_data=['vehicle_age'],
    template='plotly_dark',
)
# fig.update_traces(visible= False, selector=dict(type='scatter'))
fig.show()

In [None]:
# Price variation of each model across states.
# Tip: double-click a model in the legend to isolate it (try **mpv**).


fig = px.scatter(
    data,
    x="state",
    y="price",
    color="model",
    size='vehicle_age',
    hover_data=['brand', 'vehicle_age', 'miles_driven', 'title_status'],
    template='plotly_dark',
)
fig.show()

In [None]:
def check_mutlicolinearity(data_x):
    """Summarise pairwise correlations between the columns of ``data_x``.

    Prints, for each threshold in 0.5 .. 0.9, the fraction of column pairs
    whose absolute correlation exceeds that threshold.

    Parameters
    ----------
    data_x : pandas.DataFrame
        Numeric feature frame.

    Returns
    -------
    pandas.DataFrame
        Correlation matrix in which only the strictly lower triangle is kept;
        the diagonal and the upper triangle are NaN, so every pair appears once.
    """
    full_corr = data_x.corr()

    # Mask by position rather than by value: replacing 0 with NaN would also
    # discard pairs whose correlation is genuinely zero (and relied on
    # np.NAN, which no longer exists in NumPy 2).
    below_diagonal = np.tril(np.ones(full_corr.shape, dtype=bool), k=-1)
    corr = full_corr.where(below_diagonal)

    count_of_total_correlation_values = corr.count().sum()

    for i in [0.5, 0.6, 0.7, 0.8, 0.9]:
        count_greater_than_thresh = (corr.abs() > i).sum().sum()
        # With fewer than two usable columns there are no pairs to compare.
        if count_of_total_correlation_values:
            share = count_greater_than_thresh / count_of_total_correlation_values
        else:
            share = float('nan')
        print(f'Percent Values Greater than {i} co-relation : {share}')
    return corr

In [None]:
def plot_corr(threshold, corr):
    """Draw an annotated heatmap of the correlations above ``threshold``.

    Cells of ``corr`` whose absolute value does not exceed ``threshold``
    are blanked out (NaN) before plotting.
    """
    strong_only = corr.where(corr.abs() > threshold)
    sns.heatmap(strong_only, annot=True, cmap="YlGnBu", center=0)
    plt.show()

In [None]:
# Correlations among the numeric predictors only (target 'price' excluded).
numeric_features = data[num_cols].drop(columns='price')
corr = check_mutlicolinearity(numeric_features)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Threshold 0 keeps every non-zero correlation in the heatmap.
plot_corr(threshold=0, corr=corr)

In [None]:
# Preview the pre-processed frame before building the model inputs.
data.head()

In [None]:
# Features: every column except the target, one-hot encoded and standardised.
# dtype=float keeps the dummy columns numeric: recent pandas versions emit
# bool dummies, which would turn the mixed frame into an object array
# before zscore can work on it.
# NOTE(review): scaling is done on the full data set before any CV or
# train/test split, so the held-out folds influence the scaling statistics.
X = data.drop(columns='price')
X_scaled = zscore(pd.get_dummies(X, dtype=float))

# Target: standardised price.
Y = data['price']
Y_scaled = zscore(Y)

## Feature Selection

In [None]:
# Correlation of every numeric column with the price.
# The frame still contains string columns, which DataFrame.corr() rejects on
# recent pandas versions, so restrict it to the numeric columns first.
data.select_dtypes(include='number').corr()['price']

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression,f_regression,f_oneway

In [None]:
# Score every one-hot encoded categorical feature against the price
# with the univariate F-test; k='all' keeps every feature.
cat_dummies = pd.get_dummies(X[cat_cols])
fs = SelectKBest(f_regression, k='all')
fs.fit(cat_dummies, Y)

In [None]:
# Bar chart of the F-test score of each encoded categorical feature.
feature_names = pd.get_dummies(X[cat_cols]).columns
fig = px.bar(x=feature_names, y=fs.scores_, template='plotly_dark')
fig.show()

In [None]:
from functools import partial

# Score every encoded feature against the price with mutual information.
# The estimator is randomised, so the seed is fixed to make the scores
# (and the plot built from them) reproducible between runs.
fs = SelectKBest(score_func=partial(mutual_info_regression, random_state=0), k='all')
fs.fit(pd.get_dummies(X), Y)

In [None]:
# Bar chart of the mutual-information score of each encoded feature.
feature_names = pd.get_dummies(X).columns
fig = px.bar(x=feature_names, y=fs.scores_, template='plotly_dark')
fig.show()

## Modelling

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor,VotingRegressor
from sklearn.model_selection import cross_val_score,GridSearchCV,KFold


In [None]:
# GB_bias=[]
# GB_ve=[]
# for n in np.arange(1,100):
#     GB=GradientBoostingRegressor(n_estimators=n,random_state=0)
#     scores=cross_val_score(GB,X_scaled,Y_scaled,cv=3,scoring='neg_mean_squared_error')
#     rmse=np.sqrt(np.abs(scores))
#     GB_bias.append(np.mean(rmse))
#     GB_ve.append((np.std(rmse,ddof=1)))

#np.argmin(GB_bias)

In [None]:
# bias=[]
# ve=[]
# for n in np.arange(1,100):
#     mod=AdaBoostRegressor(base_estimator=LR,n_estimators=n,random_state=0)
#     scores=cross_val_score(mod,X_scaled,Y_scaled,cv=3,scoring='neg_mean_squared_error')
#     rmse=np.sqrt(np.abs(scores))
#     bias.append(np.mean(rmse))
#     ve.append((np.std(rmse,ddof=1)))

#np.argmin(bias)

In [None]:
# Candidate regressors.
LR = LinearRegression()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn, so the positional form
# works on both old and new releases.
LR_AB = AdaBoostRegressor(LR, n_estimators=94, random_state=0)
DT_AB = AdaBoostRegressor(n_estimators=8, random_state=0)
LR_GB = GradientBoostingRegressor(n_estimators=97, random_state=0)
# criterion='mse' was removed from scikit-learn (renamed 'squared_error');
# squared error is the default criterion, so it is simply left unset.
RF = RandomForestRegressor(random_state=0)

In [None]:
# (name, estimator) pairs to compare; the linear variants are switched off.
models = [
    # ('LinearRegression', LR),
    # ('Adaboost', LR_AB),
    ('DT_boost', DT_AB),
    ('GBoost', LR_GB),
    ('RF', RF),
]



In [None]:
# Cross-validate every candidate and report mean / spread of the fold RMSEs.
results = []
names = []
kfold = KFold(shuffle=True, n_splits=3, random_state=0)
for name, model in models:
    cv_results = cross_val_score(
        model,
        X_scaled,
        Y_scaled,
        cv=kfold,
        scoring='neg_mean_squared_error',
        n_jobs=3,
    )
    # Scores are negated MSEs; convert each fold's score to an RMSE.
    rmse = np.sqrt(np.abs(cv_results))
    results.append(rmse)
    names.append(name)
    print("%s: %f (%f)" % (name, np.mean(rmse), np.std(rmse, ddof=1)))
    
#     break


In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the scaled data for a final check (seeded split).
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    Y_scaled,
    random_state=0,
)

In [None]:
# Train AdaBoost (default base learner) on the training split;
# fit() returns the estimator itself, which is then displayed.
model = AdaBoostRegressor(n_estimators=8, random_state=0).fit(x_train, y_train)
model

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error on the training split.
y_predict = model.predict(x_train)
mean_squared_error(y_true=y_train, y_pred=y_predict)

In [None]:
# Mean squared error on the held-out test split.
y_predict = model.predict(x_test)
mean_squared_error(y_true=y_test, y_pred=y_predict)

In [None]:
from sklearn import neighbors

# Grid-search a k-nearest-neighbours regressor over the neighbour count and
# the weighting scheme, using shuffled 3-fold CV scored by negated MSE.
knn = neighbors.KNeighborsRegressor()
param_grid = dict(
    n_neighbors=np.arange(2, 5),
    weights=['uniform', 'distance'],
)
kfold = KFold(n_splits=3, shuffle=True, random_state=1)

model = GridSearchCV(
    knn,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=kfold,
    refit=True,
    verbose=5,
    n_jobs=3,
)
model.fit(X_scaled, Y_scaled)

print()
print('Best Scorer{}'.format(model.best_score_))
print('Best Parameters{}'.format(model.best_params_))

In [None]:
# Tabulate the grid-search results and show the three best-ranked settings.
res = pd.DataFrame(model.cv_results_)
res.sort_values(by='rank_test_score').iloc[:3]


KNeighborsRegressor gives the lowest bias error (0.45) and the lowest variance error (0.063).