In [None]:
!pip install jcopml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import collections

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler,PolynomialFeatures,OneHotEncoder,OrdinalEncoder,PowerTransformer
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

from jcopml.tuning import grid_search_params as gsp, random_search_params as rsp, bayes_search_params as bsp
from jcopml.feature_importance import mean_score_decrease
from jcopml.plot import plot_actual_vs_prediction,plot_classification_report,plot_confusion_matrix,plot_correlation_matrix,plot_residual

warnings.filterwarnings("ignore")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Audi used-car listings from the Kaggle dataset and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/used-car-dataset-ford-and-mercedes/audi.csv"
)
df.head(n=5)

# Exploratory Data Analysis (EDA)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Summary of the object-dtype (text) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include='O')

In [None]:
# Show the distinct values of each categorical column.
for column in ('model', 'transmission', 'fuelType'):
    print("Unique Values Colums : ", column)
    print(df[column].unique())

In [None]:
# Split the column names into categorical (object dtype) and numeric
# (int64 / float64) groups; later plotting cells iterate over these lists.
#
# The builtin `object` is used instead of `np.object`: that alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
cat_mask = (df.dtypes == object)
num_mask = (df.dtypes == np.int64) | (df.dtypes == np.float64)

cols_cat = df.columns[cat_mask].to_list()
cols_num = df.columns[num_mask].to_list()

print(cols_cat)
print(cols_num)

In [None]:
# One bar chart of value frequencies per categorical column, side by side.
fig, axs = plt.subplots(ncols=3, figsize=(12, 8))

for ax, col in zip(axs.flat, cols_cat):
    counts = df[col].value_counts()
    counts.plot(kind='bar', ax=ax)
    ax.set_title(col)
    ax.tick_params(axis='x')
    ax.tick_params(axis='y')

plt.tight_layout()
plt.show()

In [None]:
# 2x3 grid of 10-bin histograms, one per numeric column.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(15, 8))

for ax, col in zip(axs.flat, cols_num):
    values = df[col]
    values.plot(kind='hist', ax=ax, bins=10)
    ax.set_title(col)
    ax.tick_params(axis='x')
    ax.tick_params(axis='y')

plt.tight_layout()
plt.show()

In [None]:
# Price against fuel type; one bar is drawn per row, so bars of the same
# category overlap.
fig, ax = plt.subplots()
ax.bar(df['fuelType'], df['price'])
ax.set_xlabel("Fuel Type")
ax.set_ylabel("Price");

In [None]:
# Price against transmission type; one bar is drawn per row, so bars of the
# same category overlap.
fig, ax = plt.subplots()
ax.bar(df['transmission'], df['price'])
ax.set_xlabel("Transmission")
ax.set_ylabel("Price");

In [None]:
# Price against car model on a wide figure so the model names stay readable.
# The x-axis label previously read "Transmission" (copy-paste from the
# transmission plot); it now matches the plotted column.
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(df['model'], df['price'])
ax.set_xlabel("Model")
ax.set_ylabel("Price")
fig.tight_layout();

In [None]:
# Box plots of the frame's columns on a shared axis to eyeball scale and outliers.
fig, ax = plt.subplots(figsize=(10, 8))
sns.boxplot(data=df, ax=ax);

In [None]:
# Pearson correlation is only defined for numeric data. Select the numeric
# columns explicitly: on pandas >= 2.0 `DataFrame.corr()` no longer drops
# object columns silently and raises ValueError on the text columns instead.
pearson_corr = df.select_dtypes(include='number').corr()

# Fix the colour scale to [-1, 1] so colours are comparable across runs.
figure = plt.figure(figsize=(15,10))
sns.heatmap(pearson_corr,annot=True, vmin=-1, vmax=+1)
plt.title("PEARSON CORRELATION")
plt.show()

In [None]:
# Skewness of every numeric column, shown as a one-column table.
df[cols_num].skew(axis=0).to_frame(name="skew")

# Splitting Data

In [None]:
# Target is the listing price; every other column is a candidate feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Pipeline

In [None]:
# Numeric branch: scale with RobustScaler.
num_pipe = Pipeline([("scaler", RobustScaler())])

# Categorical branch: one-hot encode. `handle_unknown="ignore"` makes the
# encoder emit an all-zero row for a category it did not see during fit,
# instead of raising when a CV fold or the test split contains a rare value
# that is missing from the fitted data.
cat_pipe = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Ordinal branch: defined but not wired into the transformer below.
ordinal_pipe = Pipeline([("encoder", OrdinalEncoder())])

# Columns not listed here ('model', 'year') are dropped by ColumnTransformer's
# default remainder behaviour.
preprosesor = ColumnTransformer([("numeric", num_pipe, ['mileage', 'tax', 'mpg', 'engineSize']),
                                 ("categorical", cat_pipe, ['transmission', 'fuelType']),])

# Training

In [None]:
# Baseline model: preprocessing followed by a random forest regressor.
pipeline = Pipeline(steps=[
    ("prep", preprosesor),
    ("algo", RandomForestRegressor(random_state=42)),
])

# Despite its name, `lr` is a grid search over the random-forest pipeline,
# using the parameter grid `gsp.rf_params` with 3-fold cross-validation.
lr = GridSearchCV(
    estimator=pipeline,
    param_grid=gsp.rf_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
lr.fit(X_train, y_train)

# Evaluation
print(lr.best_params_)
print(lr.score(X_train, y_train), lr.best_score_, lr.score(X_test, y_test))

In [None]:
# Feature importance for the fitted search `lr` on the training data, with a
# plot requested via plot=True and topk=10.
# NOTE(review): presumably a permutation-style "mean score decrease" that shows
# the top 10 features — confirm against the jcopml documentation.
df_imp = mean_score_decrease(X_train, y_train, lr, plot=True, topk=10)

In [None]:
# Residual plot for the baseline search `lr` on both the train and test splits.
# NOTE(review): lowess=False presumably disables the smoothed trend line —
# confirm against the jcopml documentation.
plot_residual(X_train, y_train, X_test, y_test, lr, lowess=False)

# Hyperparameter Tuning with Polynomial Features

In [None]:
# Numeric branch: robust scaling, power transform, then polynomial expansion.
# The polynomial degree set here is overridden by the grid below.
num_pipe = Pipeline(steps=[
    ("scaler", RobustScaler()),
    ("transform", PowerTransformer()),
    ("poly", PolynomialFeatures(degree=2)),
])

# Defined for parity with the earlier pipeline cell; not used by the
# transformer below.
cat_pipe = Pipeline(steps=[("encoder", OneHotEncoder())])
ordinal_pipe = Pipeline(steps=[("encoder", OrdinalEncoder())])

# NOTE(review): only three numeric columns are fed to the model here; the
# categorical columns and 'tax' are dropped — confirm this is intentional.
preprosesor = ColumnTransformer(
    transformers=[("numeric", num_pipe, ['mileage', 'mpg', 'engineSize'])]
)

pipeline = Pipeline(steps=[
    ("prep", preprosesor),
    ("algo", RandomForestRegressor(random_state=42)),
])

# Search space: polynomial settings of the numeric branch plus forest size,
# depth, feature fraction and leaf size.
parameter = dict(
    prep__numeric__poly__degree=[1, 2, 3],
    prep__numeric__poly__interaction_only=[True, False],
    algo__n_estimators=[100, 103, 105],
    algo__max_depth=[10, 11, 12],
    algo__max_features=[0.1, 0.11, 0.12],
    algo__min_samples_leaf=[1, 3, 5],
)

lr1 = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
lr1.fit(X_train, y_train)

# Evaluation
print(lr1.best_params_)
print(lr1.score(X_train, y_train), lr1.best_score_, lr1.score(X_test, y_test))

In [None]:
# Residual plot for the tuned search `lr1` on both the train and test splits.
# NOTE(review): lowess=False presumably disables the smoothed trend line —
# confirm against the jcopml documentation.
plot_residual(X_train, y_train, X_test, y_test, lr1, lowess=False)

In [None]:
# Actual-versus-predicted plot for the tuned search `lr1`, given both the
# train and test splits.
plot_actual_vs_prediction(X_train, y_train, X_test, y_test, lr1)

In [None]:
# Mean squared error of the tuned model's predictions on the held-out test set.
y_pred = lr1.predict(X_test)
print(
    "Mean Squared Error : ",
    mean_squared_error(y_true=y_test, y_pred=y_pred),
)