In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = (os.path.join(dirname, filename) for filename in filenames)
    for file_path in file_paths:
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Import Modules
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from sklearn.metrics import mean_squared_error,mean_absolute_error
import warnings
# Silence every warning for the rest of the session (note: this also hides deprecation notices).
warnings.simplefilter(action="ignore")

# Read the car-price CSV from the Kaggle input directory and report its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer="../input/car-price-prediction/CarPrice_Assignment.csv")
data_shape = data.shape
print("Shape of Data  :  {}".format(data_shape))

In [None]:
# Preview the first ten rows of the freshly loaded frame.
data.head(n=10)

In [None]:
# Remove the car_ID column (presumably just a row identifier, not a feature -- confirm).
# Rebinding instead of dropping in place, together with errors="ignore", keeps this cell
# idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns="car_ID", errors="ignore")

In [None]:
# Collect column names by dtype: object columns are treated as categorical,
# int64/float columns as numeric.
categorical = data.select_dtypes(include="object").columns
numeric = data.select_dtypes(include=["int64", "float"]).columns

# Report how many columns fall into each group.
n_categorical = len(categorical)
n_numeric = len(numeric)
print("Number of Categorical Data :{}".format(n_categorical))
print("Number of Numeric Data :{}".format(n_numeric))

In [None]:
# Count missing entries per column, then add them up for the whole frame.
na_per_column = data.isna().sum()
print("Number of NA Values for all of data : {}".format(na_per_column.sum()))

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is one row.
data.describe().transpose()

In [None]:
# Histogram of the target (price); the cell's output value is the sample skewness.
sns.displot(data["price"], color="red")
st.skew(data["price"])

* The price variable has a skewed distribution, so I will try several transformation techniques to bring it closer to a normal distribution.

In [None]:
# Log transform: log(1 + price), then check how close the result is to a normal distribution.
log_price = np.log1p(data.price)

# histplot(kde=True, stat="density") is the replacement for the deprecated sns.distplot.
sns.histplot(log_price, kde=True, stat="density", color="red")

# kstest(x, "norm") alone compares against the *standard* normal N(0, 1), which any
# un-standardized sample fails regardless of shape. Passing the sample mean/std as the
# distribution parameters makes the test about shape only.
# NOTE: because loc/scale are estimated from the same sample, the p-value is optimistic;
# use it to compare the transforms against each other, not as an exact significance level.
ks_result = st.kstest(log_price, "norm", args=(log_price.mean(), log_price.std(ddof=1)))
print("P-Value : {}".format(ks_result[1]))

In [None]:
# Square-root transform of price, then check how close the result is to a normal distribution.
sqrtprice = np.sqrt(data.price)

# histplot(kde=True, stat="density") is the replacement for the deprecated sns.distplot.
sns.histplot(sqrtprice, kde=True, stat="density")

# kstest(x, "norm") alone compares against the *standard* normal N(0, 1), which any
# un-standardized sample fails regardless of shape. Passing the sample mean/std as the
# distribution parameters makes the test about shape only.
# NOTE: because loc/scale are estimated from the same sample, the p-value is optimistic;
# use it to compare the transforms against each other, not as an exact significance level.
ks_result = st.kstest(sqrtprice, "norm", args=(sqrtprice.mean(), sqrtprice.std(ddof=1)))
print("P-Value : {}".format(ks_result[1]))

In [None]:
# Box-Cox transform of price. With no lambda given, st.boxcox returns the transformed
# values together with the lambda it fitted.
boxcox_trans = st.boxcox(data.price)
results, lam = boxcox_trans  # transformed values, fitted lambda

# histplot(kde=True, stat="density") is the replacement for the deprecated sns.distplot.
sns.histplot(results, kde=True, stat="density")

# kstest(x, "norm") alone compares against the *standard* normal N(0, 1), which any
# un-standardized sample fails regardless of shape. Passing the sample mean/std as the
# distribution parameters makes the test about shape only.
# NOTE: because loc/scale are estimated from the same sample, the p-value is optimistic;
# use it to compare the transforms against each other, not as an exact significance level.
ks_result = st.kstest(results, "norm", args=(results.mean(), results.std(ddof=1)))
print("P-Value : {}".format(ks_result[1]))

* Judging by the p-values and the histograms, the Box-Cox transform gives the result closest to a normal distribution, so I will convert the price variable with Box-Cox.

In [None]:
# Disabled: overwrite the target with its Box-Cox transform and keep the fitted lambda.
# If this is re-enabled, model predictions will be on the Box-Cox scale and have to be
# mapped back (e.g. with scipy.special.inv_boxcox) before being compared with real prices.
#data.price = results
#lambda_price = lam

In [None]:
# Display the frame; price is still on its original scale because the Box-Cox
# assignment is commented out.
data

#### Distributions of categorical features

In [None]:
# Print a frequency table for every categorical column, one banner per column.
for col in categorical:
    counts = data[col].value_counts()
    print("####", col, "####")
    print(counts, '\n')

In [None]:
# Print the number of distinct values in each categorical column.
for col, n_unique in data[categorical].nunique().items():
    print(col, ":::", n_unique)

* I'll take the car brands from the CarName variable, add them to the dataset and drop the CarName variable.

In [None]:
# Derive a `brand` column from CarName and normalise known misspellings / aliases.
# Keys are the spellings to fix; values are the spellings they are replaced with.
brand_fixes = {
    "maxda": "mazda",
    "Nissan": "nissan",
    "porcshce": "porsche",
    "vokswagen": "volkswagen",
    "vw": "volkswagen",
    "toyouta": "toyota",
    "alfa-romero": "alfa-romeo",
}

# The brand is the first whitespace-separated token of the car name.
# The cleaned Series is assigned back to the column explicitly:
# `data.brand.replace(..., inplace=True)` operates on a temporary Series and leaves the
# frame untouched when pandas Copy-on-Write is active.
data["brand"] = (
    data["CarName"]
    .astype("str")
    .str.split()
    .str[0]
    .replace(brand_fixes)
)

# CarName is no longer needed once the brand has been extracted.
data = data.drop(columns="CarName")


In [None]:
# Number of cars per brand after the clean-up.
print(data["brand"].value_counts())

### Feature Selection

In [None]:
# Correlation matrix of the numeric columns, shown as a colour-graded table.
# The frame still holds object (string) columns at this point, and DataFrame.corr()
# raises on those in pandas >= 2.0, so restrict it to numeric columns explicitly.
corr = data.select_dtypes("number").corr()
corr.style.background_gradient(cmap='coolwarm')

## Encoding

In [None]:
# One-hot encode every object column (first level dropped) and place the dummy
# columns in front of the remaining columns.
categorical = data.select_dtypes(include="object").columns
dummy = pd.get_dummies(data[categorical], drop_first=True)
data = pd.concat([dummy, data.drop(columns=categorical)], axis=1)

In [None]:
# Separate the target (price) from the predictor columns.
y = data["price"]
X = data.drop(columns="price")

# Model

In [None]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import KFold,cross_val_predict
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from scipy.special import inv_boxcox
# 5-fold shuffled splitter with a fixed seed, plus one scaler instance; both are
# reused by the modelling cells that follow.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scale = StandardScaler()

In [None]:
# Simple Linear Regression: fit on each training fold, score on the matching test fold,
# and plot predicted against actual prices for that fold.
scores = []
lr = LinearRegression()
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model = lr.fit(X_train, y_train)
    pred = model.predict(X_test)
    # Only needed when the target has been Box-Cox transformed (currently it is not):
    #pred = inv_boxcox(pred,lam)

    # r2_score expects (y_true, y_pred) and is not symmetric, so the order matters.
    scores.append(r2_score(y_test, pred))

    plt.scatter(y_test.values, pred)
    plt.xlabel("Actual price")
    plt.ylabel("Predicted price")
    plt.show()
print(scores)

In [None]:
# Lasso regression: out-of-fold R2 for a log-spaced grid of regularisation strengths.
alphas = np.geomspace(0.01, 20, num=15)
scores = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=10000)
    estimator = Pipeline(steps=[("scaler", scale), ("lasso_regression", lasso)])
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    fold_free_r2 = r2_score(y, predictions)
    scores.append(fold_free_r2)

# Plot of the R2 score obtained for each alpha (log-scaled x axis).
plt.semilogx(alphas, scores, "-*")
r2_lasso = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Table of alpha values and the R2 score each one achieved in the Lasso sweep.
r2_lasso

In [None]:
# Lasso on degree-2 polynomial features: out-of-fold R2 for five log-spaced alphas.
pf = PolynomialFeatures(degree=2)
alphas = np.geomspace(1, 20, num=5)
scores = []

for alpha in alphas:
    lasso = Lasso(alpha=alpha, max_iter=100000)
    pipeline_steps = [
        ("polynomial_feature", pf),
        ("scaler", scale),
        ("lasso_regression", lasso),
    ]
    estimator = Pipeline(steps=pipeline_steps)
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    scores.append(r2_score(y, predictions))

# R2 against alpha on a log-scaled x axis, plus the same numbers as a table.
plt.semilogx(alphas, scores, "-*")
pf_lasso_r2 = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Table of alpha values and R2 scores from the polynomial-feature Lasso sweep.
pf_lasso_r2

In [None]:
# Ridge regression on degree-3 polynomial features: for twenty log-spaced alphas,
# print the out-of-fold RMSE and record the out-of-fold R2.
pf = PolynomialFeatures(degree=3)
alphas = np.geomspace(0.1, 20, num=20)
scores = []

for alpha in alphas:
    ridge = Ridge(alpha=alpha, max_iter=100000)
    pipeline_steps = [
        ("polynomial_feature", pf),
        ("scaler", scale),
        ("ridge_regression", ridge),
    ]
    estimator = Pipeline(steps=pipeline_steps)
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    rmse = np.sqrt(mean_squared_error(y, predictions))
    print("For Alpha :: {}".format(alpha), "----> Root Mean Squared Error : {}".format(rmse))
    scores.append(r2_score(y, predictions))

# R2 against alpha on a log-scaled x axis, plus the same numbers as a table.
plt.semilogx(alphas, scores, "-o")
ridge_r2 = pd.DataFrame({"Alpha": alphas, "R2_Score": scores})

In [None]:
# Table of alpha values and R2 scores from the polynomial-feature Ridge sweep.
ridge_r2

In [None]:
from sklearn.model_selection import GridSearchCV

# Joint search over polynomial degree and Ridge alpha, cross-validated with the shared splitter.
pipeline_steps = [
    ("polynomial", PolynomialFeatures(include_bias=False)),
    ("scale", scale),
    ("ridge_regression", Ridge()),
]
estimator = Pipeline(steps=pipeline_steps)
params = {
    "polynomial__degree": [1, 2, 3],
    "ridge_regression__alpha": np.geomspace(4, 20, 30),
}
grid = GridSearchCV(estimator, param_grid=params, cv=kf)
grid.fit(X, y)

# Mean cross-validated score of the best parameter combination.
grid.best_score_

## Main Model

In [None]:
# Parameter combination that achieved the best cross-validated score in the grid search.
grid.best_params_

## Conclusion 
* The R² scores of the models built with linear regression, Lasso, and Ridge regression are close to each other. Ridge regression achieved the highest score, although Lasso may be preferred because it runs faster than Ridge regression. Optimal parameters: degree = 3, alpha ≈ 20.

In [None]:
from sklearn.metrics import mean_squared_error

# Fit the selected Ridge pipeline (degree 3, alpha 20) and evaluate it on a held-out fold.

# Build the train/test split explicitly instead of relying on X_train/X_test/y_train/y_test
# left over from the last iteration of an earlier loop. Because `kf` has a fixed integer
# random_state, its last fold is the same split that loop leaves behind.
train_index, test_index = list(kf.split(X))[-1]
X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

estimator = Pipeline([("polynomial", PolynomialFeatures(degree=3, include_bias=False)),
                      ("scale", scale),
                      ("ridge_regression", Ridge(alpha=20.000000000000004))])
estimator.fit(X_train, y_train)
predict = estimator.predict(X_test)

# NOTE: degree/alpha were chosen by a grid search run on all of X, which includes this
# test fold, so these numbers are an optimistic estimate of out-of-sample performance.
# All metrics take (y_true, y_pred); r2_score in particular is not symmetric.
print("R2 Score for Ridge Regression : {}".format(r2_score(y_test, predict)))
print("Root Mean Squared Error : {}".format(np.sqrt(mean_squared_error(y_test, predict))))
print("Mean Absolute Error : {}".format(mean_absolute_error(y_test, predict)))
