In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Ignore every warning raised from here on. NOTE: this is a blanket filter, so
# deprecation notices from the libraries used below are silenced as well.
warnings.simplefilter("ignore")

##### **import data**

In [None]:
# Read the car-price CSV from the Kaggle input directory and preview the first rows.
DATA_PATH = "/kaggle/input/car-price-prediction/CarPrice_Assignment.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

#### **drop useless columns**

In [None]:
# Remove the columns that are not used as features in this analysis.
# Re-binding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['car_ID', 'symboling', 'CarName'], errors='ignore')

##### quick eda

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Correlation of each numeric column with price, sorted ascending.
# numeric_only=True is needed because df still contains string columns here:
# pandas >= 2.0 no longer drops them silently, so a bare DataFrame.corr() raises.
df.corr(numeric_only=True)['price'].sort_values()

In [None]:
# One scatter plot of price against each non-object column, stacked vertically.
df_num_cols = df.select_dtypes(exclude='object').columns
fig, axes = plt.subplots(nrows=len(df_num_cols), figsize=(4, 52))
for ax, col in zip(axes, df_num_cols):
    sns.scatterplot(data=df, x=col, y='price', ax=ax)

* `enginesize`, `curbweight`, and `horsepower` are strongly related to car price.

##### feature engineering

##### 1. missing data

In [None]:
# Number of null values in each column.
df.isnull().sum()

* No missing data. (If there were missing values, we would have to fill, drop, or otherwise fix them.)

##### 2. outlier

In [None]:
# Price against engine size; the trailing semicolon suppresses the Axes repr.
sns.scatterplot(data=df, x='enginesize', y='price');

In [None]:
# Price against curb weight; the trailing semicolon suppresses the Axes repr.
sns.scatterplot(data=df, x='curbweight', y='price');

In [None]:
# Drop outliers: rows priced above 30000 while having a curb weight below 3000.
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back to `df`; otherwise the outliers silently stay in
# the data used for modelling.
df_drop_idx = df[(df['price'] > 30000) & (df['curbweight'] < 3000)].index
df = df.drop(df_drop_idx, axis=0)
# Show the remaining (rows, columns) to confirm the drop took effect.
df.shape

In [None]:
# Price against horsepower; the trailing semicolon suppresses the Axes repr.
sns.scatterplot(data=df, x='horsepower', y='price');

In [None]:
# Drop outliers: rows priced above 40000 while having horsepower below 200.
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result must be assigned back to `df`; otherwise the outliers silently stay in
# the data used for modelling.
df_drop_idx = df[(df['price'] > 40000) & (df['horsepower'] < 200)].index
df = df.drop(df_drop_idx, axis=0)
# Show the remaining (rows, columns) to confirm the drop took effect.
df.shape

* In these scatter plots we can see a positive linear relationship with price, plus some outliers (e.g. cars whose price is very high for their horsepower or curb weight).

##### 3. categorical data processing

In [None]:
# Split the frame by dtype into object (string) columns and everything else.
# Despite the `_cols` suffix, both results are DataFrames (select_dtypes returns
# a frame), not lists of column names. Note that `df_num_cols` was bound to a
# column Index in the earlier plotting cell and is rebound to a DataFrame here.
df_str_cols = df.select_dtypes(include = 'object')
df_num_cols = df.select_dtypes(exclude = 'object')

In [None]:
# One-hot encode the string columns with get_dummies. drop_first=True removes
# the first level of each categorical column, leaving k-1 indicators for k levels.
df_str_dummies = pd.get_dummies(df_str_cols, drop_first=True)
df_str_dummies

In [None]:
# Place the non-object columns and the dummy columns side by side (axis=1)
# to form the frame used for modelling.
final_df = pd.concat([df_num_cols, df_str_dummies], axis=1)
final_df

In [None]:
# Print the column names, dtypes and non-null counts of the combined frame.
final_df.info()

In [None]:
# Correlation of every column of final_df with price, sorted ascending.
final_df.corr()['price'].sort_values()

* `enginesize`, `curbweight`, and `horsepower` are still the three columns most correlated with price.

##### train/ test set split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# The target is the price column; every other column of final_df is a feature.
y = final_df['price']
X = final_df.drop(columns=['price'])

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

##### scaling data

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

##### fit model using ElasticNet and GridSearchCV

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

In [None]:
# ElasticNet estimator created with its default arguments; it is handed to the
# grid search below as the estimator to tune.
base_elastic_net_model = ElasticNet()

In [None]:
# Candidate hyper-parameter values explored by the grid search.
param_grid = dict(
    alpha=[0.1, 0.01, 0.05, 0.001],
    l1_ratio=[0.65, 0.66, 0.67],
)

In [None]:
# 5-fold cross-validated grid search over param_grid, scored by negated mean
# squared error (scikit-learn maximises scores, so errors are negated).
grid_model = GridSearchCV(
    base_elastic_net_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=0,
)

In [None]:
# Run the grid search on the training features and targets.
grid_model.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_model.best_params_

* According to the grid search, the best parameters are alpha: 0.1, l1_ratio: 0.65.

In [None]:
# Predict the target for the held-out test features.
y_pred = grid_model.predict(X_test)

##### estimate mae, rmse, r2_score

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
# Mean absolute error between the test targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
mae

In [None]:
# Root mean squared error: the square root of the MSE between test targets and predictions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R-squared (coefficient of determination) of the predictions on the test targets.
r2 = r2_score(y_test, y_pred)
r2

In [None]:
# Collect the three test-set metrics into a single-row summary table.
report = pd.DataFrame(
    {'R-SQUARED': [r2], 'RMSE': [rmse], 'MAE': [mae]},
    index=['grid_model'],
)
report

### conclusion
Using an ElasticNet model tuned with grid search, we get an R² score of about 0.86 (with alpha: 0.1, l1_ratio: 0.65).
The top 5 features that affect price are carlength, carwidth, horsepower, curbweight, and enginesize.