# 1. Importing data and Python packages

In [None]:
import os

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#loading the car price dataset; the car_ID column becomes the DataFrame index
df=pd.read_csv('/kaggle/input/car-price-prediction/CarPrice_Assignment.csv', index_col='car_ID')

# 2. Data overview

In [None]:
#first rows of the loaded data
df.head()

In [None]:
#column dtypes and non-null counts
df.info()

In [None]:
#summary statistics of the numerical columns
df.describe()

In [None]:
#number of missing values per column
df.isna().sum()

In [None]:
#target column value distribution
#histplot(kde=True, stat='density') replaces the deprecated sns.distplot and draws the same histogram + KDE
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['price'], kde=True, stat='density', ax=ax)
#the plotted column is the car price, so the title says so
ax.set_title("Car price distribution");

In [None]:
#splitting the column names by dtype: object columns are treated as categorical, everything else as numerical
is_object_column = df.dtypes == 'object'
numerical_features = df.columns[~is_object_column].tolist()
categorical_features = df.columns[is_object_column].tolist()

In [None]:
#scatter plot of every numerical feature against the target
#'price' itself is skipped: plotting the target against itself carries no information
scatter_features = [col for col in numerical_features if col != 'price']
n_cols = 5
#ceiling division, so the grid always has enough panels whatever the number of features
n_rows = max(1, -(-len(scatter_features) // n_cols))
f, axes = plt.subplots(n_rows, n_cols, figsize=(30, 10 * n_rows), squeeze=False)
panels = axes.ravel()
for ax, feature in zip(panels, scatter_features):
    sns.scatterplot(data=df, x=feature, y="price", ax=ax)
#hide the panels left over when the features do not fill the grid exactly
for ax in panels[len(scatter_features):]:
    ax.set_visible(False)

In [None]:
#categorical columns distribution
n_cols = 5
#ceiling division, so the grid always has enough panels whatever the number of features
n_rows = max(1, -(-len(categorical_features) // n_cols))
f, axes = plt.subplots(n_rows, n_cols, figsize=(30, 15 * n_rows), squeeze=False)
panels = axes.ravel()
for ax, feature in zip(panels, categorical_features):
    sns.countplot(data=df, x=feature, ax=ax)
    #vertical tick labels keep the category names from overlapping
    ax.tick_params(axis='x', rotation=90)
#hide the panels left over when the features do not fill the grid exactly
for ax in panels[len(categorical_features):]:
    ax.set_visible(False)

# 3. Cleaning and transforming data

In [None]:
# adding new column - car brand (the first word of the car name, used instead of the full name)
# the vectorised .str accessor replaces the row-wise lambda and yields NaN instead of raising on an empty name
df['brand'] = df['CarName'].str.split().str[0]

In [None]:
#checking the frame with the new brand column
df.head()

In [None]:
#number of cars per brand (also exposes misspelled brand names)
df['brand'].value_counts()

In [None]:
#looking up the rows whose brand is misspelled as 'vokswagen'
df.loc[df['brand'].eq('vokswagen')]

In [None]:
#correcting misspelled / inconsistently written brand names
#one mapping applied to the brand column only, instead of six separate frame-wide replace calls
brand_corrections = {
    'vokswagen': 'volkswagen',
    'toyouta': 'toyota',
    'porcshce': 'porsche',
    'Nissan': 'nissan',
    'maxda': 'mazda',
    'vw': 'volkswagen',
}
df['brand'] = df['brand'].replace(brand_corrections)

In [None]:
#brand counts again, to confirm the corrected spellings were merged
df['brand'].value_counts()

In [None]:
#deleting carname column
#errors='ignore' keeps the cell re-runnable: a second run no longer raises KeyError
df = df.drop(columns=['CarName'], errors='ignore')

In [None]:
#checking the frame after the CarName column was removed
df.head()

In [None]:
#mean price per brand, sorted from lowest to highest
ax = df.groupby('brand')['price'].mean().sort_values().plot(kind='bar')
#label the axes so the bar heights have a stated meaning
ax.set(xlabel='brand', ylabel='mean price', title='Mean price by brand');

In [None]:
#dropping duplicates
#rows are compared on all columns; car_ID is the index (see read_csv), so it does not take part in the comparison
df=df.drop_duplicates()

In [None]:
#separating the predictors (X) from the target column (y)
X = df.drop(columns=['price'])
y = df['price']

In [None]:
#one-hot encoding the features with get_dummies; drop_first=True drops the first level of each encoded column
X=pd.get_dummies(X, drop_first=True)

In [None]:
#first rows of the encoded feature matrix
X.head()

In [None]:
#number of feature columns after encoding
X.shape[1]

In [None]:
#log-transforming the target variable with log1p, i.e. log(1 + price)
#computed from df['price'] rather than from y, so re-running this cell does not apply the log twice
y = np.log1p(df['price'])

In [None]:
#distribution of the transformed target
#histplot(kde=True, stat='density') replaces the deprecated sns.distplot and draws the same histogram + KDE
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(y, kde=True, stat='density', ax=ax)
#y holds the log1p-transformed price at this point, so the title says so
ax.set_title("Log-transformed price distribution");

In [None]:
#scaling data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(X)

#4. Building model

In [None]:
#dividing dataset into train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)

In [None]:
#LassoCV
#fitting a LassoCV model with its default settings on the training split
from sklearn.linear_model import LassoCV
lassocv=LassoCV()
lassocv.fit(X_train, y_train)

In [None]:
#score of the fitted model on the held-out test split (R^2 for scikit-learn regressors);
#y was log1p-transformed above, so the score is measured on the log scale
lassocv.score(X_test, y_test)

In [None]:
#alpha value stored on the fitted LassoCV model
lassocv.alpha_

In [None]:
#ElasticNet
from sklearn.linear_model import ElasticNetCV

In [None]:
#ElasticNetCV model with its default settings
en=ElasticNetCV()

In [None]:
#fitting on the same training split as the LassoCV model
en.fit(X_train, y_train)

In [None]:
#score on the same held-out test split, for comparison with the LassoCV score (log-scale target)
en.score(X_test, y_test)

In [None]:
#alpha value stored on the fitted ElasticNetCV model
en.alpha_

In my run, the LassoCV model gave the best test score, with alpha=0.007066135267376986.