In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the car-price table from the Kaggle input directory into a DataFrame.
CAR_PRICE_CSV = '/kaggle/input/car-price-prediction/CarPrice_Assignment.csv'
dataset = pd.read_csv(CAR_PRICE_CSV)

In [None]:
# print top-5 records
dataset.head()

In [None]:
# Check shape
dataset.shape

In [None]:
# Check statistic 
dataset.describe()

In [None]:
# check datatypes and number of records
dataset.info()

In [None]:
# Check Missing value
dataset.isnull().sum()

# Exploratory Data Analysis

### Numeric Features

In [None]:
# car_ID is treated as unimportant for the analysis, so discard it.
# Re-binding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
dataset = dataset.drop(columns='car_ID', errors='ignore')

In [None]:
# First check how many numerics,categoricals,temporal (Date, time eg..) features

# Create list of numeric features
numeric_features= list(dataset.select_dtypes(include=['int64','float64']).keys())

In [None]:
# print top-5 numeric records
dataset[numeric_features].head()

In [None]:
# Check how many descrete variables in numerics features
descrete_features=[feature for feature in numeric_features if len(dataset[feature].unique())<25]
dataset[descrete_features].head()

In [None]:
# Relationship between each discrete variable and the target: median price per level.
# groupby() does not modify `dataset`, so the frame is used directly instead of
# being fully copied on every iteration; each chart is drawn on its own explicit Axes.
for feature in descrete_features:
    fig, ax = plt.subplots()
    dataset.groupby(feature)['price'].median().plot.bar(ax=ax)
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

In [None]:
# Every numeric column that was not classified as discrete is treated as continuous.
discrete_lookup = set(descrete_features)
continous_features = [name for name in numeric_features if name not in discrete_lookup]
dataset[continous_features].head()

In [None]:
# Histogram (25 bins) of every continuous variable.
# Series.hist() only reads the data, so `dataset` is used directly instead of
# being fully copied on every iteration; each chart is drawn on its own explicit Axes.
for feature in continous_features:
    fig, ax = plt.subplots()
    dataset[feature].hist(bins=25, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('count')
    plt.show()

* > *Most of these features do not follow a normal distribution, so they are log-transformed later to bring them closer to normal.*

In [None]:
# Box-plot each continuous feature on a log scale to look for outliers.
# The logarithm is only defined for strictly positive values, so skip any feature
# that holds zeros *or negatives* (checking for exact zeros alone would let negative
# values through and turn them into NaN).
for feature in continous_features:
    values = dataset[feature]
    if (values <= 0).any():
        continue
    fig, ax = plt.subplots()
    # Work on a derived one-column frame so `dataset` itself is never modified here.
    np.log(values).to_frame(feature).boxplot(column=feature, ax=ax)
    ax.set_title(feature)
    ax.set_ylabel(feature)
    plt.show()

### Categorical Features

In [None]:
# Names of all object-typed columns, used as the categorical variables.
categorical_features = dataset.select_dtypes(include=['object']).columns.tolist()

# Preview the categorical columns.
dataset[categorical_features].head()

In [None]:
# Report the number of distinct values in every categorical column.
separator = "----------------------------------------------"
for feature in categorical_features:
    n_categories = len(dataset[feature].unique())
    print(f"The feature is {feature} and number of categories {n_categories}")
    print(separator)

In [None]:
# Median price per level for each categorical feature (CarName is excluded from these plots).
# The exclusion is done by building a filtered list rather than calling
# categorical_features.remove('CarName'): remove() mutated the shared list and raised
# ValueError as soon as this cell was executed a second time.
plot_features = [feature for feature in categorical_features if feature != 'CarName']
for feature in plot_features:
    fig, ax = plt.subplots()
    dataset.groupby(feature)['price'].median().plot.bar(ax=ax)
    ax.set_title(feature)
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

In [None]:
# Work on a copy and keep only the company part of CarName:
# the text before the first space.
data = dataset.copy()
name_parts = data['CarName'].str.split(" ", n=1, expand=True)
data['c_name'] = name_parts[0]
data.head()

In [None]:
# Median price for each company name.
plt.figure(figsize=(15, 10))
median_price_by_company = data.groupby('c_name')['price'].median()
median_price_by_company.plot.bar()

In [None]:
# Share of records per company, shown as a pie chart with percentage labels.
company_counts = data['c_name'].value_counts()
plt.figure(figsize=(20, 15))
plt.pie(
    company_counts.values,
    labels=company_counts.index,
    shadow=True,
    autopct='%1.2f%%',
)
plt.legend(loc='upper right')

# Feature Engineering

In [None]:
# Log-transform the skewed numeric features (the target, price, is in the list too).
# The logs are computed from `data` -- the copy of `dataset` taken in the company-name
# cell before any transformation -- so re-running this cell always yields log(raw value)
# instead of compounding into log(log(...)) as the in-place version did.
num_features = ['wheelbase','carlength','carwidth','carheight','curbweight','enginesize','boreratio','stroke','compressionratio','horsepower','citympg','highwaympg','price']

# The logarithm is undefined for non-positive values; fail loudly rather than
# silently writing -inf / NaN into the modelling table.
assert (data[num_features] > 0).all().all(), "log transform requires strictly positive values"

for feature in num_features:
    dataset[feature] = np.log(data[feature])

In [None]:
dataset.head()

In [None]:
#now crete new feature with help of carname
dataset['company_name']=data['CarName'].str.split(" ",n = 1, expand = True)[0]
dataset.head()

In [None]:
#drop the carname column
dataset.drop('CarName',axis=1,inplace=True)

In [None]:
dataset.head()

In [None]:
# check categories in company_name column
dataset['company_name'].value_counts()

In [None]:
# Map misspelled or inconsistently-cased company names onto one canonical spelling.
company_name_fixes = {
    # volkswagen
    'vw': 'volkswagen',
    'vokswagen': 'volkswagen',
    # porsche
    'porcshce': 'porsche',
    # toyota
    'toyouta': 'toyota',
    # nissan
    'Nissan': 'nissan',
    # mazda
    'maxda': 'mazda',
}
dataset['company_name'] = dataset['company_name'].replace(company_name_fixes)

In [None]:
dataset.head()

In [None]:
dataset['company_name'].value_counts()

In [None]:
# now Handle rare categorical feature
# remove the feature that are present less then 1% oberservation
cat_features=list(dataset.select_dtypes(include=['object']).keys())

In [None]:
cat_features

In [None]:
# For each categorical column, print the labels that occur in more than 1% of the rows.
# NOTE: the result is only printed -- no label in `dataset` is changed or removed here.
n_rows = len(dataset)
for feature in cat_features:
    label_share = dataset.groupby(feature)['price'].count() / n_rows
    frequent_labels = label_share[label_share > 0.01].index
    print(frequent_labels)

# Feature Scaling


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace every categorical column by integer codes, using a fresh encoder per column.
# NOTE(review): integer codes give the categories an order; for a linear model on
# nominal features one-hot encoding may be more appropriate -- confirm intent.
for feature in cat_features:
    encoder = LabelEncoder()
    encoded_column = encoder.fit_transform(dataset[feature])
    dataset[feature] = encoded_column

In [None]:
# Min-max scaler used below to rescale the predictors into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# Data Preparation

In [None]:
X=dataset.drop('price',axis=1)
y=dataset[['price']]

In [None]:
X.head()

In [None]:
y.head()

In [None]:
col=X.columns
x=pd.DataFrame(scaler.fit_transform(X),columns=col)
x.head()

# Feature Selection

In [None]:
# Importing statsmodels module as sm
import statsmodels.api as sm
# Adding a constant column to our X_train dataframe
X_train = sm.add_constant(X_train)
# create a first fitted model
model=sm.OLS(y_train,X_train)
lm_1 = model.fit()

In [None]:
print(lm_1.summary())

In [None]:
# Annotated correlation heatmap of all columns in `dataset`.
# Drawn with matplotlib, which the notebook already imports: the previous call went
# through `sns`, but seaborn is never imported here, so the cell failed with NameError.
corr = dataset.corr()
fig, ax = plt.subplots(figsize=(16, 10))
image = ax.imshow(corr.to_numpy(), cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')
fig.colorbar(image, ax=ax)
ax.set_xticks(range(len(corr.columns)))
ax.set_yticks(range(len(corr.index)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.index)
# Write the coefficient into every cell (the equivalent of seaborn's annot=True).
for row, row_values in enumerate(corr.to_numpy()):
    for column, value in enumerate(row_values):
        ax.text(column, row, f"{value:.2f}", ha='center', va='center', fontsize=7)
ax.set_title('Correlation matrix')
plt.show()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Define vif_scores function as stated above
def vif_score(X, add_intercept=True):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, one column per explanatory variable.
    add_intercept : bool, default True
        Prepend a column of ones to the design matrix before computing the VIFs.
        ``variance_inflation_factor`` regresses each column on all the others, and
        without an intercept in that auxiliary regression the R-squared is
        uncentered, which inflates the VIF of any feature whose mean is not zero.
        Pass ``False`` only if ``X`` already contains a constant column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'Variables'`` (the column names of ``X``) and ``'VIF'``.
        The helper intercept column, when added, is not reported.
    """
    design = X.to_numpy(dtype=float)
    # Position of the first real feature inside `design`.
    offset = 0
    if add_intercept:
        design = np.column_stack([np.ones(len(design)), design])
        offset = 1

    vif_data = pd.DataFrame({
        'Variables': X.columns,
        'VIF': [
            variance_inflation_factor(design, position + offset)
            for position in range(X.shape[1])
        ],
    })
    return vif_data




# Show the VIF score of every feature currently in x.
vif_table = vif_score(x)
print(vif_table)

Dropping wheelbase, cylindernumber, boreratio, stroke and compressionratio

In [None]:
x.drop(columns=['wheelbase','cylindernumber','boreratio','stroke','compressionratio'],axis=1,inplace=True)
print(vif_score(x))

In [None]:
x.drop(columns=['drivewheel','peakrpm'],axis=1,inplace=True)
print(vif_score(x))

In [None]:
# split the data into train_test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=41)

In [None]:
print("shape of X_train: ",X_train.shape)
print("shape of X_test: ",X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test: ",y_test.shape)

In [None]:
#import libraries
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Candidate regularisation strengths, from 0.0001 up to 1000.
alpha_grid = [
    0.0001, 0.001, 0.01, 0.05, 0.1,
    0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0,
    4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000,
]
params = {'alpha': alpha_grid}

# Base estimator whose alpha is tuned.
lasso = Lasso()

# 5-fold grid search scored by negative mean absolute error; training scores are
# kept as well so they can be plotted next to the validation scores.
grid_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
    verbose=1,
)

# Run the search on the training split.
grid_lasso.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results of the grid search.
cv_results = pd.DataFrame(data=grid_lasso.cv_results_)

# Preview the first rows.
cv_results.head()

In [None]:
# Plot the mean train and mean validation score against alpha.
# Convert param_alpha to float for plotting. The previous astype(int) truncated every
# alpha below 1 to 0, collapsing 13 of the 28 grid points onto x == 0.
cv_results['param_alpha'] = cv_results['param_alpha'].astype(float)

# plotting
plt.figure(figsize=(10, 7))
plt.xlabel('alpha', fontsize=15)
plt.ylabel('Negative mean absolute error', fontsize=15)
plt.title('Negative mean absolute error and alpha', fontsize=20)
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
# The grid spans 0.0001 .. 1000, so a log axis keeps the small alphas distinguishable.
plt.xscale('log')
plt.legend(['Train Score', 'TestScore'])

In [None]:
grid_lasso.best_params_

In [None]:
#let's take alpha value 0.01 and select best features

feature_sel_model=SelectFromModel(Lasso(alpha=0.001,random_state=0))
feature_sel_model.fit(X_train,y_train)

In [None]:
feature_sel_model.get_support()

In [None]:
selected_feat=X_train.columns[(feature_sel_model.get_support())]
selected_feat

In [None]:
# Summarise the outcome of the Lasso-based feature selection.
n_total = X_train.shape[1]
n_selected = len(selected_feat)
n_zeroed = np.sum(feature_sel_model.estimator_.coef_ == 0)
print(f"Total features: {n_total}")
print(f"selected features: {n_selected}")
print(f"features with coefficient shrank to 0 : {n_zeroed}:")

In [None]:
# crete model with selected features
from sklearn.linear_model import LinearRegression

In [None]:
model=LinearRegression()

In [None]:
#fit the data
model.fit(X_train[selected_feat],y_train)

In [None]:
y_pred=model.predict(X_test[selected_feat])

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
#calculate and print RMSE
mse=mean_squared_error(y_test,y_pred)
print("Root Mean Squared error(RMSE) is: ",np.sqrt(mse))

In [None]:
#calculate and print MSE
print("Mean squared Error(MSE) is: ",mse)

In [None]:
#calculate and print MAE
print("Mean Absolute error(MAE) is: ",mean_absolute_error(y_test,y_pred))

# Verify assumptions

## Normality of residuals

In [None]:
residual=y_test-y_pred
sns.distplot(residual)

In [None]:
np.mean(residual)

## Homoscedasticity

In [None]:
# Scatter the residuals against the predicted values to check their spread.
figure, axes = plt.subplots(figsize=(6, 2.5))
_ = axes.scatter(y_pred, residual)

## No autocorrelation of residuals

In [None]:
# Autocorrelation of the residuals for lags up to 40, with alpha=0.05 confidence bands.
import statsmodels.tsa.api as smt

acf = smt.graphics.plot_acf(residual, lags=40, alpha=0.05)
# plot_acf returns a matplotlib Figure. Figure.show() needs a GUI backend and only
# emits a warning under the notebook's inline backend, so render via plt.show() instead.
plt.show()