In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# importing libraries
import matplotlib.pyplot as plt 
import seaborn as sns
import statsmodels.api as sm # For OLS(Ordinary Least Square)
%matplotlib inline
# Show every column when displaying DataFrames (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental
# `pd.pandas` attribute, which is not part of the documented API.
pd.set_option('display.max_columns', None)
import warnings
warnings.simplefilter("ignore") 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score

In [None]:
# Load the car price dataset from the Kaggle input directory and preview the first rows
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/car-price-prediction/CarPrice_Assignment.csv"
)
data.head()

In [None]:
# Drop unnecessary features.
# `data` is rebound (no inplace=True) and errors='ignore' is set so that the
# cell can be re-run safely: a second execution no longer raises KeyError for
# columns that have already been removed.
data = data.drop(columns=['car_ID', 'symboling', 'CarName'], errors='ignore')

# **EXPLORATORY DATA ANALYSIS**

In [None]:
# Look for missing values: info() prints each column's dtype and non-null count
data.info()

**OBSERVATION : we see no NULL values**

In [None]:
# Numerical features: every column whose dtype is not object ("O")
numerical_feature = [
    name
    for name, dtype in data.dtypes.items()
    if dtype != "O"
]
numerical_feature

In [None]:
# Discrete numerical features: numerical columns with at most 25 distinct values
# (threshold = 25)
discrete_feature = [
    name
    for name in numerical_feature
    if data[name].nunique(dropna=False) <= 25
]
discrete_feature

In [None]:
# Check how 'price' depends on each discrete numerical feature: one bar per
# distinct feature value, bar height = mean price (barplot's default estimator).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally (only `data` may be positional), so the old call raises TypeError.
for feature in discrete_feature:
    sns.barplot(x=data[feature], y=data['price'], errwidth=0)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Continuous numerical features: numerical columns that are neither discrete
# nor the target. The exclusion set is derived from `discrete_feature` instead
# of a hard-coded name list, so it stays consistent with the threshold used to
# define discrete features ('car_ID' was already dropped and needs no entry).
excluded = set(discrete_feature) | {'price'}
continuous_feature = [feature for feature in numerical_feature if feature not in excluded]
continuous_feature

In [None]:
# Plot the distribution of every continuous feature, one figure per feature.
# NOTE(review): distplot is deprecated in recent seaborn releases in favour of
# histplot/displot — confirm the installed version before migrating.
for feature in continuous_feature:
    column = data[feature]
    sns.distplot(column)
    plt.show()

**OBSERVATION: `enginesize` is right-skewed.**

In [None]:
# Log-transform 'enginesize' and the target 'price' on a copy of the data
# (the original `data` frame is left untouched), then re-plot the
# distribution of the transformed 'enginesize'
df = data.assign(
    enginesize=np.log(data['enginesize']),
    price=np.log(data['price']),
)
sns.distplot(df['enginesize'])
plt.show()

In [None]:
# Scatter plot of every continuous feature against 'price', both taken from
# `df` (where 'enginesize' and 'price' are log-transformed).
# x and y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally, so the old call raises TypeError there.
for feature in continuous_feature:
    sns.scatterplot(x=df[feature], y=df['price'])
    plt.show()

**OBSERVATION: Features such as [ 'curbweight' , 'enginesize' , 'horsepower' ] have a strong correlation with 'price'.**

In [None]:
# Check for outliers: one horizontal box plot per continuous feature.
# The series is passed explicitly as `x`; on seaborn >= 0.12 a lone positional
# argument is interpreted as `data`, which changes how the box is drawn.
for feature in continuous_feature:
    sns.boxplot(x=df[feature])
    plt.show()

**OBSERVATION: [ 'wheelbase', 'carlength' , 'carwidth' , 'enginesize' , 'stroke' , 'compressionratio' , 'horsepower' , 'citympg' ,'highwaympg'] have outliers.**

In [None]:
# Categorical features: every column of `data` whose dtype is object ("O").
# Column names and dtypes are both read from `data`; the previous version
# iterated over `df.columns` while inspecting `data`, which made this cell
# silently depend on the EDA copy `df` having the same columns.
categorical_feature = [feature for feature in data.columns if data[feature].dtypes == "O"]
categorical_feature

In [None]:
# Bar chart of the median 'price' per category, one figure per categorical feature
for feature in categorical_feature:
    median_price = data.groupby(feature)['price'].median()
    ax = median_price.plot.bar()
    plt.xticks(rotation=90)
    ax.set_xlabel(feature)
    ax.set_ylabel('price')
    ax.set_title(feature)
    plt.show()

In [None]:
# Distribution of the target variable 'price' (untransformed), drawn in green
price = data['price']
sns.distplot(a=price, color='g')

In [None]:
# Check for multicollinearity: pairwise correlation of the numerical columns.
# The numeric columns are selected explicitly because `data` still contains
# object (categorical) columns, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping those columns.
corr = data.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap="inferno")

In [None]:
# Annotated heatmap of the correlation matrix, lower triangle only.
# The matrix is computed once, on numeric columns only: DataFrame.corr()
# raises on the object columns in pandas >= 2.0, and the previous version
# recomputed the same matrix a second time just to build the mask.
plt.figure(figsize=(15,15))
corr_matrix = data.select_dtypes(include='number').corr()
# np.triu(..., k=1) is non-zero strictly above the diagonal, so those cells are masked
sns.heatmap(corr_matrix, annot=True, cmap='inferno', mask=np.triu(corr_matrix, k=1))

In [None]:
# Separate independent (X) and dependent (Y) numerical features for the OLS fit.
# drop() returns a new frame here; the previous in-place drop on a column
# selection of `data` triggered pandas' SettingWithCopyWarning.
X = data[numerical_feature].drop(columns=['price'])
Y = data[['price']]

In [None]:
# Fit an ordinary least squares regression of Y on X, with a constant column
# added to X first
X = sm.add_constant(X)
ols = sm.OLS(endog=Y, exog=X)
model = ols.fit()

In [None]:
# Display the summary of the fitted OLS model
model.summary()

**OBSERVATION: After studying the correlation matrix and the OLS summary, we observed that 'highwaympg' and 'citympg' are strongly correlated (0.97), so we can consider dropping one of these features. As 'highwaympg' has a higher p-value than 'citympg', we drop the 'highwaympg' feature during feature engineering.**

# **FEATURE ENGINEERING/DATA PREPROCESSING**

In [None]:
# Drop 'highwaympg'.
# `data` is rebound (no inplace=True) and errors='ignore' is set so that
# re-running this cell does not raise KeyError once the column is gone.
data = data.drop(columns=['highwaympg'], errors='ignore')

In [None]:
# Names of the features listed for outlier inspection
Outliers = [
    'wheelbase',
    'carlength',
    'carwidth',
    'enginesize',
    'stroke',
    'compressionratio',
    'horsepower',
    'citympg',
]

In [None]:
# Summary statistics for the columns listed in `Outliers`
data[Outliers].describe()

In [None]:
# Preview the first rows of `data` at this stage of preprocessing
data.head()

In [None]:
# Split `data` into the target (y = 'price') and the predictors (X = all other columns)
y = data['price']
X = data.drop(columns='price')

In [None]:
# Build `df` as a copy of X in which 'enginesize' is replaced by its natural log
df = X.assign(enginesize=np.log(X['enginesize']))

In [None]:
# Handle rare categories: any category whose share of rows is not above 1%
# is replaced by the single label 'Rare_var'
n_rows = len(df)
for feature in categorical_feature:
    share = data.groupby(feature)['price'].count() / n_rows
    frequent = share[share > 0.01].index
    is_frequent = df[feature].isin(frequent)
    df[feature] = df[feature].where(is_frequent, 'Rare_var')

In [None]:
# Encode the categorical variables in two steps:
#   1. replace each category by an integer code (LabelEncoder, refit per column)
#   2. one-hot encode those codes, dropping the first level of every feature
label = LabelEncoder()
for feature in categorical_feature:
    encoded = label.fit_transform(df[feature])
    df[feature] = encoded

one_hot_columns = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
]
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=True)

In [None]:
# Feature scaling: rescale every column of `df` with MinMaxScaler and rebuild
# a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook — consider fitting it on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit(df).transform(df)
dataset = pd.DataFrame(scaled_values, columns=df.columns)
dataset.head()

In [None]:
# Use a copy of the scaled feature frame as the model input X
X=dataset.copy()

# **MODEL BUILDING**

In [None]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Create a linear regression model and fit it on the training split
LR = LinearRegression()
LR.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the test split.
# LR is already fitted on (X_train, y_train) in the preceding cell, so the
# duplicate LR.fit call that used to sit here was redundant and has been removed.
y_predLR = LR.predict(X_test)

In [None]:
# R² score of the linear regression predictions against the test targets
r2_score(y_test, y_predLR)