In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the CarDekho vehicle dataset (Car details v3) into a DataFrame
csv_path = '../input/vehicle-dataset-from-cardekho/Car details v3.csv'
df = pd.read_csv(csv_path)

In [None]:
# Summarise the frame: column names, dtypes and non-null counts per column
df.info()

In [None]:
# Quantify the skew of the target and look at its distribution
price = df['selling_price']
print(price.skew())
sns.distplot(price, kde=True)

In [None]:
# The skew is very high, so replace the price with log(1 + price).
# NOTE: the column is overwritten in place, so running this cell twice applies
# the transform twice.
log_price = np.log1p(df['selling_price'])
df['selling_price'] = log_price

# Re-check the skew and the distribution after the transformation
print(log_price.skew())
sns.distplot(log_price, kde=True)

In [None]:
# Missing values per column, largest count first
missing_per_column = df.isnull().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Keep only the columns that actually contain missing values, as a one-column
# frame named "Sum"
null_totals = df.isnull().sum()
col_null_data = null_totals[null_totals > 0].to_frame(name="Sum")
col_null_data.head()

In [None]:
# Keep only the leading number of each mileage entry (dropping the unit text
# after the first space) and convert the column to float in one pass.
# Missing entries go through str() as "nan" and come back as NaN.
df['mileage'] = (
    df['mileage']
    .map(lambda raw: str(raw).split(" ")[0])
    .astype(float)
)

In [None]:
# Inspect the mileage distribution and its skew to choose an imputation strategy
sns.distplot(df['mileage'], hist=True)
print(df['mileage'].skew())

# The skew is close to zero (about -0.14), i.e. roughly symmetric, so the mean
# is a reasonable fill value.
# Assign the result back instead of calling fillna(inplace=True) on
# df['mileage']: that chained in-place call operates on an intermediate object
# and is not guaranteed to update df (pandas warns about it, and it has no
# effect under Copy-on-Write).
df['mileage'] = df['mileage'].fillna(df['mileage'].mean())

In [None]:
# Count the missing engine values. This is printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(df['engine'].isnull().sum())

# Remove the "CC" unit and keep just the leading number
df['engine'] = df['engine'].apply(lambda x: str(x).split(" ")[0])

# Convert object to float (missing entries became the string "nan" -> NaN)
df['engine'] = df['engine'].astype(float)

# Inspect the distribution and skew before imputing
sns.distplot(df['engine'], kde=True)
print(df['engine'].skew())

# The skew is positive (about 1.1), so fill NaN with the median rather than the
# mean. The result is assigned back: a chained fillna(inplace=True) on
# df['engine'] is not guaranteed to modify df.
df['engine'] = df['engine'].fillna(df['engine'].median())

In [None]:
# Count the missing max_power values
print(df['max_power'].isnull().sum())

# Remove the "bhp" unit from max_power and keep just the leading token
df['max_power'] = df['max_power'].apply(lambda x: str(x).split(" ")[0])

# Convert to numeric; anything that is not a number is coerced to NaN
df['max_power'] = pd.to_numeric(df['max_power'], errors='coerce')

# Check the skew to choose between mean and median imputation
print(df['max_power'].skew())

# The skew is high, so fill NaN with the median. The result is assigned back:
# a chained fillna(inplace=True) on df['max_power'] is not guaranteed to
# modify df.
df['max_power'] = df['max_power'].fillna(df['max_power'].median())

In [None]:
# Count the missing seats values. This is printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
print(df['seats'].isnull().sum())

# Check the skew to decide whether to impute NaN with mean, median or mode
print(df['seats'].skew())

# The skew is very high (about 1.96), so fill NaN with the median. The result
# is assigned back: a chained fillna(inplace=True) on df['seats'] is not
# guaranteed to modify df.
df['seats'] = df['seats'].fillna(df['seats'].median())

# Convert to float. astype returns a new Series, so it must be assigned back
# for the conversion to take effect.
df['seats'] = df['seats'].astype(float)

In [None]:
# Separate the target (Y) from the independent variables (X)
target_col = 'selling_price'
Y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Wrap the target in a single-column DataFrame; plot_numeric later reads it as
# Y['selling_price'], which needs column-style access.
Y = pd.DataFrame(Y)
Y.head()

In [None]:
# 'name' and 'torque' are not used as predictors, so remove both columns
X = X.drop(columns=['name', 'torque'])

In [None]:
# Let's analyze numeric features and look for outliers
def plot_numeric(feature):
    """Draw three diagnostic plots for one column of the notebook-global ``X``.

    Left: distribution with a KDE overlay. Middle: scatter of the feature
    against ``Y['selling_price']``. Right: vertical box plot of the feature.

    Parameters
    ----------
    feature : str
        Name of a column in ``X``.

    Returns
    -------
    module
        ``matplotlib.pyplot``, returned unchanged for backward compatibility.
    """
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 3), dpi=110)
    sns.distplot(X[feature], kde=True, ax=ax1)
    # x and y must be keywords: seaborn >= 0.12 no longer accepts them
    # positionally, while keyword arguments work on older releases too.
    sns.scatterplot(x=X[feature], y=Y['selling_price'], ax=ax2)
    # Passing the values as y makes the requested vertical orientation
    # unambiguous regardless of seaborn version.
    sns.boxplot(y=X[feature], ax=ax3, orient='v', width=0.2)
    return plt

In [None]:
def get_numeric_cols(df_num):
    """Print the dtype of every column and plot the numeric ones.

    Parameters
    ----------
    df_num : pandas.DataFrame
        Frame whose columns are inspected. Each numeric column name is passed
        to ``plot_numeric``.

    Returns
    -------
    None
    """
    for col, dtype in df_num.dtypes.items():
        print(dtype)
        # Use is_numeric_dtype rather than `!= 'object'`: category, string and
        # datetime columns are not object dtype either, but cannot be plotted
        # as numeric features.
        if pd.api.types.is_numeric_dtype(dtype):
            plot_numeric(col)
            
# Print each column's dtype and draw the diagnostic plots for the features in X
get_numeric_cols(X)

In [None]:
# Collect the row labels of the outliers: very old cars, implausibly high
# mileage, and extreme engine power. A row matching several rules appears
# once per rule.
outlier_rules = [
    X['year'] < 1990,
    X['mileage'] > 40,
    X['max_power'] > 300,
]
index_outliers = np.concatenate([X.loc[rule].index for rule in outlier_rules])

In [None]:
# DataFrame.drop returns a new object, so the result has to be bound back to X;
# otherwise the outliers are only removed from the displayed copy. The same
# rows are removed from Y so features and target stay aligned.
# errors='ignore' keeps the cell re-runnable once the rows are already gone.
X = X.drop(index=index_outliers, errors='ignore')
Y = Y.drop(index=index_outliers, errors='ignore')
assert X.index.equals(Y.index), "X and Y must cover the same rows"
X.shape

In [None]:
# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [None]:
# Feature selection: recursive feature elimination with 3-fold cross-validation,
# ranking features with an XGBoost regressor.
# NOTE(review): the selector is fitted on the full data set, before any
# train/test split - confirm that this is intended.
from sklearn.feature_selection import RFECV
from xgboost import XGBRegressor

ranking_estimator = XGBRegressor()
rfecv = RFECV(estimator=ranking_estimator, cv=3, n_jobs=-1).fit(X_encoded, Y)
print(f"No. of highly important features: {rfecv.n_features_}")

In [None]:
# Names of the columns kept by RFECV, and the frame restricted to them
imp_features = X_encoded.columns[rfecv.support_].to_numpy()
X_imp = X_encoded.loc[:, imp_features]

In [None]:
# Scale the selected features with RobustScaler
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(X_imp)
X_scaled = scaler.transform(X_imp)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state is fixed so the split,
# and therefore the metrics reported below, are reproducible across runs.
# NOTE(review): X_scaled was scaled using statistics from all rows, so the
# test set has influenced the scaling - consider fitting the scaler on the
# training portion only.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, Y, test_size=0.3, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

In [None]:
# Baseline: ordinary least squares. selling_price was log1p-transformed earlier
# in the notebook, so MAE/MSE are measured on that log scale.
model = LinearRegression().fit(X_train, Y_train)
Y_pred = model.predict(X_test)
for label, metric in (('R2 SCORE ', r2_score), ('MAE ', MAE), ('MSE ', MSE)):
    print(label, metric(Y_test, Y_pred))

In [None]:
# Gradient-boosted trees on the same split, reported with the same three
# metrics (on the log-transformed price scale).
model = XGBRegressor()
Y_pred = model.fit(X_train, Y_train).predict(X_test)
scores = {
    'R2 SCORE ': r2_score(Y_test, Y_pred),
    'MAE ': MAE(Y_test, Y_pred),
    'MSE ': MSE(Y_test, Y_pred),
}
for label, value in scores.items():
    print(label, value)