In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the mounted input directory and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the car listings CSV from the Kaggle input mount into the working frame.
CAR_DETAILS_CSV = '/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv'
df = pd.read_csv(CAR_DETAILS_CSV)

In [None]:
df.head()

In [None]:
# shape of data
df.shape

In [None]:
# missing values
df.isnull().sum()

In [None]:
# info
df.info()

In [None]:
# duplicate data
df.duplicated().sum()

In [None]:
# describe function
df.describe()

In [None]:
# NOTE(review): bare float literal; the cell only echoes 10000000.0.
# Presumably a value copied from the describe() table to read it without
# scientific notation — confirm, or delete the cell.
1.000000e+07

In [None]:
# Observations
# 1. Some columns have missing values
# 2. seats is float but should be int
# 3. More than 1000 rows are duplicates
# 4. Outliers in year, selling_price and seats
# 5. torque, engine, mileage and max_power carry unnecessary unit suffixes


In [None]:
# Discard every row that has at least one missing value
# (rebinding the name instead of mutating the frame in place).
df = df.dropna()

In [None]:
df.shape

In [None]:
# Number of rows removed by dropna().
# A notebook cell only echoes its LAST expression, so the difference has to be
# printed explicitly or it is silently discarded.
# NOTE(review): 8128 / 7906 are presumably the row counts before and after
# dropna(), copied by hand from earlier df.shape outputs — confirm.
print('rows dropped:', 8128 - 7906)

# Sanity check: every column should now report zero missing values.
df.isnull().sum()

In [None]:
# Keep only the first occurrence of each fully identical row.
is_repeat_row = df.duplicated(keep='first')
df = df[~is_repeat_row]

In [None]:
df.shape

In [None]:
df.duplicated().sum()

In [None]:
# Seat counts are whole numbers, so store the column as int32 instead of float.
df = df.astype({'seats': 'int32'})

In [None]:
df.info()

In [None]:
# mileage: keep the first whitespace-separated token (the number, without its
# unit text) and store it as float64.
mileage_number = df['mileage'].str.split(expand=True)[0]
df['mileage'] = mileage_number.astype('float64')

In [None]:
# engine: keep the first whitespace-separated token (the number, without its
# unit text) and store it as int32.
engine_number = df['engine'].str.split(expand=True)[0]
df['engine'] = engine_number.astype('int32')

In [None]:
# max_power: keep the first whitespace-separated token (the number, without its
# unit text) and store it as float64.
max_power_number = df['max_power'].str.split(expand=True)[0]
df['max_power'] = max_power_number.astype('float64')

In [None]:
df.head()

In [None]:
# The torque column is not used any further, so remove it from the frame.
df = df.drop(columns=['torque'])

In [None]:
df.head()

In [None]:
df['name'].unique().shape

In [None]:
# The brand is taken to be the first whitespace-separated word of the name.
first_word_of_name = df['name'].str.split(expand=True)[0]
df['brand'] = first_word_of_name

In [None]:
# Remove the name column from the frame, then preview the result.
df = df.drop(columns=['name'])
df.head()

In [None]:
df['brand'].value_counts()

In [None]:
# Brands that appear in more than 100 rows; the counts are computed once and
# reused both as the data and as the boolean mask.
brand_counts = df['brand'].value_counts()
freq_brands = brand_counts[brand_counts > 100].index.tolist()

In [None]:
freq_brands 

In [None]:
 df[df['brand'].isin(freq_brands)]

In [None]:
# Restrict the frame to rows whose brand is one of the frequent brands.
is_freq_brand = df['brand'].isin(freq_brands)
df = df.loc[is_freq_brand]

In [None]:
df.head()

In [None]:
df['fuel'].value_counts()

In [None]:
# Keep only Diesel and Petrol rows; rows with any other fuel value are dropped.
freq_fuel = ['Diesel','Petrol']
uses_freq_fuel = df['fuel'].isin(freq_fuel)
df = df.loc[uses_freq_fuel]

In [None]:
df['seller_type'].value_counts()

In [None]:
# Keep only rows sold by an Individual or a Dealer.
kept_seller_types = ['Individual','Dealer']
df = df.loc[df['seller_type'].isin(kept_seller_types)]

In [None]:
# Keep only first-, second- and third-owner rows.
kept_owner_levels = ['First Owner','Second Owner','Third Owner']
df = df.loc[df['owner'].isin(kept_owner_levels)]

In [None]:
df.shape

In [None]:
df.head()

In [None]:
import seaborn as sns

# Box plot of km_driven to eyeball outliers.
# The series is passed as `x=`: seaborn 0.11 deprecated positional data vectors,
# and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['km_driven'])

In [None]:
# First and third quartile of km_driven, and the inter-quartile range between them.
q1, q3 = df['km_driven'].quantile([0.25, 0.75])
iqr = q3 - q1

In [None]:
# Upper outlier bound for km_driven: third quartile plus 1.5 times the IQR.
whisker_length = 1.5 * iqr
max_val_km_driven = q3 + whisker_length

In [None]:
max_val_km_driven

In [None]:
df.head()

In [None]:
# Outliers in col 'km_driven': rows whose value exceeds max_val_km_driven.
# The cell displays the matching rows themselves rather than a count.
df[df['km_driven']>max_val_km_driven]

In [None]:
# Capping: every 'km_driven' value above max_val_km_driven is replaced by
# max_val_km_driven (the original note gives this bound as 193000); all other
# values are kept. This expression only previews the capped array — nothing is
# written back to df here.
np.where(df['km_driven']>max_val_km_driven,max_val_km_driven,df['km_driven'])

In [None]:
# Cap km_driven: values above the upper bound become the bound, the rest are kept.
exceeds_cap = df['km_driven'] > max_val_km_driven
df['km_driven'] = np.where(exceeds_cap, max_val_km_driven, df['km_driven'])

In [None]:
# Box plot of km_driven, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['km_driven'])

In [None]:
# Box plot of year, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['year'])

In [None]:
df.shape

In [None]:
df[df['year'] > 2000].shape

In [None]:
df[df['year'] >= 2000].shape

In [None]:
# Keep only rows whose year is 2000 or later.
is_year_2000_or_later = df['year'].ge(2000)
df = df.loc[is_year_2000_or_later]

In [None]:
# Box plot of year, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['year'])

In [None]:
df.head()

In [None]:
# Box plot of mileage, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['mileage'])

In [None]:
df[df['mileage']==0]

In [None]:
# Median of the non-zero mileage values.
median_mileage = df.loc[df['mileage'] != 0, 'mileage'].median()

# Backward-compatible alias: the old name says "mean" but the value has always
# been the median. Kept so cells that still read `mean_mileage` keep working.
mean_mileage = median_mileage

median_mileage

In [None]:
df['mileage'].describe()

In [None]:
# Replace mileage values of exactly 0 with the precomputed mean_mileage value;
# every other value is kept unchanged.
is_zero_mileage = df['mileage'] == 0
df['mileage'] = np.where(is_zero_mileage, mean_mileage, df['mileage'])

In [None]:
# Box plot of mileage, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['mileage'])

In [None]:
# Box plot of engine, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['engine'])

In [None]:
# Box plot of max_power, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['max_power'])

In [None]:
# Box plot of seats, passed as `x=`: seaborn 0.11 deprecated positional
# data vectors, and from 0.12 the first positional argument is read as `data`.
sns.boxplot(x=df['seats'])

In [None]:
df[df['seats']>=8].shape

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# An ML model cannot consume string data. The fuel, seller_type, transmission,
# owner and brand columns are non-numeric, so they have to be converted to numbers.
# This step is called categorical data encoding.

In [None]:
# Separate the target (selling_price) from the feature columns, then hold out
# 20% of the rows as a test set (fixed random_state for a repeatable split).
from sklearn.model_selection import train_test_split

y = df['selling_price']
X = df.drop(columns=['selling_price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
X_train

In [None]:
y_train

In [None]:
# Encoders and ColumnTransformer used for categorical encoding (imports only; the transformer is built in a later cell)
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
import inspect

# OneHotEncoder's dense-output flag was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2, and the old name was removed in 1.4. Use whichever name
# the installed version accepts so the cell runs on both old and new releases.
_dense_flag = (
    'sparse_output'
    if 'sparse_output' in inspect.signature(OneHotEncoder).parameters
    else 'sparse'
)

# Column-wise encoding:
#   * owner -> ordinal codes. The order is spelled out explicitly (First < Second
#     < Third) instead of relying on the encoder's default sorting of the observed
#     values, so the codes do not depend on which values the training split contains.
#   * fuel / seller_type / transmission / brand -> one-hot, first level dropped,
#     dense output.
#   * every remaining column is passed through unchanged.
tnf = ColumnTransformer(
    [
        (
            'ordinal',
            OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner']]),
            ['owner'],
        ),
        (
            'nominal',
            OneHotEncoder(drop='first', **{_dense_flag: False}),
            ['fuel', 'seller_type', 'transmission', 'brand'],
        ),
    ],
    remainder='passthrough',
)

In [None]:
#X_train_tnf = tnf.fit_transform(X_train)
#X_test_tnf = tnf.transform(X_test)

In [None]:
#X_train_tnf

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
#lr = LinearRegression()

In [None]:
#lr.fit(X_train_tnf,y_train)

In [None]:
#y_pred = lr.predict(X_test_tnf)

In [None]:
from sklearn.metrics import r2_score
#r2_score(y_test,y_pred)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 polynomial feature expansion step.
POLY_DEGREE = 2
poly = PolynomialFeatures(degree=POLY_DEGREE)

In [None]:
#X_train_poly = poly.fit_transform(X_train_tnf)
#X_test_poly = poly.transform(X_test_tnf)

In [None]:
# Unfitted linear-regression estimator; it is not trained in this cell.
# The commented-out lines below are the earlier manual fit / predict / score
# flow on the polynomial features and are kept for reference only.
lr = LinearRegression()
#lr.fit(X_train_poly,y_train)
#y_pred = lr.predict(X_test_poly)
#r2_score(y_test,y_pred)

In [None]:
# pipeline
from sklearn.pipeline import Pipeline

In [None]:
# End-to-end model: column transformer -> polynomial expansion -> linear regression.
pipeline_steps = [
    ('col-transformer', tnf),
    ('poly', poly),
    ('lr', lr),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred = pipe.predict(X_test)

In [None]:
r2_score(y_test,y_pred)

In [None]:
import pickle

# Persist the pipeline to pipe.pkl in the working directory.
# The context manager guarantees the file handle is flushed and closed even if
# pickling raises; the previous inline open() never closed it.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
X_train