In [None]:
import os

# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
! pip install openpyxl

In [None]:
# Read the train and test CSV files, in that order, into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/used-cars-price-prediction/train.csv",
        "/kaggle/input/used-cars-price-prediction/test.csv",
    )
)

# The data dictionary ships as an Excel workbook.
data_dict = pd.read_excel(
    "/kaggle/input/used-cars-price-prediction/data dictionary.xlsx"
)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the training data as loaded.
print(train.shape)

In [None]:
# (rows, columns) of the test data as loaded.
test.shape

In [None]:
# Remove the New_Price column from both frames (in place).
for frame in (train, test):
    frame.drop(columns=['New_Price'], inplace=True)

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

Some 36 entries are missing from the columns that contain nulls, and the missing values all fall in the same rows. So if we drop these rows we lose 36 of the 6019 data points in the train set, which is about 0.6% of the train data.

In the case of the test set, there are 11 null values out of 1234 total entries. We will lose around 0.9% of the data if we drop the rows with null values.

In [None]:
# Training shape before rows with missing values are removed.
train.shape

In [None]:
# Discard, in place, every row that has at least one missing value.
for frame in (train, test):
    frame.dropna(inplace=True)

In [None]:
# Training shape after the rows with missing values were removed.
train.shape

In [None]:
# Give both frames a fresh consecutive index after the row removal.
# Without drop=True the previous index is kept as a new 'index' column.
for frame in (train, test):
    frame.reset_index(inplace=True)

In [None]:
# Remove the 'index' column from both frames (in place).
for frame in (train, test):
    frame.drop(columns=['index'], inplace=True)

In [None]:
# Preview the training data again after the clean-up steps above.
train.head()

In [None]:
# Per-column missing-value counts for train (expected to be zero after dropna).
train.isnull().sum()

In [None]:
# Per-column missing-value counts for test (expected to be zero after dropna).
test.isnull().sum()

In [None]:
# Split each Mileage string once, at the first space, into separate columns.
mileage_split_train = train['Mileage'].str.split(pat=' ', n=1, expand=True)
mileage_split_test = test['Mileage'].str.split(pat=' ', n=1, expand=True)

# Keep only the leading token (the part before the first space) as 'mileage'.
train['mileage'] = mileage_split_train.iloc[:, 0]
test['mileage'] = mileage_split_test.iloc[:, 0]

Mileage is reported in either km/kg or kmpl. We treat the two units as equivalent and keep only the numeric part of the value.

In [None]:
# Split the Engine and Power strings of the training data once, at the first space.
engine_split = train['Engine'].str.split(pat=' ', n=1, expand=True)
power_split = train['Power'].str.split(pat=' ', n=1, expand=True)

# The leading token of each becomes the new lower-case column.
train['engine'] = engine_split.iloc[:, 0]
train['power'] = power_split.iloc[:, 0]

In [None]:
# Same extraction as for the training data, applied to the test data:
# split Engine and Power once at the first space and keep the leading token.
engine_split_test = test['Engine'].str.split(pat=' ', n=1, expand=True)
power_split_test = test['Power'].str.split(pat=' ', n=1, expand=True)

test['engine'] = engine_split_test.iloc[:, 0]
test['power'] = power_split_test.iloc[:, 0]

In [None]:
# Name and the original text versions of Mileage/Engine/Power are removed
# from both frames (in place).
raw_text_cols = ['Name','Mileage','Engine','Power']
for frame in (train, test):
    frame.drop(columns=raw_text_cols, inplace=True)

In [None]:
# Convert the extracted text columns to numbers.
# pd.to_numeric is called once per column on the whole Series (vectorised)
# rather than once per element through Series.apply.
# errors='coerce' turns any token that cannot be parsed as a number into NaN.
cols = ['mileage', 'engine', 'power']
for col in cols:
    train[col] = pd.to_numeric(train[col], errors = 'coerce')
    test[col] = pd.to_numeric(test[col], errors = 'coerce')

In [None]:
# Remove, in place, any row that now contains NaN, which includes values
# that could not be converted to a number.
for frame in (train, test):
    frame.dropna(inplace=True)

In [None]:
# Replace Kilometers_Driven and engine in the training data with their natural logarithm.
for log_col in ('Kilometers_Driven', 'engine'):
    train[log_col] = np.log(train[log_col])

In [None]:
# Replace Kilometers_Driven and engine in the test data with their natural logarithm.
for log_col in ('Kilometers_Driven', 'engine'):
    test[log_col] = np.log(test[log_col])

In [None]:
# Columns treated as categorical features.
cat_cols = [
    'Location',
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'Seats',
]

In [None]:
# One indicator (dummy) frame per categorical column of the training data.
# The encoding is computed from the training frame alone, so the resulting
# columns are exactly the category values that occur in train.
(
    location_ohe_train,
    fuel_type_ohe_train,
    transmission_ohe_train,
    owner_type_ohe_train,
    seats_ohe_train,
) = (
    pd.get_dummies(train[source_col])
    for source_col in ('Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Seats')
)

In [None]:
# One indicator (dummy) frame per categorical column of the test data.
# The encoding is computed from the test frame alone, so the resulting
# columns are exactly the category values that occur in test.
(
    location_ohe_test,
    fuel_type_ohe_test,
    transmission_ohe_test,
    owner_type_ohe_test,
    seats_ohe_test,
) = (
    pd.get_dummies(test[source_col])
    for source_col in ('Location', 'Fuel_Type', 'Transmission', 'Owner_Type', 'Seats')
)

In [None]:
# Preview the indicator columns produced for Location in the training data.
location_ohe_train.head()

In [None]:
# (rows, number of distinct Location values) for the training indicators.
location_ohe_train.shape

In [None]:
# Remove the columns listed in cat_cols from both frames (in place).
for frame in (train, test):
    frame.drop(columns=cat_cols, inplace=True)

In [None]:
# Per-column missing-value counts for the test frame at this point.
test.isnull().sum()

In [None]:
# Append the indicator frames to the training frame column-wise.
# axis=1 concatenation aligns the pieces on their row index.
train_parts = [
    train,
    location_ohe_train,
    fuel_type_ohe_train,
    transmission_ohe_train,
    owner_type_ohe_train,
    seats_ohe_train,
]
train = pd.concat(train_parts, axis=1)

In [None]:
# Append the indicator frames to the test frame column-wise.
# axis=1 concatenation aligns the pieces on their row index.
test_parts = [
    test,
    location_ohe_test,
    fuel_type_ohe_test,
    transmission_ohe_test,
    owner_type_ohe_test,
    seats_ohe_test,
]
test = pd.concat(test_parts, axis=1)

In [None]:
# Take the Price column out of the training frame in one step:
# pop() returns the column and removes it from the frame in place.
train_y = train.pop('Price')

In [None]:
# Print the (rows, columns) of the train and test feature frames, in that order.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Keep only the features that exist in BOTH frames, and give both frames the
# same column order. Train and test were one-hot encoded separately, so
# either frame can contain indicator columns the other lacks. Trimming only
# train would leave test with extra or differently ordered columns.
shared_cols = [c for c in train.columns if c in test.columns]
train = train[shared_cols]
test = test[shared_cols]

# Indicator columns built from a numeric column get non-string labels, and
# recent scikit-learn versions reject frames whose column labels mix str and
# non-str types. Cast every label to str so the frames are accepted.
train.columns = train.columns.astype(str)
test.columns = test.columns.astype(str)

## Model Building and Evaluation

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Imported here so this cell brings everything it needs into scope.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# KNN predicts from distances between rows, so features measured on large
# numeric scales would dominate the 0/1 indicator columns. Standardise the
# features first. Doing it inside a Pipeline means the scaler is fitted only
# on the training part of each cross-validation fold.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Pipeline parameters are addressed as <step name>__<parameter>.
param_grid = {'knn__n_neighbors':[1,2,3,4,5]}
grid_search = GridSearchCV(knn_pipeline, param_grid, cv = 5)

# Hold out a validation split; random_state makes the split reproducible.
X_train,X_val,y_train,y_val = train_test_split(train,train_y, random_state = 100)

In [None]:
# Run the cross-validated grid search on the training split.
grid_search.fit(X_train,y_train)

In [None]:
# Score the fitted grid search on the held-out validation split, then report
# that score together with the parameter combination the search selected.
val_score = grid_search.score(X_val, y_val)
print("Score KNN : {:.2f}".format(val_score))

best_params = grid_search.best_params_
print("Best parameters : ", best_params)