In [None]:
#import the libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for entry in names:
        print(os.path.join(folder, entry))



In [None]:
#Load the CarDekho vehicle dataset CSV into a DataFrame and preview it

csv_path = '../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
#Show the dimensions of the dataset as a (rows, columns) tuple

df.shape

In [None]:
#Discard every row that contains at least one missing value, then show the new shape
df = df.dropna(axis=0, how='any')
df.shape

In [None]:
#Count the missing values that remain in each column

df.isna().sum()

In [None]:
#Print a summary of the frame: column names, non-null counts and data types
df.info()

**We can see the given dataset has no null values**

**Now we inspect the categorical columns and find out how many categories each of them has**

In [None]:
#Collect the names of the string-typed (object dtype) columns
categorical = df.select_dtypes(include='object').columns
categorical

In [None]:
def check_category(df, columns):
    """Print the frequency table of each listed column of ``df``.

    Parameters
    ----------
    df : DataFrame whose columns are inspected.
    columns : iterable of column labels present in ``df``.

    Returns
    -------
    None. One ``value_counts()`` table is printed per column, in the
    order the labels are given.
    """
    for column_name in columns:
        frequencies = df[column_name].value_counts()
        print(frequencies)
    
# Re-select the object-dtype columns and print the category frequencies of each one
categorical = df.select_dtypes(include='object').columns
check_category(df, columns=categorical)

Most of the cars run on diesel or petrol. Only one car is driven by electricity.
Most of the cars have a manual transmission.
Here we can see that many different types of cars are listed.

# Visualization

In [None]:
#Find the correlation between the numeric columns
# The frame still contains string columns at this point (they are encoded later),
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0. Selecting the
# numeric columns explicitly gives the same matrix on every pandas version.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12,8))
sns.heatmap(numeric_corr, annot=True)
plt.title('Correlation')

### Here we can see that we get the expected correlations between year and selling_price and between km_driven and selling_price

In [None]:
#Pairwise scatter plots / histograms of the numeric columns
# sns.pairplot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind and
# does not resize the grid. Pass height= / aspect= to pairplot to change the size.
sns.pairplot(data=df)

In [None]:
#There are many distinct car models, so keep only the models listed more than 10 times
def _listed_more_than_ten_times(model_rows):
    """Return True when a model's group of rows has more than 10 entries."""
    return len(model_rows) > 10

df = df.groupby('name').filter(_listed_more_than_ten_times)
df.shape

In [None]:
#Preview the first rows of the frame after the rarely listed models were removed
df.head()

In [None]:
#Print the category frequencies of every categorical column
# One loop replaces the five copy-pasted print statements, fixes the misspelled
# "Sellet_type" label, and uses df[column] so a column name can never be
# shadowed by a DataFrame attribute.
summary_columns = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
for position, column in enumerate(summary_columns):
    if position > 0:
        print('\n')  # blank separator between two reports
    print('Value Counts of ' + column.capitalize() + ' Column: ', df[column].value_counts())

### Preprocessing the data

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the fuel and transmission labels with integer codes; the encoder is
# fitted separately on each column
encoded_columns = ['fuel', 'transmission']
df[encoded_columns] = df[encoded_columns].apply(
    lambda column: LabelEncoder().fit_transform(column)
)
df.head()

In [None]:
#Map the remaining categorical columns (seller_type, owner) to integer codes
seller_type_codes = {"Individual":1, "Dealer":2, "Trustmark Dealer":3}
owner_codes = {"First Owner": 1, "Second Owner":2, "Third Owner":3, "Fourth & Above Owner": 4}
cat_to_num = {"seller_type": seller_type_codes, "owner": owner_codes}
print(cat_to_num)
print('\n')
# NOTE(review): replace() leaves any label that is missing from these maps
# unchanged (still a string) - confirm against the value counts that none remain.
df.replace(cat_to_num, inplace=True)
df.head()

In [None]:
#Print the frequencies of the categorical columns again, now that they are encoded
# One loop replaces the copy-pasted print statements and fixes the misspelled
# "Sellet_type" label.
encoded_summary_columns = ['fuel', 'seller_type', 'transmission', 'owner']
for position, column in enumerate(encoded_summary_columns):
    if position > 0:
        print('\n')  # blank separator between two reports
    print('Value Counts of ' + column.capitalize() + ' Column: ', df[column].value_counts())

In [None]:
#Feature matrix: every column except the model name and the target (selling_price)
X = df.drop(columns=['name', 'selling_price'])
X.shape

In [None]:
#Target vector: the selling price of each car
y = df['selling_price']

In [None]:
#Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

In [None]:
#Baseline model: fit a linear regression on the training split and report its R^2
#score on the test split
from sklearn.linear_model import LinearRegression
linreg = LinearRegression().fit(X_train, y_train)
linreg.score(X_test, y_test)

In [None]:
#Now do the same with a Random Forest regressor
# selling_price is a continuous target, so a regressor is required. A
# RandomForestClassifier would treat every distinct price as its own class and
# its .score() would be exact-match accuracy rather than R^2, which cannot be
# compared with the linear regression score.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=123)
model.fit(X_train,y_train)
model.score(X_test, y_test)

In [None]:
#Score the linear regression with 10-fold cross validation and average the fold scores
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=linreg, X=X, y=y, cv=10)
score.mean()