In [None]:
# for linear algebra and scientific calculation
import numpy as np
# for data processing and manipulation of data structure
import pandas as pd
# for Box-Cox Transformation
from scipy import stats
# for min_max scaling
from mlxtend.preprocessing import minmax_scaling
# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt
# set seed for reproducibility
np.random.seed(0)
# split a dataset into train and test sets
from sklearn.model_selection import train_test_split
# sklearn is machine learning library for python
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
#Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor
# importing the kaggle input to read the data set.
import os
# List every file below /kaggle/input, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# ** Reading first data csv file and showing the output. **

In [None]:
# Load the first CSV file into a DataFrame and preview its first rows.
FIRST_CSV_PATH = "../input/vehicle-dataset-from-cardekho/car data.csv"
car_data_first = pd.read_csv(FIRST_CSV_PATH)
car_data_first.head()

** Check for Null/Missing Values/Duplicate Values(drop if any) **

In [None]:
# Element-wise missing-value mask for the first dataset (True where a value is missing).
# `isna` is the method that `isnull` is an alias of, so the result is the same.
car_data_first.isna()

In [None]:
# Number of missing values per column in the first dataset.
car_data_first.isna().sum()

** By running the above code, we can see that there are no missing/null values in our first dataset/csv. **

# ** Reading Second data csv file and showing the output. **

In [None]:
# Load the second CSV file into a DataFrame and preview its first rows.
SECOND_CSV_PATH = "../input/vehicle-dataset-from-cardekho/CAR DETAILS FROM CAR DEKHO.csv"
car_data_second = pd.read_csv(SECOND_CSV_PATH)
car_data_second.head()

** Check for Null/Missing Values/Duplicate Values(drop if any) **

In [None]:
# Element-wise missing-value mask for the second dataset (True where a value is missing).
# `isna` is the method that `isnull` is an alias of, so the result is the same.
car_data_second.isna()

In [None]:
# Number of missing values per column in the second dataset.
car_data_second.isna().sum()

** By running the above code, we can see that there are no missing/null values in our second dataset/csv. **

# ** Reading third data csv file and showing the output. **

In [None]:
# Load the third CSV file into a DataFrame and preview its first rows.
THIRD_CSV_PATH = "../input/vehicle-dataset-from-cardekho/Car details v3.csv"
car_data_third = pd.read_csv(THIRD_CSV_PATH)
car_data_third.head()

** Check for Null/Missing Values/Duplicate Values(drop if any) **

In [None]:
# Element-wise missing-value mask for the THIRD dataset (True where a value is missing).
# This section is about car_data_third; the cell previously re-inspected
# car_data_second, so the third dataset was never actually checked.
car_data_third.isnull()

In [None]:
# Number of missing values per column in the THIRD dataset.
# This section is about car_data_third; the cell previously counted nulls in
# car_data_second, so any conclusion about the third dataset was unverified.
car_data_third.isnull().sum()

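** Note: the conclusion that there are no missing/null values applies to our third dataset/csv only if the cells above inspect `car_data_third`; check the `isnull().sum()` output for that frame before relying on it. **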

# ** Rescale the features. **

In [None]:
# select the Present_Price column (the feature whose rescaling is demonstrated)
_Present_Price = car_data_first.Present_Price

# scale the prices from 0 to 1.
# minmax_scaling is documented for a DataFrame or a 2-D ndarray, so hand it a
# one-column DataFrame and select the column by name. Passing the bare Series
# with columns=[0] depends on multi-dimensional indexing of a Series, which
# newer pandas releases reject.
scaled_data = minmax_scaling(_Present_Price.to_frame(), columns=['Present_Price'])

# plot the original & scaled data side by side to compare
fig, ax = plt.subplots(1, 2)
sns.histplot(_Present_Price, ax=ax[0])
ax[0].set_title("Original Data")
sns.histplot(scaled_data['Present_Price'], ax=ax[1])
ax[1].set_title("Scaled data")

Notice that the shape of the distribution doesn't change; only the scale does: instead of ranging from 0 to 75+, the data now ranges from 0 to 1.

# ** Extract X as all columns except the last column and Y as the last column. **

In [None]:
# Feature frame: every column of the first dataset except 'Owner'.
is_feature_column = car_data_first.columns != 'Owner'
_X_ = car_data_first.loc[:, is_feature_column]
_X_.head()

In [None]:
# Target frame: only the 'Owner' column of the first dataset (kept as a DataFrame).
is_target_column = car_data_first.columns == 'Owner'
_Y_ = car_data_first.loc[:, is_target_column]
_Y_.head()

Note: Since the rows displayed above all have the value zero in the last column (`Owner`), the output is not reflecting anything informative.

# ** Split the data into a training set and testing set. **

In [None]:
# `x` and `y` were never defined before this cell, so it raised NameError on a
# fresh kernel. Build them from the feature/target frames (_X_, _Y_) extracted
# from car_data_first.
# Only numeric feature columns are kept: presumably some columns are text
# (TODO confirm), which a regression estimator cannot consume directly.
x = _X_.select_dtypes(include="number").to_numpy()
# Flatten the single-column target frame to a 1-D array; the regression cell
# further down calls .reshape on y_test, which needs an ndarray.
y = _Y_.to_numpy().ravel()

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# ** Perform 10-fold cross-validation **

In [None]:
# 10-fold cross-validation of an RBF-kernel SVR on the first dataset.
#
# Fixes compared with the previous version of this cell:
#   * fitting and scoring now happen INSIDE the fold loop; before, the loop only
#     printed indices, and the model was fit on the last fold alone (twice);
#   * the target column is no longer part of the feature matrix (column 3 was
#     both a feature and the target, which leaks the answer to the model);
#   * the scaler is fit on each training fold only, so test-fold statistics do
#     not influence the scaling.
TARGET_COL = 3               # positional index of the target column
FEATURE_COLS = [1, 2, 4]     # original selection [1, 2, 3, 4] minus the target
X = car_data_first.iloc[:, FEATURE_COLS].to_numpy()
Y = car_data_first.iloc[:, TARGET_COL].to_numpy()

scores = []
cv = KFold(n_splits=10, random_state=42, shuffle=True)
for fold, (train_index, test_index) in enumerate(cv.split(X), start=1):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Scale features to [0, 1] using statistics from the training fold only.
    sc = MinMaxScaler(feature_range=(0, 1))
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Fresh estimator per fold so that folds are evaluated independently.
    best_svr = SVR(kernel='rbf')
    best_svr.fit(X_train, Y_train)
    fold_score = best_svr.score(X_test, Y_test)  # R^2 on the held-out fold
    scores.append(fold_score)
    print("Fold", fold, "- train rows:", len(train_index),
          "test rows:", len(test_index), "R^2:", fold_score)

# Mean R^2 across the 10 folds.
print(np.mean(scores))

# ** Train a Linear regression model for the dataset. **

In [None]:
# Fit a linear regression on the training split and predict the test split.
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

# Show predictions next to the actual values (column 0: predicted, column 1: actual).
# np.ravel flattens ndarrays and pandas objects alike, so this no longer fails
# when y_test is a Series/DataFrame (which has no .reshape method).
print(np.column_stack((np.ravel(y_pred), np.ravel(y_test))))

# ** Visualize training and test results. **

In [None]:
# Overlay predicted and actual test values on a single 15x10 inch figure,
# using the explicit Axes interface instead of the pyplot state machine.
lr_fig, lr_ax = plt.subplots(figsize=(15, 10))
lr_ax.plot(y_pred, label='Predicted')
lr_ax.plot(y_test, label="Actual_test")
lr_ax.legend()
lr_ax.set_title("Linear Regression Model")

# ** Compute the accuracy **

In [None]:
from sklearn.metrics import r2_score
# R^2 (coefficient of determination) of the linear model on the test split.
lr_r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(lr_r2)

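The R² score (coefficient of determination) of our model is 0.45, which means the model explains roughly 45% of the variance in the target. Note that this value is R², not a root-mean-square error: for an RMS error, the lower the value, the more accurate the model, but for R², values closer to 1 indicate a better fit, so 0.45 is only a moderate fit to the predictions.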