# IMPORT LIBRARY

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, fname)
    for folder, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
sns.set_style(style='darkgrid')

%matplotlib inline

# PRE-MADE FUNCTIONS

## EVALUATORS

In [None]:
from sklearn.metrics import mean_absolute_error

def eval1(predic,ytest):
  """Print the mean absolute error between two sets of values.

  Parameters
  ----------
  predic : array-like
      Values passed as the first argument to ``mean_absolute_error``
      (the model predictions at the call sites in this notebook).
  ytest : array-like
      Values passed as the second argument (the held-out targets).

  Returns
  -------
  None
      The score is only printed, framed by two rule lines.
  """
  rule="-----------------------------------------------------------------"
  mae=mean_absolute_error(predic,ytest)
  print(rule)
  # print() applies str() to the score, matching explicit string concatenation.
  print(" price mean absolute error (mae) = ",mae,sep="")
  print(rule)

## MODEL FUNCTIONS

In [None]:
def lrmod(x_train,y_train,x_test):
  """Fit a ``LinearRegression`` model and predict on the test features.

  Parameters
  ----------
  x_train, y_train : array-like
      Training features and targets used to fit the model.
  x_test : array-like
      Features to predict on.

  Returns
  -------
  tuple
      ``(predictions, fitted_model)``.
  """
  # fit() returns the estimator itself, so construction and fitting chain.
  model=LinearRegression().fit(x_train,y_train)
  return model.predict(x_test),model

In [None]:
def rfmod(x_train,y_train,x_test,p_grid,random_state=42):
  """Grid-search a ``RandomForestRegressor`` and predict on the test features.

  Parameters
  ----------
  x_train, y_train : array-like
      Training features and targets used by the cross-validated search.
  x_test : array-like
      Features to predict on with the refitted best estimator.
  p_grid : dict
      Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
  random_state : int or None, default 42
      Seed for the forest. A fixed seed makes the search result and the
      predictions repeatable across runs; pass ``None`` to get the previous
      unseeded behaviour.

  Returns
  -------
  tuple
      ``(predictions, fitted_search)``; the fitted ``GridSearchCV`` object
      exposes ``best_params_``.

  Notes
  -----
  No ``cv`` or ``scoring`` argument is passed, so ``GridSearchCV`` uses its
  own defaults for both.
  """
  # Seed the base estimator: every candidate forest in the grid is cloned from it.
  forest=RandomForestRegressor(random_state=random_state)
  rf1=GridSearchCV(forest,param_grid=p_grid,verbose=3,n_jobs=-1,refit=True)
  rf1.fit(x_train,y_train)
  # refit=True means predict() uses the best estimator retrained on all of x_train.
  pred=rf1.predict(x_test)
  return (pred,rf1)

In [None]:
def svmod(x_train,y_train,x_test,p_grid):
  """Grid-search an ``SVR`` model with 4-fold CV and predict on the test features.

  Parameters
  ----------
  x_train, y_train : array-like
      Training features and targets used by the cross-validated search.
  x_test : array-like
      Features to predict on with the refitted best estimator.
  p_grid : dict
      Parameter grid handed to ``GridSearchCV`` as ``param_grid``.

  Returns
  -------
  tuple
      ``(predictions, fitted_search)``.
  """
  search=GridSearchCV(
    SVR(),
    param_grid=p_grid,
    cv=4,
    refit=True,
    n_jobs=-1,
    verbose=3,
  )
  search.fit(x_train,y_train)
  return search.predict(x_test),search

## DATA SCALERS

In [None]:
def transformer1(x_train,x_test):
  """Scale both splits with a ``MinMaxScaler`` fitted on the training split only.

  Parameters
  ----------
  x_train : array-like
      Training features; the scaler's statistics are learned from these.
  x_test : array-like
      Test features; transformed with the training-derived statistics.

  Returns
  -------
  tuple
      ``(scaled_x_train, scaled_x_test)``.
  """
  scaler=MinMaxScaler().fit(x_train)
  return scaler.transform(x_train),scaler.transform(x_test)

In [None]:
def transformer2(x_train,x_test):
  """Scale both splits with a ``StandardScaler`` fitted on the training split only.

  Parameters
  ----------
  x_train : array-like
      Training features; the scaler's statistics are learned from these.
  x_test : array-like
      Test features; transformed with the training-derived statistics.

  Returns
  -------
  tuple
      ``(scaled_x_train, scaled_x_test)``.
  """
  scaler=StandardScaler().fit(x_train)
  return scaler.transform(x_train),scaler.transform(x_test)

# GETTING DATA

In [None]:
# Read the used-bikes CSV; the path is relative to the notebook's working directory.
csv_path='../input/used-bikes-prices-in-india/Used_Bikes.csv'
data=pd.read_csv(csv_path)


In [None]:
# Show the freshly loaded DataFrame as this cell's output.
data

# DATA DESCRIPTION

In [None]:
# Print a concise summary of the DataFrame (per-column dtype and non-null count).
data.info()

In [None]:
# include='all' adds the non-numeric columns to the summary statistics.
data.describe(include='all')

# NULL CHECK

In [None]:
# One boolean per column: True if that column has at least one missing value.
data.isna().any()

In [None]:
# Visual null check: after the transpose, each heatmap row is one DataFrame column.
fig,ax=plt.subplots(figsize=(10,10))
sns.heatmap(data.isna().transpose(),cmap='viridis',xticklabels=False,cbar=False,ax=ax)

# EDA (Exploratory Data Analysis)

In [None]:
# Number of listings per brand.
fig,ax=plt.subplots(figsize=(20,6))
sns.countplot(data=data,x='brand',palette='rainbow',ax=ax)
fig.tight_layout()

In [None]:
# 2x2 overview: price distribution, then price against kms_driven, power and age.
f,axes=plt.subplots(2,2,figsize=(15,7))

sns.histplot(data=data,x='price',bins=50,kde=True,color='red',ax=axes[0,0])

# Remaining panels in row-major order: (0,1), (1,0), (1,1).
for feature,ax in zip(('kms_driven','power','age'),axes.flat[1:]):
  sns.scatterplot(data=data,x=feature,y='price',ax=ax)

plt.tight_layout()

In [None]:
# Side-by-side scatter plots of kms_driven against power and against age.
f,axes2=plt.subplots(1,2,figsize=(15,7))

for feature,ax in zip(('power','age'),axes2):
  sns.scatterplot(data=data,x=feature,y='kms_driven',ax=ax)

plt.tight_layout()

# DATA PREPROCESS

In [None]:
# Remove the 'bike_name' and 'city' columns; they are not used as model features here.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError on the already-removed columns.
data=data.drop(columns=['bike_name','city'],errors='ignore')

In [None]:
# One-hot encode 'owner' and 'brand'; drop_first=True omits the first level of each.
owndum,brandum=(pd.get_dummies(data[col],drop_first=True) for col in ('owner','brand'))

In [None]:
# Append the dummy columns to the right of the existing frame.
frames=[data,owndum,brandum]
data=pd.concat(frames,axis='columns')

In [None]:
# The original 'brand' and 'owner' columns are now represented by their dummy columns.
data=data.drop(columns=['brand','owner'])

# X,Y,SPLITTING

In [None]:
# Target is the price column; every remaining column is a feature.
target_col='price'
y=data[target_col].values
X=data.drop(columns=[target_col]).values

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state makes the split repeatable.
split=train_test_split(X,y,test_size=0.15,random_state=42)
x_train,x_test,y_train,y_test=split

# DATA SCALING

In [None]:
# Min-max scale the features: transformer1 fits the scaler on x_train only and
# applies the same transform to x_test.
x_train,x_test=transformer1(x_train,x_test)

# MODEL RUN

## LINEAR REGRESSION

In [None]:
# pred1: predictions for x_test; lrob: the fitted LinearRegression estimator.
pred1,lrob=lrmod(x_train,y_train,x_test)

## RANDOM FOREST REGRESSOR (GRID SEARCH)

In [None]:
# Candidate forest sizes for the grid search: 100, 200, ..., 600 trees.
n_est=[*range(100,700,100)]
n_estimators={'n_estimators':n_est}
pred2,rfob=rfmod(x_train,y_train,x_test,n_estimators)

## SVR (GRID SEARCH)

In [None]:
# SVR grid: 2 values of C x 3 values of gamma.
C=[100,1000]
Ga=[1,0.1,0.01]
p={'C':C,'gamma':Ga}
pred3,svob=svmod(x_train,y_train,x_test,p)

# EVALUATIONS

## LINEAR REGRESSION


In [None]:
# Print the test-set MAE for linear regression, then plot predictions against actual prices.
eval1(pred1,y_test)

fig,ax=plt.subplots(figsize=(10,8))
ax.set_xlabel('Linear Regression predictions')
ax.set_ylabel('Actual Values')
sns.scatterplot(x=pred1,y=y_test,ax=ax)

## RANDOM FOREST

In [None]:
# Print the test-set MAE and the winning grid parameters for the random forest,
# then plot predictions against actual prices.
eval1(pred2,y_test)
# Blank line, best parameters, blank line — emitted by a single print call.
print('',rfob.best_params_,'',sep='\n')
fig,ax=plt.subplots(figsize=(10,8))
ax.set_xlabel('Random Forest Regression Predictions')
ax.set_ylabel('Actual Values')
sns.scatterplot(x=pred2,y=y_test,ax=ax)

## SVR 

In [None]:
# Print the test-set MAE and the winning grid parameters for SVR,
# then plot predictions against actual prices.
eval1(pred3,y_test)
# Blank line, best parameters, blank line — emitted by a single print call.
print('',svob.best_params_,'',sep='\n')
fig,ax=plt.subplots(figsize=(10,8))
ax.set_xlabel('SVM Predictions')
ax.set_ylabel('Actual Values')
sns.scatterplot(x=pred3,y=y_test,ax=ax)

# NOTE:
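## **If extrapolation beyond the price range seen in training is not a concern, I would choose the Random Forest Regressor. Tree ensembles cannot predict values outside the range of their training targets, which is why extrapolation is the deciding caveat.**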