In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import math 
import plotly.express as px

import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Load the raw listings and report the table size.
# The message previously ran the counts into the following word ("123rows").
df = pd.read_csv('../input/used-bike-price/bike_data.csv')
print("Data frame has {} rows and {} columns".format(*df.shape))
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a saved row index - confirm against
# the CSV) and move the remaining headers to snake_case names.
column_names = {
    'Name': 'bike_name',
    'Location': 'city',
    'Running': 'kms_driven',
    'Price': 'price',
    'Owner': 'owner',
}
df = df.drop(columns='Unnamed: 0').rename(columns=column_names)
df.head()

In [None]:
# The trailing four characters of the listing title are treated as the model
# year; anything that is not a number becomes NaN.
df['year'] = pd.to_numeric(df['bike_name'].str[-4:], errors='coerce')
# Keep only the part of the title in front of those four characters.
df['bike_name'] = df['bike_name'].str[:-4].str.rstrip()

# Age is measured against a fixed reference year.
y = 2021
df['age'] = y - df['year']
df = df.drop(columns='year')

df.head()

In [None]:
# Digits immediately followed by "cc" in the title give the engine size;
# titles without such a token get an empty string for now.
df['power'] = df['bike_name'].str.extract(r"(\d+)cc", expand=False).fillna("")

# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3,
#                        ):
#     print(df[['bike_name','power']].value_counts())

In [None]:
# Fallback engine size: the first run of digits in the title, accepted only
# when it is at least 100 (smaller numbers are not treated as a displacement).
first_number = pd.to_numeric(
    df['bike_name'].str.extract(r"(\d+)", expand=False), errors='coerce'
)
# Rejected values become NaN directly. The previous version wrote 0 and then
# replaced every 0.0 in the *whole* frame with NaN, which also blanked
# age == 0 and caused those rows to be dropped by the later dropna().
df['power1'] = first_number.where(first_number >= 100)

# Empty strings (e.g. titles with no "cc" token) count as missing.
df = df.replace('', np.nan)

# Assign the result back instead of an in-place fillna on an attribute
# access, which is not guaranteed to update the frame.
df['power'] = df['power'].fillna(df['power1'])

In [None]:
# The helper column has served its purpose; store the engine size as float.
df = df.drop(columns='power1')
df = df.astype({'power': float})

# with pd.option_context('display.max_rows', None,
#                        'display.max_columns', None,
#                        'display.precision', 3,
#                        ):
#     print(df[['bike_name','power']].value_counts())

In [None]:
# Normalise the price text step by step: remove newlines, keep the first
# space-separated token, strip everything except digits and dots, then
# convert to float (conversion errors are raised, not hidden).
df['price'] = (
    df['price']
    .replace('\n', '', regex=True)
    .astype(str)
    .str.split(" ")
    .str[0]
    .str.replace(r'[^\d.]+', '', regex=True)
    .astype('float64', errors='raise')
)

df.head()

In [None]:
# Drop the word "New" from inside titles, then take the first word as brand.
df['bike_name'] = df['bike_name'].str.replace(" New ", " ")
df['brand'] = df['bike_name'].str.split(' ').str[0]

# Empty strings count as missing values.
df = df.replace('', np.nan)

# "Royal Enfield" is a two-word brand that the first-word rule cuts short.
df['brand'] = df['brand'].replace('Royal', 'Royal Enfield')
df['brand'].value_counts()

In [None]:
# Strip everything except digits and dots from the odometer text and convert
# it to float (conversion errors are raised, not hidden).
df['kms_driven'] = (
    df['kms_driven']
    .str.replace(r'[^\d.]+', '', regex=True)
    .astype('float64', errors='raise')
)

In [None]:
# Treat empty strings as missing, discard every incomplete row, then show
# the resulting schema.
df = df.replace('', np.nan).dropna()
df.info()

In [None]:
# Summary statistics of the cleaned frame.
df.describe()

In [None]:
# Persist the cleaned frame (without the index) to the working directory.
df.to_csv('clean.csv',index=False)

In [None]:
# Keep only listings inside hand-picked plausibility bounds (all bounds are
# exclusive): price, odometer reading and age.
in_range = (
    df['price'].gt(10000) & df['price'].lt(150000)
    & df['kms_driven'].gt(1000) & df['kms_driven'].lt(100000)
    & df['age'].lt(25)
)
df = df[in_range]

In [None]:
def removeOutliers(data, col):
    """Keep only the rows whose ``col`` value lies strictly inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, computed from
    ``data[col]``. The IQR is printed for reference.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column to screen.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the filter.

    Side effects
    ------------
    For backward compatibility the module-level globals ``filtered_data``
    (the returned frame) and ``outlier_free_list`` (the surviving values of
    ``col`` as a list) are still updated on every call.
    """
    global outlier_free_list
    global filtered_data

    Q1, Q3 = np.quantile(data[col], [0.25, 0.75])
    IQR = Q3 - Q1

    print("IQR value for column %s is: %s" % (col, IQR))

    lower_range = Q1 - 1.5 * IQR
    upper_range = Q3 + 1.5 * IQR

    # One vectorised mask replaces the per-element Python loop + isin lookup.
    inside = (data[col] > lower_range) & (data[col] < upper_range)
    outlier_free_list = data.loc[inside, col].tolist()
    filtered_data = data.loc[inside]
    return filtered_data

# Columns screened for outliers, in the order the filters are applied.
out_columns = ['kms_driven', 'price', 'power', 'age']
for i in out_columns:
    removeOutliers(df, i)
    # removeOutliers publishes its result in the global `filtered_data`.
    # Feed it back in immediately so every column's filter builds on the
    # previous ones; before, each call restarted from the unfiltered frame and
    # only the last column's filter survived.
    df = filtered_data

print("Shape of data after outlier removal is: ", df.shape)

In [None]:
# Independent copy used for the plots and modelling below, so the cleaned
# `df` is not altered by later steps.
bike=df.copy()

In [None]:
# Price distribution per brand on a wide canvas.
figure = plt.figure(figsize=(20, 8))
sns.boxplot(data=bike, x='brand', y='price')

In [None]:
# Price against age; tick labels are shown as plain numbers rather than in
# scientific notation.
plt.figure(figsize=(20, 10))
sns.lineplot(data=bike, x='age', y='price')
plt.ticklabel_format(style='plain')

In [None]:
# Number of listings in each `owner` category.
sns.countplot(x='owner', data=bike)

In [None]:
# Interactive 3D view of price against age and odometer reading, coloured by
# brand. Plotly draws its own figure, so no matplotlib figure is created here
# (the earlier plt.figure call only produced an empty matplotlib canvas).
fig = px.scatter_3d(bike, x='age', y='kms_driven', z='price', color='brand')
fig.show()

In [None]:
# Correlation matrix of the numeric columns only. Restricting the frame first
# keeps this working on pandas versions where DataFrame.corr no longer skips
# text columns (bike_name, city, owner, brand) by itself.
sns.heatmap(bike.select_dtypes(include='number').corr(), annot=True, cmap="RdBu")
plt.show()

In [None]:
# One-hot encode the two categorical columns; the first level of each is
# dropped and serves as the implicit baseline.
categorical_columns = ['owner', 'brand']
bike = pd.get_dummies(bike, columns=categorical_columns, drop_first=True)
bike.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the price; the free-text name and city columns are not used as
# features.
y = bike['price']
X = bike.drop(columns=['price', 'bike_name', 'city'])

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

In [None]:
# Show the shape of each part of the split.
for label, part in (
    ("x train: ", X_train),
    ("x test: ", X_test),
    ("y train: ", y_train),
    ("y test: ", y_test),
):
    print(label, part.shape)

In [None]:
# linear regression feature importance

from sklearn.linear_model import LinearRegression
from matplotlib import pyplot

# Fit a plain linear regression on the full feature matrix and use its raw
# coefficients as a rough importance score.
model = LinearRegression().fit(X, y)
importance = model.coef_

# One line per feature: positional index and coefficient.
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

# Bar chart of the coefficients, in feature order.
pyplot.bar(list(range(len(importance))), importance)
pyplot.show()

In [None]:
from statsmodels.api import OLS, add_constant

# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; without it the fit is forced through the origin and the
# reported R-squared is the uncentered one.
# The cast to float gives the design matrix a single numeric dtype even when
# the dummy columns are stored as booleans.
model= OLS(y_train, add_constant(X_train.astype(float))).fit()
print(model.summary())

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Score history, one entry appended per call to rent_pred_model (rounded to
# two decimals): mean cross-validation R2, train R2 and test R2.
CV = []
R2_train = []
R2_test = []

def rent_pred_model(model,model_name):
    """Fit ``model`` on the training split, then print and plot its scores.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and appends the rounded scores to the module-level lists ``R2_train``,
    ``R2_test`` and ``CV``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    model_name : str
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    None
    """
    # Training model
    model.fit(X_train,y_train)

    # R2 score of train set
    y_pred_train = model.predict(X_train)
    R2_train_model = r2_score(y_train,y_pred_train)
    R2_train.append(round(R2_train_model,2))

    # R2 score of test set
    y_pred_test = model.predict(X_test)
    R2_test_model = r2_score(y_test,y_pred_test)
    R2_test.append(round(R2_test_model,2))

    # Mean score over a 3-fold cross validation on the train set
    cross_val = cross_val_score(model ,X_train ,y_train ,cv=3)
    cv_mean = cross_val.mean()
    CV.append(round(cv_mean,2))

    # Error metrics on the test set
    mae = metrics.mean_absolute_error(y_test,y_pred_test)
    rmse = math.sqrt(metrics.mean_squared_error(y_test,y_pred_test))

    # Printing results
    print("Train R2-score :",round(R2_train_model,2))
    print("Test R2-score :",round(R2_test_model,2))
    print("Train CV scores :",cross_val)
    print("Train CV mean :",round(cv_mean,2))
    print("MAE :", round(mae,5))
    print("RMSE :", round(rmse,5))

    # Plotting graphs
    fig, ax = plt.subplots(1,2,figsize = (10,4))

    # Left: density of the training residuals. kdeplot draws the same curve
    # as the deprecated distplot(..., hist=False).
    ax[0].set_title('Residual Plot of Train samples')
    sns.kdeplot(y_train-y_pred_train, ax = ax[0])
    ax[0].set_xlabel('y_train - y_pred_train')

    # Right: actual vs predicted values on the test set
    ax[1].set_title('y_test vs y_pred_test')
    ax[1].scatter(x = y_test, y = y_pred_test)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test')

    plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares with scikit-learn's default settings.
lr = LinearRegression()
rent_pred_model(model=lr, model_name="Linear_regressor.pkl")

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Creating Ridge model object
rg = Ridge()
# Candidate regularisation strengths: 14 values, log-spaced from 1e-3 to 1e3
alpha = np.logspace(-3,3,num=14)

# Randomized search over alpha. The search samples only a subset of the
# candidates, so the seed is fixed to make the chosen alpha (and therefore
# every reported score) reproducible between runs.
rg_rs = RandomizedSearchCV(estimator = rg, param_distributions = dict(alpha=alpha), random_state=101)

rent_pred_model(rg_rs,"ridge.pkl")

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

# Creating Lasso model object
ls = Lasso()
# Candidate regularisation strengths: 14 values, log-spaced from 1e-3 to 1e3
alpha = np.logspace(-3,3,num=14)

# Randomized search over alpha. The search samples only a subset of the
# candidates, so the seed is fixed to make the chosen alpha (and therefore
# every reported score) reproducible between runs.
ls_rs = RandomizedSearchCV(estimator = ls, param_distributions = dict(alpha=alpha), random_state=101)

rent_pred_model(ls_rs,"lasso.pkl")