In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
#Modules for EDA
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# The bare "seaborn" style name was deprecated in matplotlib 3.6 and later removed;
# prefer the renamed "seaborn-v0_8" style and fall back to the old name on older releases.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
%matplotlib inline

In [None]:
#Machine learning packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
import joblib

In [None]:
df = pd.read_csv('../input/used-bikes-prices-in-india/Used_Bikes.csv')
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
# Drop the 'bike_name' column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['bike_name'], errors='ignore')

# **Let's figure out the unique bike brands**

In [None]:
df['brand'].value_counts()

# **Grouping bike brands with 1000 or fewer listings into "Others"**

In [None]:
# Count rows per brand, then keep only the brands with 1000 or fewer rows.
# NOTE(review): the variable name says "100" but the threshold applied is 1000;
# the name is kept because later cells reference it.
brands = df['brand'].value_counts()
bike_brands_less_than_100 = brands[brands<=1000]
bike_brands_less_than_100

In [None]:
# Total number of rows that belong to the low-frequency brands.
other_brands_total = bike_brands_less_than_100.sum()
print("Other brands total", other_brands_total)

In [None]:
others = bike_brands_less_than_100.keys()
others

In [None]:
# Collapse the low-frequency brands into a single "Others" category.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when Copy-on-Write is enabled.
df['brand'] = df['brand'].replace(list(others), "Others")

# Horizontal bar chart of rows per brand, with the most frequent brand at the top.
df['brand'].value_counts().plot(kind="barh")
plt.gca().invert_yaxis()
plt.show()

In [None]:
bike_groups = df.groupby('brand')

In [None]:
def get_average_plot_data(col,scale=None):
    """Plot the per-brand mean of one column as a horizontal bar chart.

    Parameters
    ----------
    col : str
        Name of the column of the module-level ``df`` to average per brand.
    scale : str, optional
        Matplotlib x-axis scale name (for example ``'symlog'``). The x-axis
        scale is only changed when a truthy value is given.

    Returns
    -------
    None
        The chart is rendered with ``plt.show()``.
    """
    value_label = f"Average {col}"

    # A single grouped aggregation replaces the per-brand get_group() loop and
    # removes the dependency on the separately created `bike_groups` object.
    # sort=False keeps brands in order of first appearance, which is the same
    # order df['brand'].unique() produced.
    df1 = (
        df.groupby('brand', sort=False)[col]
        .mean()
        .rename(value_label)
        .rename_axis("Brand")
        .reset_index()
    )

    sns.barplot(data=df1, x=value_label, y="Brand")
    plt.title(f"Average {col} of various brands")
    if scale:
        plt.xscale(scale)
    plt.show()

# **Average age of each bike brand**

In [None]:
get_average_plot_data('age')

# **Average price of each bike brand**

In [None]:
get_average_plot_data('price','symlog')

# **Average KMs driven of each bike brand**

In [None]:
get_average_plot_data('kms_driven')

# **Average power of each bike brand**

In [None]:
get_average_plot_data('power')

# **Pair Plot**

In [None]:
cols_to_plot = ['brand', 'price', 'kms_driven', 'age', 'power']
# sns.pairplot is a figure-level function that builds its own figure, so the
# plt.figure(figsize=...) call that used to precede it only produced an extra
# empty figure and did not affect the pair plot's size.
sns.pairplot(df[cols_to_plot], hue='brand')
plt.show()

# **City Counts**

In [None]:
df['city'].value_counts()

# **Popular cities**

In [None]:
city_counts = df['city'].value_counts()
city_counts[city_counts>=500]

# **Grouping cities with fewer than 500 listings into "Others"**

In [None]:
# Cities that appear in fewer than 500 rows.
other_cities = city_counts[city_counts<500]
# Assign the result back rather than using replace(..., inplace=True) on the
# column selection; the chained in-place form is deprecated in pandas 2.x and
# has no effect on `df` under Copy-on-Write.
df['city'] = df['city'].replace(list(other_cities.keys()), 'Others')

In [None]:
# Horizontal bar chart of rows per city, with the largest count at the top.
fig, ax = plt.subplots(figsize=(10, 10))
df['city'].value_counts().plot(kind='barh', ax=ax)
ax.invert_yaxis()
plt.show()

In [None]:
df.head()

In [None]:
df['owner'].value_counts()

In [None]:
# Merge every non-first ownership level into one category.
# Assigned back instead of replace(..., inplace=True) on the column selection,
# which is deprecated in pandas 2.x and ineffective under Copy-on-Write.
df['owner'] = df['owner'].replace(
    ['Second Owner','Third Owner','Fourth Owner Or More'],
    'Second Owner or more',
)
df['owner'].value_counts()

# **Feature Engineering**

## **One Hot encoding**

In [None]:
cols_to_encode = ['brand', 'city', 'owner']
dummies = pd.get_dummies(df[cols_to_encode],drop_first=True)
dummies.sample(10)

## **Feature Scaling** 

In [None]:
# Rescale the listed numeric feature columns with MinMaxScaler.
# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed later in the notebook, so the test rows take part
# in determining the scaling range. Consider fitting on the training split only.
cols_to_scale = ['kms_driven', 'age', 'power']
scale = MinMaxScaler()
scalled = scale.fit_transform(df[cols_to_scale])

In [None]:
# Write the scaled values back in one assignment; the columns of `scalled`
# are in the same order as `cols_to_scale`, so no manual column counter is needed.
df[cols_to_scale] = scalled

In [None]:
df.head()

In [None]:
# Remove the categorical columns that were one-hot encoded into `dummies`.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run after the columns have already been dropped.
df = df.drop(columns=cols_to_encode, errors='ignore')
df.head()

In [None]:
new_df = pd.concat([dummies,df],axis=1)
new_df.shape

In [None]:
new_df.head()

In [None]:
# Total number of missing cells across the whole feature frame.
new_df.isna().sum().sum()

# **Splitting and Training data**

In [None]:
x ,y = new_df.drop(['price'],axis=1), new_df['price']
x.shape, y.shape

In [None]:
y.head()

In [None]:
# Fix the seed so the 70/30 split, and every score computed from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
x_train.shape, x_test.shape

In [None]:
y_train.shape, y_test.shape

# **Model Building and predictions**

In [None]:
model = LinearRegression()
model.fit(x_train, y_train)

In [None]:
model.score(x_test,y_test)

In [None]:
model.score(x_train,y_train)

# **That's a decent score**

# **Cross Validation scores**

In [None]:
models = [LinearRegression(), Ridge(), Lasso(), KNeighborsRegressor()]

In [None]:
# 5-fold cross-validation for each candidate estimator; collect the mean score per model.
mean_scores = []
# A dedicated loop variable is used so the fitted `model` from the earlier cell
# is not overwritten by the last estimator in `models`.
for candidate in models:
    print("Model:", candidate)
    cv_scores = cross_val_score(candidate, x, y, cv=5)
    mean_score = cv_scores.mean()  # computed once, reused for printing and storing
    print("Cross Val Scores:", cv_scores)
    print("Mean score:", mean_score)
    mean_scores.append(mean_score)
    print('\n')

In [None]:
# String form of each estimator, used as the label in the comparison chart.
mds = [str(estimator) for estimator in models]
mds

In [None]:
mean_df = pd.DataFrame({'Model':mds, 'Mean CVScore':mean_scores})
sns.barplot(data=mean_df,y='Model', x='Mean CVScore')
plt.show()

In [None]:
svm_model = SVR()
svm_model.fit(x_train,y_train)
svm_model.score(x_test,y_test)

In [None]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred_test = model.predict(x_test)
mean_squared_error(y_test, y_pred_test)

# **Actual vs Predicted**

In [None]:
def actual_vs_predicted(model,data,y_true,title=None):
    """Scatter-plot true target values against a model's rounded predictions.

    Parameters
    ----------
    model : object
        Estimator exposing ``predict``; it is called on ``data``.
    data : array-like
        Feature rows passed to ``model.predict``.
    y_true : array-like
        True target values, plotted on the x-axis.
    title : str, optional
        Title passed to the plot.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    rounded_predictions = np.round(model.predict(data))
    comparison = pd.DataFrame({'Actual': y_true, 'Predicted': rounded_predictions})

    _, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(data=comparison, x='Actual', y='Predicted', ax=ax)
    ax.set_title(title)
    plt.show()

In [None]:
actual_vs_predicted(model,x_test,y_test,'Linear Regression Test Data')

# **Let's use RandomForestRegressor**

In [None]:
# Seed the forest so its randomised training, and therefore the reported scores, are repeatable.
rfr_model = RandomForestRegressor(random_state=42)
rfr_model.fit(x_train, y_train)

In [None]:
rfr_model.score(x_test, y_test)

In [None]:
rfr_model.score(x_train, y_train)

In [None]:
actual_vs_predicted(rfr_model,x_test, y_test, "RandomForestRegressor Test data")

In [None]:
actual_vs_predicted(rfr_model,x_train, y_train, "RandomForestRegressor Train data")

# **Saving RandomForestRegressor model as file**

In [None]:
# Persist the fitted random forest to the file 'RFR-Model' in the working directory.
# NOTE(review): only the estimator is saved here; the fitted MinMaxScaler (`scale`)
# and the one-hot column layout are not written out — presumably both are needed
# to prepare new inputs for this model; confirm and save them alongside if so.
joblib.dump(rfr_model, 'RFR-Model')

In [None]:
!ls