# I have tested various regression models on a car price prediction dataset. The code below covers EDA, feature selection, and model training.

Do check it out. I look forward to your review!

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

%matplotlib inline

In [1]:
# Read the BMW listings CSV into a DataFrame and preview the first rows.
csv_path = '../input/bmwcsv/bmw.csv'
df = pd.read_csv(csv_path)
df.head()

In [1]:
# Print pandas' concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [1]:
# Print the distinct values found in each of these three columns.
for column in ('transmission', 'model', 'fuelType'):
    print(df[column].unique())

In [1]:
# Pairwise correlation between the numeric columns only.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently; calling it on a frame that still holds string columns
# (e.g. transmission, fuelType) raises a ValueError. Selecting the numeric
# columns explicitly gives the same matrix on old and new pandas versions.
df.select_dtypes(include='number').corr()

In [1]:
# Side-by-side box plots of the price and mileage columns.
plt.figure(figsize=(10,10))
sns.set_style("darkgrid")
sns.boxplot(data=df[['price','mileage']])
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Box plot of the mpg column.
plt.figure(figsize=(10,10))
sns.set_style("darkgrid")
sns.boxplot(data=df['mpg'])
plt.xlabel("mpg")
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Scatter plot of price against year.
plt.figure(figsize=(10,10))
plt.scatter(x=df['year'], y=df['price'])
# Label the axes so the figure can be read on its own.
plt.xlabel('year')
plt.ylabel('price')
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Scatter plot of price against mileage.
plt.figure(figsize=(10,10))
plt.scatter(x=df['mileage'], y=df['price'])
# Label the axes so the figure can be read on its own.
plt.xlabel('mileage')
plt.ylabel('price')
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Scatter plot of price against engineSize.
plt.figure(figsize=(10,10))
plt.scatter(x=df['engineSize'], y=df['price'])
# Label the axes so the figure can be read on its own.
plt.xlabel('engineSize')
plt.ylabel('price')
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Strip plot of price for each value of the model column.
plt.figure(figsize=(16,5))
sns.set_style("darkgrid")
sns.stripplot(x="model", y="price", data=df)
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Display pandas' descriptive statistics for the frame.
df.describe()

In [1]:
# Strip plot of price for each value of the transmission column.
plt.figure(figsize=(16,5))
sns.set_style("darkgrid")
sns.stripplot(x="transmission", y="price", data=df)
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

In [1]:
# Strip plot of price for each value of the fuelType column.
plt.figure(figsize=(16,5))
sns.set_style("darkgrid")
sns.stripplot(x="fuelType", y="price", data=df)
# plt.show must be *called*; the bare attribute is just an expression whose
# value (the function object) gets echoed as the cell output.
plt.show()

# Model 1


In [1]:
# Names of the predictor columns.
features = ['year', 'transmission', 'mileage', 'fuelType', 'engineSize']

# Modelling frame: the original data minus the columns listed here.
dropped_columns = ['model', 'tax', 'mpg']
df2 = df.drop(columns=dropped_columns)
print(df2)

In [1]:
# Rebuild the modelling frame from df so this cell gives the same result
# every time it is re-run.
df2 = df.drop(columns=['model', 'tax', 'mpg'])

# Integer codes used to replace the text labels.
# NOTE(review): the codes are arbitrary, yet a linear model will treat them
# as ordered magnitudes — consider one-hot encoding instead; confirm intent.
d = {'Diesel':0, 'Petrol':1, 'Other':2 , 'Hybrid':3, 'Electric':4}
d1 = {'Automatic':0, 'Manual':1, 'Semi-Auto':2}

for column, codes in (('fuelType', d), ('transmission', d1)):
    # Series.map turns every label that is missing from the dict into NaN
    # without any warning. Fail here with a clear message instead of letting
    # NaNs surface later as an obscure error in the model-fitting cells.
    unknown = set(df2[column].dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(
            f"Unmapped {column} values: {sorted(map(str, unknown))}"
        )
    df2[column] = df2[column].map(codes)

print(df2)

In [1]:
from sklearn import linear_model

# Model 1: linear regression of price on the five encoded predictors.
X = df2[['year','mileage','transmission','fuelType','engineSize']]
y = df2['price']

regr = linear_model.LinearRegression()
regr.fit(X, y)

# One coefficient per column of X, in column order.
print(regr.coef_)

# Predict the price of a single hypothetical car. The query is built as a
# DataFrame carrying the training column names: passing a bare nested list to
# an estimator fitted on a DataFrame makes scikit-learn warn that X has no
# valid feature names, and silently relies on the values being in the right
# positional order.
new_car = pd.DataFrame([[2021, 100000, 2, 0, 2]], columns=X.columns)
prdictedprice = regr.predict(new_car)

print(prdictedprice)

# Model 2

In [1]:
import statsmodels.formula.api as smf

# Model 2: OLS fit through the statsmodels formula interface, then print the
# fit summary.
# NOTE(review): transmission and fuelType enter the formula as plain columns
# of df2; if they hold integer codes they are treated as numeric — confirm
# this is intended rather than C(...) categorical terms.
formula = 'price ~ year + mileage + transmission + fuelType + engineSize'
model = smf.ols(formula, data=df2)
results = model.fit()

print(results.summary())

# Model 3

In [1]:
from scipy import stats

In [1]:
# Model 3: simple linear regression of price on mileage alone.
X = df['mileage']
y = df['price']

slope, intercept, r, p, std_err = stats.linregress(X, y)

def myfunc(X):
    """Return the fitted line's value ``slope * X + intercept`` for mileage ``X``.

    Works on a scalar or on anything that supports element-wise arithmetic
    (e.g. a pandas Series).
    """
    return slope*X + intercept

# Evaluate the fitted line for every observation in one vectorised call
# rather than mapping the function over the column element by element.
mymodel = myfunc(X).tolist()

fig, ax = plt.subplots(figsize=(15, 8))

ax.scatter(X, y)
ax.plot(X, mymodel)
# The original `fig.show` was never called (no parentheses), so it did nothing.
plt.show()

print("r value of given problem:", r)
print("p value of given problem:", p)
print(std_err)
print(slope, intercept)

# Model 4

In [1]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Model 4: linear regression of price on mileage and year, evaluated on a
# held-out slice of the data.
X = df[['mileage', 'year']]
y = df['price']

# Evaluation rows: index label 7000 and everything after it.
test_df = df.loc[7000:, ['mileage','year']]
y_test = df.loc[7000:, 'price']
X_test = test_df[['mileage','year']]

# Training rows: everything that is NOT in the evaluation slice. Fitting on
# all of X and then scoring on rows the model has already seen (as the
# previous version did) leaks the test data into training and inflates the
# reported R^2 / MSE.
# NOTE(review): this is an unshuffled split by row position — confirm the
# file is not sorted in a way that makes the tail unrepresentative.
X_train = X.drop(index=test_df.index)
y_train = y.drop(index=test_df.index)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

print(regr.coef_)

# Query as a DataFrame with the training column names so scikit-learn can
# match features by name instead of warning about missing feature names.
print(regr.predict(pd.DataFrame([[100000, 2021]], columns=X.columns)))

y_pred = regr.predict(X_test)
print(y_pred)

r2 = r2_score(y_test, y_pred)
print(r2)

MSE = mean_squared_error(y_test, y_pred)
print(MSE)

# Model 5

In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Model 5: decision-tree regressor on the five encoded predictors.
predictor_columns = ['year','transmission','mileage','fuelType','engineSize']
X = df2[predictor_columns]
y = df2['price']

# Split into training and validation parts; random_state pins the shuffle.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Fit the tree on the training part only.
model = DecisionTreeRegressor(random_state=0)
model.fit(train_X, train_y)

# Score on the validation part with mean absolute error.
val_predictions = model.predict(val_X)
mae = mean_absolute_error(val_y, val_predictions)

print(mae)

In [1]:
from sklearn.feature_selection import mutual_info_regression

# Mutual-information score of each predictor with respect to price.
X = df2[['year','mileage','transmission','fuelType','engineSize']]
y = df2['price']

# Wrap the raw score array in a Series labelled by predictor name.
miscores = pd.Series(
    mutual_info_regression(X, y, random_state=0),
    name="MI_SCORE",
    index=X.columns,
)
print(miscores)

# Model 6

In [1]:
# Optimization for number of leaf nodes

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree capped at ``max_leaf_nodes`` and return its validation MAE.

    Parameters
    ----------
    max_leaf_nodes : int
        Value passed to ``DecisionTreeRegressor(max_leaf_nodes=...)``.
    train_X, train_y
        Features and target the tree is fitted on.
    val_X, val_y
        Features and target the fitted tree is scored on.

    Returns
    -------
    float
        Mean absolute error of the predictions for ``val_X`` against ``val_y``.
        (Previously the value was only printed and the function returned None,
        so callers could not compare settings programmatically.)
    """
    nmodel = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    nmodel.fit(train_X, train_y)
    val_predict = nmodel.predict(val_X)
    return mean_absolute_error(val_y, val_predict)


# getting various MAE from decision tree regressor
leaf_node_mae = {}
for max_leaf_nodes in [10,30,50,80,100,150,200]:
    leaf_node_mae[max_leaf_nodes] = get_mae(max_leaf_nodes,train_X,val_X,train_y,val_y)
    # Print each setting next to its score so the pairs cannot be misread.
    print(f"max_leaf_nodes={max_leaf_nodes}: MAE={leaf_node_mae[max_leaf_nodes]}")

# Report the setting with the lowest validation error.
best_leaf_nodes = min(leaf_node_mae, key=leaf_node_mae.get)
print(f"best max_leaf_nodes={best_leaf_nodes}: MAE={leaf_node_mae[best_leaf_nodes]}")

# As per the above results, **max_leaf_nodes=100** gives the lowest **MAE = 3374.9724**

# Model 7

In [1]:
from xgboost import XGBRegressor

# Model 7: XGBoost regressor on the five encoded predictors.
X = df2[['year','transmission','mileage','fuelType','engineSize']]
y = df2['price']

# Same split call and seed as used for the decision-tree model.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# 1000 boosting rounds with a learning rate of 0.05.
my_model1 = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0)
my_model1.fit(train_X, train_y)

# Validation mean absolute error.
preds = my_model1.predict(val_X)
MAE = mean_absolute_error(val_y, preds)

print(MAE)