In [None]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import math
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

## Load Data

In [None]:
# Read the raw car listings; the path is relative to the notebook's working directory.
DATA_PATH = "../input/vehicle-dataset-from-cardekho/Car details v3.csv"
df = pd.read_csv(DATA_PATH)

# First look at the raw frame: leading rows, column dtypes / non-null counts,
# and finally the (rows, columns) tuple as the cell output.
print(df.head())
df.info()
df.shape

## Data Preprocessing

In [None]:
# Drop exact duplicate rows, then report the new size and column summary.
df = df.drop_duplicates()
# A bare `df.shape` in the middle of a cell is evaluated and thrown away
# (only a cell's last expression is echoed), so print it explicitly.
print(df.shape)
df.info()

In [None]:
# Check null values.
# Only the last expression of a cell is echoed, so the two intermediate checks
# are passed to display() explicitly; otherwise they are computed and discarded.
display(df.isnull().any(axis=0))          # per column: does it contain any null?
display(df.loc[df.isnull().any(axis=1)])  # the rows that hold at least one null
df.isnull().sum(axis=0)                   # per column: number of nulls


In [None]:
# Drop every row that contains a null value.
df = df.dropna()
# Print the shape explicitly: a bare mid-cell `df.shape` is never displayed.
print(df.shape)
# Number of duplicate rows still present. A single count is readable as a check,
# unlike the per-row boolean Series that a bare `df.duplicated()` would dump.
df.duplicated().sum()

In [None]:
# Vehicle age in years, measured against a fixed reference year.
REFERENCE_YEAR = 2021
years_driven = REFERENCE_YEAR - df['year']
df['years_driven'] = years_driven

In [None]:
# Preview the first rows of the frame (it now carries the derived `years_driven` column).
df.head()

In [None]:
# Drop the columns that are not used as model features:
# 'name', 'year', 'torque' and 'seats'.
# ('year' has already been converted into 'years_driven' by this point.)
df = df.drop(columns=['name', 'year', 'torque', 'seats'])
df.head()

In [None]:
# Clean the mileage, engine and max_power columns.
# For each one, keep only the first whitespace-separated token of the text value
# and store it under a new column name; the original column is removed.
# NOTE(review): the trailing token (presumably a unit label) is discarded without
# being checked, so rows using a different unit would be mixed in - TODO confirm.
for raw_col, clean_col in [
    ('mileage', 'mileage(kmpl)'),
    ('engine', 'engine(CC)'),
    ('max_power', 'max_power(bhp)'),
]:
    first_token = df[raw_col].str.split(n=1, expand=True)[0]
    del df[raw_col]
    df[clean_col] = first_token

In [None]:
# Convert the three cleaned text columns to numeric dtypes in one pass.
df = df.astype({
    'mileage(kmpl)': float,
    'engine(CC)': int,
    'max_power(bhp)': float,
})

### Feature Engineering(get dummies)

In [None]:
# One-hot encode the remaining non-numeric columns. drop_first=True omits the
# first level of each encoded column so the dummies are not perfectly collinear.
df = pd.get_dummies(df, drop_first=True)
df.head()

In [None]:
# Min-max scaling of the continuous features (each is mapped onto [0, 1]).
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split further down, so the test rows contribute to the min/max used for
# scaling. Fitting on the training split only would avoid that leakage.
from sklearn.preprocessing import MinMaxScaler

num_vars = ['km_driven', 'years_driven','mileage(kmpl)','engine(CC)','max_power(bhp)']
scaler = MinMaxScaler().fit(df[num_vars])
df[num_vars] = scaler.transform(df[num_vars])

In [None]:
# Correlation of every column with the target, selling_price
# (pandas' default method for DataFrame.corr is Pearson).
df.corr()['selling_price']

In [None]:
# Remove the dummy columns judged to have low correlation with selling_price.
low_corr_cols = [
    'fuel_LPG',
    'seller_type_Trustmark Dealer',
    'owner_Fourth & Above Owner',
]
df = df.drop(columns=low_corr_cols)
df.head()

In [None]:
# Check dataset info after changing columns.
df.info()
# Print the shape explicitly: a bare mid-cell `df.shape` is evaluated and discarded.
print(df.shape)
df.describe()

# IDA

In [None]:
# Pairwise correlation matrix of all columns currently in the frame
df.corr()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

In [None]:
# Selling price against distance driven.
# km_driven has already been min-max scaled by this point, so the x axis is in scaled units.
fig, ax = plt.subplots(figsize=(15, 8))
sns.lineplot(data=df, x="km_driven", y="selling_price", ax=ax)

In [None]:
# Scatter-matrix of the target and four continuous features, with KDE curves on the diagonal.
pairplot_cols = ['selling_price', 'km_driven', 'mileage(kmpl)', 'engine(CC)', 'years_driven']
sns.pairplot(df[pairplot_cols], diag_kind='kde')

## Train-Test Split

In [None]:
# Split data and labels: selling_price is the target, every other column is a feature.
Y = df['selling_price']
# Use `columns=` rather than a positional axis: DataFrame.drop's positional
# `axis` argument was deprecated in pandas 1.3 and raises TypeError from pandas 2.0.
X = df.drop(columns='selling_price')

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=5310
)

# Report the shape of each of the four pieces.
for label, part in (
    ("x train: ", X_train),
    ("x test: ", X_test),
    ("y train: ", Y_train),
    ("y test: ", Y_test),
):
    print(label, part.shape)

## Training Models

In [None]:
#Def model training function
# Score accumulators: price_predict_model appends one entry per call, in call
# order. Re-running this cell resets them.
CV = []
R2_train = []
R2_test = []
def price_predict_model(model):
    """Fit ``model`` on the training split, then report and record its scores.

    Works on the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test`` and performs, in order:

    1. ``model.fit`` on the training split.
    2. R2 on the training and test splits and the mean of a 10-fold
       ``cross_val_score`` on the training split; each value, rounded to two
       decimals, is appended to ``R2_train``, ``R2_test`` and ``CV``.
    3. Prints the scores, followed by a statsmodels OLS summary of the training
       data. The OLS fit does not depend on ``model``; it is the same table on
       every call.
    4. Shows a scatter of actual vs. predicted test prices and a residual plot
       for both splits.
    5. Prints a Mann-Whitney U test between ``Y_test`` and the test predictions.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` / ``predict`` that ``cross_val_score`` accepts
        (plain regressors and search wrappers are both passed in this notebook).

    Returns
    -------
    None
        Results are printed, plotted and appended to the module-level lists.
    """
    # Function-level imports, as in the original cell; done first so a missing
    # package fails before any score has been appended to the global lists.
    import statsmodels.api as sm
    from scipy import stats

    # Training model
    model.fit(X_train, Y_train)

    # R2 score of train set
    y_pred_train = model.predict(X_train)
    R2_train_model = r2_score(Y_train, y_pred_train)
    R2_train.append(round(R2_train_model, 2))

    # R2 score of test set
    y_pred_test = model.predict(X_test)
    R2_test_model = r2_score(Y_test, y_pred_test)
    R2_test.append(round(R2_test_model, 2))

    # R2 mean of train set using Cross validation
    cross_val = cross_val_score(model, X_train, Y_train, cv=10)
    cv_mean = cross_val.mean()
    CV.append(round(cv_mean, 2))

    # Printing results
    print("Train R2-score :", round(R2_train_model, 2))
    print("Test R2-score :", round(R2_test_model, 2))
    print("Train CV scores :", cross_val)
    print("Train CV mean :", round(cv_mean, 2))

    # summary
    # Cast to float first: with pandas >= 2.0 get_dummies yields bool columns,
    # and a frame mixing bool and float converts to an object array, which
    # statsmodels refuses to fit.
    X2 = sm.add_constant(X_train.astype(float))
    est2 = sm.OLS(Y_train, X2).fit()
    print(est2.summary())

    #Plot
    plt.scatter(Y_test, y_pred_test, color='blue')
    plt.xlabel('Y_test')
    plt.ylabel('Y-pred_test')

    # Empty tuples hide the tick marks and labels on both axes.
    plt.xticks(())
    plt.yticks(())

    plt.show()
    #plot2
    print('\nResidual plot for training data (blue) and test data (green):')
    plt.scatter(y_pred_train, Y_train - y_pred_train, c='blue', s=40, alpha=0.5, edgecolor='white')
    plt.scatter(y_pred_test, Y_test - y_pred_test, c='green', s=40, alpha=0.5, edgecolor='white')
    # Zero-residual reference line across the full width of the axes. A fixed
    # segment from x=-10 to x=10 only covers that slice of the predicted values.
    plt.axhline(0, color='black')
    # Raw strings: '\h' is not a valid escape sequence in a normal string literal.
    plt.ylabel(r'Residuals ($y - \hat{y}$)')
    plt.xlabel(r'Predicted values ($\hat{y}$)')

    plt.show()

    #hypothesis
    # NOTE(review): Mann-Whitney U compares the two samples as independent
    # distributions rather than as paired actual/predicted values, and failing
    # to reject H0 is not by itself evidence that predictions are close.
    mwu_result = stats.mannwhitneyu(Y_test, y_pred_test)
    print(mwu_result)
    print('Can we reject H0:The prediction and real price is close:', 'Yes' if mwu_result.pvalue<0.05 else 'No')



### Linear Regression

In [None]:
# Linear Regression
# Ordinary least squares on the training split. scikit-learn's fit() returns
# the estimator itself, so `lmfit` refers to the same object as `lm`.
lm = LinearRegression()
lmfit = lm.fit(X_train, Y_train)

In [None]:
# Fitted parameters of the linear model
print('Intercept:', lm.intercept_)
print('Coefficients:\n', lm.coef_)

# R-squared on the training split, via the estimator's score method
train_r2 = lm.score(X_train, Y_train)
print('\nR-squared:', train_r2)

# Square root of the mean squared training residual (i.e. the training RMSE),
# reported under the "Standard error" label
train_residuals = Y_train - lm.predict(X_train)
stderr = math.sqrt(np.mean(train_residuals ** 2))
print('\nStandard error:', stderr)

# Full evaluation: scores, OLS summary, plots and hypothesis test
price_predict_model(lm)

### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# Creating Ridge model object
rg = Ridge()
# Candidate alphas: log-spaced between 1e-2 and 1e2 (np.logspace's default is 50 points)
alpha = np.logspace(-2, 2)

# RandomizedSearchCV draws its candidate alphas at random. Seed it with the
# same value used for the data split and the forests so the selected alpha,
# and therefore every reported score, is the same on each run.
ridge = RandomizedSearchCV(
    estimator=rg,
    param_distributions=dict(alpha=alpha),
    random_state=5310,
)

price_predict_model(ridge)



In [None]:
# Best estimator for ridge. `best_estimator_` exists only once the search has
# been fitted, which happens inside the price_predict_model(ridge) call.
print("Best estimator:",ridge.best_estimator_)

### Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

ls = Lasso()
alpha = np.logspace(-2, 2) # range for alpha: log-spaced between 1e-2 and 1e2

# Seed the randomized search (same seed as the data split and the forests) so
# the sampled alphas, and the scores recorded for this model, are reproducible.
lasso = RandomizedSearchCV(
    estimator=ls,
    param_distributions=dict(alpha=alpha),
    random_state=5310,
)
price_predict_model(lasso)



In [None]:
# Best estimator for lasso. `best_estimator_` exists only once the search has
# been fitted, which happens inside the price_predict_model(lasso) call.
print("Best estimator:",lasso.best_estimator_)

In [None]:
# Random Forest before tuning
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=5310)
# price_predict_model fits the estimator itself, so a separate rf.fit(...) here
# would only train the same seeded forest twice.
price_predict_model(rf)

### Random Forest Regression

In [None]:
# Random Forest parameter tuning
from sklearn.model_selection import GridSearchCV
# Candidate forest sizes: 10, 30, 50, ..., 490 trees
param_grid = {"n_estimators": range(10, 500, 20)}
# Seed the forest being tuned: an unseeded RandomForestRegressor gives slightly
# different CV scores on every run, so the "best" n_estimators could change
# between runs.
grid_search = GridSearchCV(RandomForestRegressor(random_state=5310), param_grid, cv=3)

grid_search.fit(X_train, Y_train)
grid_search.best_params_, grid_search.best_score_

In [None]:
# Random forest after tuning
from sklearn.ensemble import RandomForestRegressor
# Take n_estimators from the grid search rather than a hand-copied constant, so
# this cell always evaluates the configuration the search actually selected.
rf = RandomForestRegressor(random_state=5310, **grid_search.best_params_)
# price_predict_model fits the estimator itself; no separate fit is needed here.
price_predict_model(rf)

After parameter tuning, the Random Forest R² score rose to 0.98 on the training set and 0.91 on the test set.

# Conclusion

In [None]:
# Summary table: one row per evaluated model, in the order price_predict_model
# was called.
# NOTE(review): the score lists grow on every price_predict_model call, so
# re-running a model cell without resetting them leaves more scores than names.
Models = ["LinearRegression","Ridge Regression","Lasso Regression","Random Forest","Random Forest(Tuned)"]
summary_columns = {
    'Model': Models,
    'R Squared(Train)': R2_train,
    'R Squared(Test)': R2_test,
    'CV score mean(Train)': CV,
}
results = pd.DataFrame(summary_columns)
display(results)

I used three linear regression models (Linear, Ridge and Lasso) to predict car price, and they all gave very similar results. Linear Regression and Ridge Regression have the highest R^2 on the training and test sets, and Ridge has the highest mean CV score on the training set (0.62). In conclusion, among these linear models, Ridge Regression is the best model for this prediction.

#### References
stackoverflow,’Find p-value (significance) in scikit-learn LinearRegression’,viewed 26 May 2021, https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression

Scikit learn, ‘Linear Regression Example’,viewed 28 May 2021, https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html

Kaggle, ‘Car price prediction’, viewed 28 May 2021, https://www.kaggle.com/mohaiminul101/car-price-prediction

Stack abuse, ‘Linear Regression in Python with Scikit-Learn’, viewed 30 May 2021,
https://stackabuse.com/linear-regression-in-python-with-scikit-learn/

CSDN,’【机器学习小论文】sklearn随机森林RandomForestRegressor代码及调参’, viewed 30 may 2021, https://blog.csdn.net/xiaohutong1991/article/details/108178143

cnblogs,’scikit-learn随机森林调参小结’, viewed 31 May 2021, https://www.cnblogs.com/pinard/p/6160412.html

Numpy, ‘numpy.logspace’, viewed 31 May 2021,
https://numpy.org/doc/stable/reference/generated/numpy.logspace.html