**In this notebook I will predict car prices using machine learning algorithms.**
The scope of this notebook:
1. Reading the data.
2. Understanding the data & initial cleaning.
3. Preprocessing and more cleaning of the data.
4. Generating rating for class features. (explained later in the chapter)
5. Fitting algorithms using CV to check best fit- with and without the ratings from line item no. 4. 
6. Decide to predict with or without the ratings.
7. Choosing best model for predictions.
8. Predicting on test data after it has been processed in the same manner as the train data.

Let's put the pedal to the metal...


Import libraries

In [None]:
import numpy as np 
import pandas as pd
from scipy.stats import kurtosis, skew
import matplotlib.pyplot as plt
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


**1. Get data function- reads the downloaded csv file.**

In [None]:
# relative path of the downloaded CSV file
raw_csv_path = '../input/car-prediction-using-regression/Car details v3.csv'
data = pd.read_csv(raw_csv_path)

**2. Understanding the data and initial cleaning.**

Unprocessed data:

In [None]:
# show the frame exactly as it was read from the CSV, before any cleaning
data

In [None]:
# express the selling price in dollars instead of rupees
rupees_per_dollar = 74.93
data['selling_price'] = data['selling_price'] / rupees_per_dollar

# torque is dropped: it is almost the same information as power (linearly related)
data = data.drop(columns='torque')

# list the distinct values of each class column
for caption, column in (
    ('\n unique values in transmission column \n', 'transmission'),
    ('\n unique values in seller type column \n', 'seller_type'),
    ('\n unique values in owner column \n', 'owner'),
    ('\n unique values in seats column\n', 'seats'),
):
    print(caption, data[column].unique())

In [None]:
# change names for convenience
data = data.rename(columns={'selling_price': 'price', 'km_driven': 'miles', 'mileage': 'consumption',
                            'max_power': 'power', 'seller_type': 'seller', 'name': 'manuf.'})

# fixed column order: class columns first, numeric columns next, price last (later cells
# address the columns by position). .copy() makes the selection an independent frame so the
# column assignments below never write into a view of the renamed frame.
data = data[['manuf.', 'fuel','seller','transmission','owner','year','miles', 'engine','consumption','power','seats','price']].copy()


def _first_token(cell):
    """Return the first whitespace-separated token of a cell as a string; missing cells pass through."""
    return str(cell).split()[0] if pd.notnull(cell) else cell


def _leading_number(cell):
    """Return the first whitespace-separated token of a cell as a float.

    Missing cells pass through unchanged. Cells whose first token is not a number
    (e.g. the problematic cell containing ' bhp' only) or that hold no token at all
    become NaN, so they are treated as missing values afterwards.
    """
    if pd.isnull(cell):
        return cell
    try:
        return float(str(cell).split()[0])
    except (IndexError, ValueError):
        return np.nan


# convert consumption (formerly mileage), engine volume and max power to floats
for unit_col in ['consumption', 'engine', 'power']:
    data[unit_col] = data[unit_col].apply(_leading_number)

# keep only the first word of the car name (the manufacturer) and of the owner type
for label_col in ['manuf.', 'owner']:
    data[label_col] = data[label_col].apply(_first_token)

# transform years from year issued to years old for convenience
data['year'] = 2021.0 - data['year']

**3. Preprocessing and cleaning the train data.**

In [None]:
# Group together all instances whose manuf. has 15 or fewer occurrences in the data.
# Assumption: 15 or fewer occurrences in this dataset are not enough for a decent evaluation,
# therefore these manuf. are acknowledged as "Other" in the evaluation.

manuf_occurences_in_data = data['manuf.'].value_counts().sort_values()
print("\nmanuf. occurences in data: \n\n", manuf_occurences_in_data)
print("\n")

# manufacturers with 15 or fewer occurrences are merged as "Other".
# Only the manuf. column is rewritten, so an equal string in any other column is left alone.
manuf_single_occurence = manuf_occurences_in_data[manuf_occurences_in_data <= 15].index
is_rare_manuf = data['manuf.'].isin(manuf_single_occurence)
data = data.assign(**{'manuf.': data['manuf.'].mask(is_rare_manuf, 'Other')})

# show that "Other" field in manuf. has been formed
mew_manuf_occurences_in_data = data['manuf.'].value_counts().sort_values()
print("processed manuf. occurences in data \n\n", mew_manuf_occurences_in_data)
print("\n")

# show column with nan values
na_cols = data.isna().any()
na_cols = na_cols[na_cols]
print("these are nan cols \n")
print(na_cols)
print("\n")

# find columns with zero cells
print("columns with zero cells \n")
zero_counts = (data == 0).sum(axis=0)
print(zero_counts)
print("\n")

# a consumption or power of 0 is not possible, so zeros are treated as bad readings
# and replaced by the column median
cols_with_zero = list(zero_counts[zero_counts != 0].index)
for zero_col in cols_with_zero:
    col_median = data[zero_col].median()
    # assign the result back: a chained inplace replace would only modify a temporary
    # when pandas runs with copy-on-write
    data[zero_col] = data[zero_col].replace(to_replace=0, value=col_median)

print("columns with zero cells after processing \n")
print((data == 0).sum(axis=0))
print("\n")

# input median into nan values
for col in na_cols.index:
    median = data[col].median(skipna=True)
    data[col] = data[col].fillna(median)

print("show that there are no more nan values- is null: \n")
print(data.isnull().values.any())

# show column with nan values (expected to be empty now)
na_cols = data.isna().any()
na_cols = na_cols[na_cols]
print("these are nan cols \n")
print(na_cols)
print("\n")

Processed data:

In [None]:
# show the frame after the cleaning and median imputation done in the cells above
data

Distribution of the numeric columns: skewness, kurtosis and mean of each column, shown on top of its histogram.

In [None]:
# columns from position 5 onwards are the numeric ones
numeric_data = data.iloc[:, 5:]
num_of_cols = len(numeric_data.columns)

# per-column skewness, kurtosis and mean, gathered straight into arrays
skew_vec = np.array([skew(numeric_data.iloc[:, col]) for col in range(num_of_cols)])
kurt_vec = np.array([kurtosis(numeric_data.iloc[:, col]) for col in range(num_of_cols)])
mean_vec = np.array([np.mean(numeric_data.iloc[:, col]) for col in range(num_of_cols)])

# one histogram per column, annotated with the three statistics
for col in range(num_of_cols):

    plt.figure()
    plt.grid()
    fig = sns.histplot(data=numeric_data.iloc[:, col])

    # (vertical position in axes coordinates, label, value)
    annotations = (
        (0.95, 'skew. = ', skew_vec[col]),
        (0.9, 'kurt.  = ', kurt_vec[col]),
        (0.85, 'mean  = ', mean_vec[col]),
    )
    for y_pos, label, value in annotations:
        plt.text(0.8, y_pos, label + "{:.2f}".format(value), fontsize=9, transform=fig.transAxes)

    plt.show()

**ADDITIONAL FUNCTIONS NEEDED**

This function calculates the rating for the features: manuf., owner, and seller.
The rating is based on occurrences in the data as well as price:
1. If a car manuf. occurs only once and its price is high, for instance, that can be misleading, because it may be the top-priced model of that particular manuf. Therefore both the mean price and the number of occurrences in the data play a role in calculating the rating.
2. The formula for the rating is  **manuf_weight * mean_price_of_manuf**
* manuf_weight : There are 36 manuf.; the most prevalent manuf. gets a weight of 1.0 and the least prevalent gets a weight of 1/36.

In [None]:
def get_feature_rating(train_data_X_Y, feature):
    """Rank the classes of a categorical feature by prevalence-weighted mean price.

    Each class gets a weight k / n, where n is the number of classes and k is the
    class's position when the classes are ordered from least (k=1) to most (k=n)
    prevalent. The weight is multiplied by the class's mean price, and the classes
    are then ranked by that product.

    Only the frame passed in is used, so ratings computed from the training split
    never see rows of the test split.

    Parameters
    ----------
    train_data_X_Y : pandas.DataFrame
        Frame holding the `feature` column and a 'price' column.
    feature : str
        Name of the categorical column to rate.

    Returns
    -------
    pandas.DataFrame
        Single column (labelled 0) of ranks 1.0 .. n, indexed by class name and
        ordered from the lowest to the highest weighted mean price.
    """
    feature = str(feature)

    # how many times each class appears, least prevalent first;
    # sort_index + a stable sort make the order of equally frequent classes deterministic
    feature_appearences = train_data_X_Y[feature].value_counts().sort_index()
    classes_by_prevalence = feature_appearences.sort_values(ascending=True, kind='mergesort').index
    feature_length = len(classes_by_prevalence)

    # mean price per class; selecting 'price' before aggregating keeps the
    # string columns out of the mean
    feature_price_mean = train_data_X_Y.groupby(feature)['price'].mean()

    # weights 1/n .. 1 in accordance with the number of appearances
    order = np.arange(1, feature_length + 1, dtype='float64')
    weights = pd.Series(order / feature_length, index=classes_by_prevalence).sort_index()

    # weighted mean price decides the final rank (1 = lowest)
    feature_rating = feature_price_mean.multiply(weights)
    feature_order = feature_rating.sort_values(kind='mergesort').index

    return pd.DataFrame(data=order, index=feature_order)

The next function replaces manuf. with the rating calculated beforehand. 

In [None]:
def replace_features_with_ratings(data, feature_rating):
    """Return a copy of `data` with every rated class name replaced by its rating.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the class names to replace. It is not modified.
    feature_rating : pandas.DataFrame
        Ratings indexed by class name; the first column holds the rating values.

    Returns
    -------
    pandas.DataFrame
        New frame in which each cell equal to a class name in `feature_rating`
        holds the matching rating instead. The match is done on cell values in
        every column, not on a specific column.
    """
    feature = list(feature_rating.index)
    order = list(feature_rating.iloc[:, 0])
    replaced = data.replace(to_replace=feature, value=order)

    # A fully replaced column can stay object dtype when replace() does not downcast;
    # infer_objects() turns it into a numeric column so that a later get_dummies()
    # does not one-hot encode the ratings. Columns still holding strings are untouched.
    return replaced.infer_objects()

The next function prints the score for each model - along with the kfold scores using R2 and RMSE


In [None]:
def print_scores(model_name, model):
    """Print the per-fold train and validation RMSE and R2 scores of one model.

    `model` is the result mapping of a cross-validation run; the four score
    entries read here are looked up by their metric keys.
    """
    report = (
        ("\n" + model_name + " train rmse score\n", 'train_neg_root_mean_squared_error'),
        (model_name + " val RMSE score\n", 'test_neg_root_mean_squared_error'),
        ("\n" + model_name + "  train r2 score\n", 'train_r2'),
        (model_name + " val R2 score\n", 'test_r2'),
    )
    for heading, score_key in report:
        print(heading, model[score_key])
    print("")


The next function executes one hot encoding 

In [None]:
def one_hot(data):
    """Return `data` with its class columns one-hot encoded.

    The first level of every encoded column is dropped (drop_first=True).
    """
    return pd.get_dummies(data=data, drop_first=True)

The next function standardizes the input data and returns the scaled data together with the fitted scaler object. 

In [None]:
def standardize(data):
    """Fit a StandardScaler on `data` and scale `data` with it.

    Returns a tuple (scaled data, fitted scaler) so the same scaler can be
    applied to other data later.
    """
    stand_scaler = StandardScaler().fit(data)
    scaled = stand_scaler.transform(data)
    return scaled, stand_scaler

The next function plots the scores with and without ratings for each model used in the predictions. 

In [None]:
def compare_w_wo_feature_scores(cv_all_models, model_names):
    """Plot and summarise the per-fold CV scores with versus without ratings.

    Parameters
    ----------
    cv_all_models : list of dict
        Cross-validation results. The first half holds the "without ratings"
        runs and the second half the "with ratings" runs of the same models in
        the same order. Each dict is read at the keys
        'train_'/'test_' + 'neg_root_mean_squared_error' / 'r2'.
    model_names : list of str
        Display names of the models, in the order of one half of `cv_all_models`.

    Returns
    -------
    tuple of four lists
        (train without ratings, train with ratings, test without ratings,
        test with ratings). Every list holds, per model, the mean negative RMSE
        followed by the mean R2 over the folds.
    """
    model_train_wo_ratings = []
    model_train_w_ratings = []
    model_test_wo_ratings = []
    model_test_w_ratings = []

    # a model's "with ratings" run sits half a list away from its "without ratings" run
    models_per_run = len(cv_all_models) // 2

    metrics = (
        ('Negative RMSE', 'neg_root_mean_squared_error'),
        ('R2 Score', 'r2'),
    )
    # (title word, key prefix, list of means without ratings, list of means with ratings)
    splits = (
        ('Train', 'train_', model_train_wo_ratings, model_train_w_ratings),
        ('Test', 'test_', model_test_wo_ratings, model_test_w_ratings),
    )

    for model_index, model_name in zip(range(models_per_run), model_names):
        cv_wo_ratings = cv_all_models[model_index]
        cv_w_ratings = cv_all_models[model_index + models_per_run]

        for error_type, metric in metrics:
            for split_name, key_prefix, wo_means, w_means in splits:
                # each split reads its own scores, so the "Test" summary is
                # computed from the test folds and not from the train folds
                scores_wo = cv_wo_ratings[key_prefix + metric]
                scores_w = cv_w_ratings[key_prefix + metric]

                # plot w vs wo ratings, one point per fold
                plt.figure()
                plt.grid()
                plt.scatter(x=np.arange(1, len(scores_wo) + 1), y=scores_wo,
                            marker='^', c='b', label='Without Ratings')
                plt.scatter(x=np.arange(1, len(scores_w) + 1), y=scores_w,
                            marker='s', c='r', label='With Ratings')
                plt.xlabel(xlabel='kfold')
                plt.ylabel(ylabel=error_type)
                plt.title(model_name + " " + split_name + " " + error_type)
                plt.legend()
                plt.show()

                wo_mean = np.mean(scores_wo)
                wo_means.append(wo_mean)
                print(model_name + " " + split_name + " WO ratings " + str(wo_mean))
                w_mean = np.mean(scores_w)
                w_means.append(w_mean)
                print(model_name + " " + split_name + " W ratings " + str(w_mean))
                print("\n")

    return model_train_wo_ratings, model_train_w_ratings, model_test_wo_ratings, model_test_w_ratings

**----END OF ADDITIONAL FUNCTIONS----**

Moving forward.

In [None]:
# features: every column except the target. Selecting by name keeps 'seats',
# which the positional slice 0:10 left out without any mention in the text.
data_x = data.drop(columns='price')
# target: the price column
data_y = data['price']

Split into train and test.

In [None]:
# hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.3,
    random_state=2,
)

From here on I will use only the train data for analysis, scaling etc., and later make sure that the test data undergoes the same process.
To do so I will concatenate X_train and Y_train and continue to investigate the data.

In [None]:
# put the training features and the training target side by side in one frame
train_data_X_Y = pd.concat([X_train, Y_train], axis='columns')


Create a correlation matrix to see the most important features.

In [None]:
    # set correlation matrix; only the numeric columns are correlated, because the
    # class columns still hold strings at this point
    correlation_mat = train_data_X_Y.select_dtypes(include='number').corr()
    # setup of triangle matrix: hide the upper triangle (and the diagonal), which mirrors the lower one
    mask = np.triu(np.ones_like(correlation_mat, dtype=bool))
    # setup the matplotlib figure
    plt.figure(figsize=(10,10))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(230, 20, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(correlation_mat, mask=mask, cmap=cmap, vmax=1,vmin=-1, center=0,
                square=True, linewidths=1, cbar_kws={"shrink": .5})

Key points:
1. We can see that "power" is strongly positively correlated with the price; the same can be said of "engine" (which generates the power).
2. "year" feature is negatively correlated which could be very intuitively explained- the newer the car the more expensive it is.  
3. The same as line item no. 2 can be said for the "miles" feature, less miles means less wear and tear.

**4. Generating rating for class features.**


At this stage I will check whether class features should get numeric rating rather than One-Hot encoding. This is under the notion that:
1. The manufacturer of the car affects the price of the car e.g. a Mercedes is usually more luxurious than a fiat and therefore is more expensive.
2. The owner also affects the price, first hand is preferable to second which is better than third etc.
3. In addition, seller is also a key feature. who sells the car? Individual, dealer or trustmark dealer. 

I will rate them all-according to the data, then fit the models along with the ratings.
Later I will use One-Hot encoding and again, fit all the models from scratch.
Finally I will compare the two options and conclude which one results in better outcomes. 

All next predictions will be done twice due to this idea. The test stage will include only the better option. 

**5. Fitting algorithms using CV to check best fit- with and without the ratings**


In [None]:
# store all models for future comparison:
# the six one-hot ("without ratings") results first, then the six "with ratings" results
cv_all_models = []
model_names = ["Linear Regression", "Regression Tree","Random Forest","KNN","ADABoost", "Gradient Boosting"]
feature_rating_dict = {}

# metrics collected for every model, on the train folds and on the validation folds
cv_scoring = ('r2', 'neg_root_mean_squared_error')

# copy the data for using the two options stated above
train_data_X_Y_copy = train_data_X_Y.copy()
X_test_copy = X_test.copy()

# the targets are the same for both options, so they are cast to numpy arrays once
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

# firstly with no ratings - one-hot. second iteration with the ratings.
for get_rating in [False, True]:

    # With ratings: manuf., owner and seller are replaced by their rating and only the
    # remaining class features (transmission and fuel) are one-hot encoded.
    # Without ratings: ALL class features are one-hot encoded.
    if get_rating:
        print("\n\n   Fit using rating for manuf. owner and seller \n\n")
        for feature in ['manuf.','owner','seller']:
            feature_rating = get_feature_rating(train_data_X_Y_copy, feature)
            train_data_X_Y_copy = replace_features_with_ratings(train_data_X_Y_copy, feature_rating)
            # the test features get exactly the ratings that were derived for the train data
            X_test_copy = replace_features_with_ratings(X_test_copy, feature_rating)
            feature_rating_dict[feature] = feature_rating
    else:
        print("\n\n   Fit using One-hot for manuf. owner and seller \n\n")

    train_data_X_Y = train_data_X_Y_copy

    # price was only needed to get the ratings - it is not a model input
    X_train_df = one_hot(train_data_X_Y.drop('price', axis='columns'))
    X_train_cols = X_train_df.columns

    # scale the train data; the fitted scaler is reused for the test data below
    X_train, stand_scaler = standardize(X_train_df)
    X_train = np.array(X_train)

    # Encode the test data with every level, then align it to the train columns:
    # levels the train encoding dropped or never saw are removed, and train columns
    # missing from the test data are added as all-zero columns.
    X_test = pd.get_dummies(X_test_copy).reindex(columns=X_train_cols, fill_value=0)

    # scale the test data with the statistics learned from the train data
    # (transform only - refitting here would scale train and test differently)
    X_test = stand_scaler.transform(X_test)

    # 5. FITTING ALGORIGHMS USING CV TO CHECK FOR BEST FIT- WITH AND WITHOUT THE RATINGS.

    # (banner, label used when printing the scores, estimator) in the order of model_names;
    # the randomised estimators are seeded so a re-run gives the same scores
    estimators = [
        ("LINEAR REGRESSION", 'Linear Regression', LinearRegression()),
        ("REGRESSION TREE", 'Regression Tree',
         DecisionTreeRegressor(splitter='best', max_depth=10, random_state=5)),
        ("RANDOM FOREST", 'Random Forest',
         RandomForestRegressor(max_depth=32, n_estimators=400, random_state=5)),
        ("KNN", 'KNN', KNeighborsRegressor(n_neighbors=5)),
        ("ADABOOST", 'AdaBoost Regressor', AdaBoostRegressor(n_estimators=100, random_state=5)),
        ("XGBOOST", 'XGBoost',
         xgb.XGBRegressor(objective="reg:squarederror", random_state=5, eval_metric="rmse")),
    ]

    for banner, score_label, estimator in estimators:
        print("\n\n---" + banner + "---")
        cv_result = cross_validate(estimator, X_train, Y_train, cv=10,
                                   scoring=cv_scoring,
                                   return_train_score=True, return_estimator=True)
        # every model prints its own result
        print_scores(score_label, cv_result)

        # append to result list
        cv_all_models.append(cv_result)
    
    

**6. Choosing to predict with or without the ratings.**

**7. Choosing best model for predictions.**


In [None]:
    # plot the with/without-ratings differences and keep the mean scores of every model
    (model_train_wo_ratings,
     model_train_w_ratings,
     model_test_wo_ratings,
     model_test_w_ratings) = compare_w_wo_feature_scores(cv_all_models, model_names)
    

**8. Predicting on test data after it has been processed in the same manner as the train data.**

In [None]:
    # two best models- Random Forest and XGBoost both with rating
    # X_train / X_test are the arrays left by the last ("with ratings") pass of the CV loop

    # refit the Random Forest from scratch on the whole training split (no folds held out);
    # seeded so that the reported score and predictions are repeatable
    final_rf_regressor = RandomForestRegressor(max_depth=32, n_estimators = 400, random_state=5)
    final_rf_regressor.fit(X_train, Y_train)
    # R2 on the held-out test split
    final_rf_score = final_rf_regressor.score(X_test, Y_test)
    final_rf_predictions = final_rf_regressor.predict(X_test)

    # refit XGBoost from scratch on the whole training split
    final_xgb_regressor = xgb.XGBRegressor (objective="reg:squarederror", random_state=5, eval_metric="rmse")
    final_xgb_regressor.fit(X_train, Y_train)
    final_xgb_predictions = final_xgb_regressor.predict(X_test)
    # R2 on the held-out test split
    final_xgb_score = final_xgb_regressor.score(X_test, Y_test)

    # create final predictions: both models' predictions next to the true prices
    predictions_df = pd.DataFrame(data = {'Random Forest Predictions' :final_rf_predictions,'XGBoost Predictions': final_xgb_predictions,'Targets': Y_test})
    

In [None]:
    # show the predictions of both final models side by side with the test targets
    predictions_df