# Decision Tree for Car Price Prediction

In [1]:
import numpy as np
from sklearn import tree
from sklearn import metrics
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict

In [3]:
def decision_tree(X_train, X_test, y_train, y_test, cv_folds=5, decimal_places=2, random_state=42):
    """Train a decision-tree regressor on standardized data and summarise its errors.

    Features and target are standardized with scalers fitted on the training
    split only. The tree is evaluated with shuffled K-fold cross-validation on
    the training split, then fitted on the whole training split and scored on
    the test split.

    Parameters
    ----------
    X_train, X_test : array-like, 2-D
        Training and test feature matrices (passed to ``StandardScaler``).
    y_train, y_test : array-like, 1-D
        Training and test targets. ndarrays, pandas Series and lists are all
        accepted; they are converted with ``np.asarray`` before reshaping.
    cv_folds : int, default 5
        Number of folds for the shuffled ``KFold``.
    decimal_places : int, default 2
        Rounding applied to the returned metrics table.
    random_state : int or None, default 42
        Seed used for both the fold shuffling and the tree itself, so repeated
        calls on the same data give the same metrics and predictions.

    Returns
    -------
    y_pred_test : numpy.ndarray
        Test-set predictions mapped back to the original target units.
    df_errors : pandas.DataFrame
        One-row table with the columns:

        * ``'MAE CV'``, ``'medAE CV'`` -- errors of the out-of-fold predictions,
          in original target units.
        * ``'R2 CV'`` -- mean of the per-fold R^2 scores.
        * ``'MAE CV (mean)'``, ``'medAE Train (mean)'`` -- mean of the per-fold
          MAE / median-AE scores. NOTE: these are computed on the
          *standardized* target, so they are not in the same units as
          ``'MAE CV'``; despite its label, ``'medAE Train (mean)'`` is also a
          cross-validation score.
        * ``'MAE Test'``, ``'medAE Test'`` -- test-set errors in original units.
        * ``'Dec. Tree Score'`` -- ``dtree.score`` on the standardized test set.
    scale_X, scale_y : sklearn.preprocessing.StandardScaler
        The fitted feature and target scalers.
    """
    # Coerce the targets to flat ndarrays: ``.reshape`` does not exist on
    # pandas Series or plain lists, which previously raised AttributeError.
    y_train = np.asarray(y_train).ravel()
    y_test = np.asarray(y_test).ravel()

    # Standardize features and target; the scalers are fitted on the training
    # split only and then applied to the test split.
    scale_X = StandardScaler()
    scale_y = StandardScaler()

    X_train_scaled = scale_X.fit_transform(X_train)
    y_train_scaled = scale_y.fit_transform(y_train.reshape(-1, 1)).flatten()
    X_test_scaled = scale_X.transform(X_test)
    y_test_scaled = scale_y.transform(y_test.reshape(-1, 1)).flatten()

    # Seed the tree as well as the folds: an unseeded tree breaks ties between
    # equally good splits randomly, so results would differ between runs.
    dtree = tree.DecisionTreeRegressor(random_state=random_state)
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state)

    # Per-fold cross-validation scores on the (standardized) training data.
    # The "neg_*" scorers return negated errors, hence the sign flip.
    mae_scores = -cross_val_score(dtree, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_mean_absolute_error')
    medae_scores = -cross_val_score(dtree, X_train_scaled, y_train_scaled, cv=kf, scoring='neg_median_absolute_error')
    r2_scores = cross_val_score(dtree, X_train_scaled, y_train_scaled, cv=kf, scoring='r2')

    # Out-of-fold predictions, mapped back to the original target units.
    y_pred_scaled_cv = cross_val_predict(dtree, X_train_scaled, y_train_scaled, cv=kf)
    y_pred_cv = scale_y.inverse_transform(y_pred_scaled_cv.reshape(-1, 1)).flatten()

    # Cross-validation errors on the training split, in original units.
    mae_cv = metrics.mean_absolute_error(y_train, y_pred_cv)
    medAE_cv = metrics.median_absolute_error(y_train, y_pred_cv)

    # Fit the final model on the full training split (not on the test data).
    dtree.fit(X_train_scaled, y_train_scaled)
    y_pred_scaled_test = dtree.predict(X_test_scaled)
    y_pred_test = scale_y.inverse_transform(y_pred_scaled_test.reshape(-1, 1)).flatten()

    # Errors on the test split, in original units.
    mae_test = metrics.mean_absolute_error(y_test, y_pred_test)
    medae_test = metrics.median_absolute_error(y_test, y_pred_test)
    dtree_score = dtree.score(X_test_scaled, y_test_scaled)

    # Keys are kept exactly as before so existing callers that index the
    # returned table by column name keep working.
    error_metrics = {
        'MAE CV': mae_cv,
        'medAE CV': medAE_cv,
        'R2 CV': r2_scores.mean(),
        'MAE CV (mean)': mae_scores.mean(),
        'medAE Train (mean)': medae_scores.mean(),
        'MAE Test': mae_test,
        'medAE Test': medae_test,
        'Dec. Tree Score': dtree_score
    }

    # One-row DataFrame of the metrics, rounded for display.
    df_errors = pd.DataFrame([error_metrics]).round(decimal_places)

    return y_pred_test, df_errors, scale_X, scale_y

In [3]:
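# Disabled: needs a fitted `dtree` and a `features` list in scope; neither is defined in the visible cells.
# tree_plot = tree.plot_tree(dtree, feature_names=features)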

### Visualising

In [4]:
# Disabled cell: the optional Graphviz / pydotplus imports below are wrapped in
# a string literal, so nothing is imported when this cell runs; the cell only
# echoes the string.
"""
# graphviz converts the decision tree classifier into a dot file
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
# pydotplus converts this dot file to png or displayable form
import pydotplus
"""

'\n# graphviz converts the decision tree classifier into a dot file\nfrom sklearn.tree import export_graphviz\nfrom six import StringIO\nfrom IPython.display import Image\n# pydotplus converts this dot file to png or displayable form\nimport pydotplus\n'

In [5]:
# Disabled cell: the data-loading / fitting example below is wrapped in a string
# literal and does not run.
# NOTE(review): it fits tree.DecisionTreeClassifier on `price`, whereas the
# decision_tree() helper in this notebook uses DecisionTreeRegressor -- confirm
# which is intended before re-enabling. train_test_split is also called without
# a random_state, so the split would differ between runs.
"""
import pandas as pd
from sklearn.model_selection import train_test_split

selected_df = pd.read_csv('final_car_data.csv')
X = selected_df.drop(columns='price').values  # Assuming 'price' is the target variable
y = selected_df['price'].values
features = ['year', 'power_kw', 'mileage_in_km', 'brand']
X_train, X_test, y_train, y_test = train_test_split(X,y)
dtree = tree.DecisionTreeClassifier()
dtree = dtree.fit(X_train,y_train)
#print(selected_df.drop(columns='price').columns)
"""

"\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\n\nselected_df = pd.read_csv('final_car_data.csv')\nX = selected_df.drop(columns='price').values  # Assuming 'price' is the target variable\ny = selected_df['price'].values\nfeatures = ['year', 'power_kw', 'mileage_in_km', 'brand']\nX_train, X_test, y_train, y_test = train_test_split(X,y)\ndtree = tree.DecisionTreeClassifier()\ndtree = dtree.fit(X_train,y_train)\n#print(selected_df.drop(columns='price').columns)\n"

In [6]:
# Disabled cell: the Graphviz export below is wrapped in a string literal and
# does not run. It relies on names (StringIO, export_graphviz, pydotplus, Image,
# dtree, X) that only appear inside the other disabled cells.
# NOTE(review): `feature_names=X` passes the feature matrix where a list of
# column names (e.g. `features`) looks intended, and `class_names=['0','1']`
# presumes a two-class classifier -- verify both before re-enabling.
"""
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,
               filled=True, rounded=True,
               special_characters=True,
               feature_names=X,
               class_names=['0','1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('test.png')
Image(graph.create_png())
"""

"\ndot_data = StringIO()\nexport_graphviz(dtree, out_file=dot_data,\n               filled=True, rounded=True,\n               special_characters=True,\n               feature_names=X,\n               class_names=['0','1'])\ngraph = pydotplus.graph_from_dot_data(dot_data.getvalue())\ngraph.write_png('test.png')\nImage(graph.create_png())\n"