In [1]:
# install required packages
!pip install dmba
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from dmba import classificationSummary, regressionSummary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
no display found. Using non-interactive Agg backend


In [2]:
# read dataset into a pandas dataframe
train_df = pd.read_csv('Training_DataSet.csv')
test_df = pd.read_csv('Test_Dataset.csv')

In [3]:
# drop rows with null outcome values and reset index
train_df = train_df.dropna(subset=['Vehicle_Trim','Dealer_Listing_Price'])

In [4]:
# add identifier for rows in each set
train_df['Source'] = ['train' for row in train_df.index]
test_df['Source'] = ['test' for row in test_df.index]

In [5]:
# combine the two datasets
cars_df = train_df.append(test_df)
cars_df = cars_df.reset_index()

In [6]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6841 entries, 0 to 6840
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 6841 non-null   int64  
 1   ListingID             6841 non-null   int64  
 2   SellerCity            6841 non-null   object 
 3   SellerIsPriv          6841 non-null   bool   
 4   SellerListSrc         6839 non-null   object 
 5   SellerName            6841 non-null   object 
 6   SellerRating          6841 non-null   float64
 7   SellerRevCnt          6841 non-null   int64  
 8   SellerState           6841 non-null   object 
 9   SellerZip             6839 non-null   float64
 10  VehBodystyle          6841 non-null   object 
 11  VehCertified          6841 non-null   bool   
 12  VehColorExt           6792 non-null   object 
 13  VehColorInt           6307 non-null   object 
 14  VehDriveTrain         6709 non-null   object 
 15  VehEngine            

In [7]:
# replace null values
cars_df = cars_df.fillna('0')

In [8]:
# check values in VehYear column
cars_df['VehYear'].value_counts()

2018    2248
2017    2115
2015    1779
2016     562
2019     137
Name: VehYear, dtype: int64

In [9]:
# create a new column VehAge based off of VehYear
cars_df['VehAge'] = 2020 - cars_df['VehYear']
cars_df = cars_df.drop('VehYear', axis = 1)

In [10]:
# ensure that quantitative columns are integers
cars_df[['SellerRating','SellerRevCnt','VehListdays','VehMileage','VehAge']] = cars_df[['SellerRating','SellerRevCnt','VehListdays','VehMileage','VehAge']].astype(int)

In [11]:
# convert zip column to string since it is not a quantitative variable
cars_df['SellerZip'] = cars_df['SellerZip'].astype(str)

In [12]:
# standardize the quantitative data
scaler = MinMaxScaler()
scaler.fit(cars_df[['SellerRating','SellerRevCnt','VehListdays','VehMileage','VehAge']])

MinMaxScaler()

In [13]:
norm_data = pd.concat([pd.DataFrame(scaler.transform(cars_df[['SellerRating','SellerRevCnt','VehListdays','VehMileage','VehAge']]),
                                  columns = ['zSellerRating','zSellerRevCnt','zVehListdays','zVehMileage','zVehAge']),
                     cars_df[['SellerCity','SellerIsPriv','SellerListSrc','SellerName','SellerState','SellerZip','VehBodystyle','VehCertified','VehColorExt','VehColorInt','VehDriveTrain','VehEngine','VehFeats','VehFuel','VehHistory','VehMake','VehModel','VehPriceLabel','VehType','VehTransmission','Vehicle_Trim','Dealer_Listing_Price','Source']]], axis = 1)

Predict Trim Type

In [14]:
# separate predictors and outcome for trims
trim_predictors = ['zSellerRating','zSellerRevCnt','zVehListdays','zVehMileage','zVehAge','SellerCity','SellerIsPriv','SellerListSrc','SellerName','SellerState','SellerZip','VehBodystyle','VehCertified','VehColorExt','VehColorInt','VehDriveTrain','VehEngine','VehFeats','VehFuel','VehHistory','VehMake','VehModel','VehPriceLabel','VehType','VehTransmission']
trim_outcome = 'Vehicle_Trim'

In [15]:
# get dummy variables for categorical variables
predictor_dummies = pd.get_dummies(norm_data[trim_predictors])
data = pd.concat([norm_data, predictor_dummies], axis = 1)

# split datasets
train_data = data.loc[data['Source'] == 'train']
train_data = train_data.drop(['SellerCity','SellerIsPriv','SellerListSrc','SellerName','SellerState','SellerZip','VehBodystyle','VehCertified','VehColorExt','VehColorInt','VehDriveTrain','VehEngine','VehFeats','VehFuel','VehHistory','VehMake','VehModel','VehPriceLabel','VehType','VehTransmission','Source'], axis = 1)
test_data = data.loc[data['Source'] == 'test']
test_data = test_data.drop(['SellerCity','SellerIsPriv','SellerListSrc','SellerName','SellerState','SellerZip','VehBodystyle','VehCertified','VehColorExt','VehColorInt','VehDriveTrain','VehEngine','VehFeats','VehFuel','VehHistory','VehMake','VehModel','VehPriceLabel','VehType','VehTransmission','Source'], axis = 1)

In [16]:
# partition data
X = train_data.drop([trim_outcome, 'Dealer_Listing_Price'], axis=1)
y = train_data[trim_outcome]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [17]:
# run a decision tree model
dtc = DecisionTreeClassifier(random_state = 1)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [18]:
# evaluate the model
classificationSummary(y_valid, dtc.predict(X_valid))

Confusion Matrix (Accuracy 0.8045)

       Prediction
Actual   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26
     0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
     1   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
     2   2   0  61   0   0   1  48   0  12   1   0   0   0   0   0   0   0   0   0   0   1   0   0   2   1   0   1
     3   0   0   0  53   0   0   0   0   0   0   0  14   0   0   0   0   0   4   0   0   0   0   0   0   0   0   0
     4   0   0   0   0  19   0   0   0   0   0   0   1   0   2   0   0   0   0   0   1   0   0   0   0   0   0   0
     5   0   0   2   0   0   5   2   0   8   0   0   0   0   0  13   0   0   0   0   0   2   0   0   4   0   3   0
     6   1   0  36   0   0   3 214   0  36   0   0   0   0   0   3   0   0   0   0   0   0   0   0   1   0   0   0
     7   0   0   3   0   0

In [19]:
# run a random forest model
rfc = RandomForestClassifier(random_state = 1)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [20]:
# evaluate the model
classificationSummary(y_valid, rfc.predict(X_valid))

Confusion Matrix (Accuracy 0.7942)

       Prediction
Actual   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25
     0   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
     1   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
     2   0   0  34   0   0   0  64   1  29   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0
     3   0   0   0  48   1   0   0   0   0   0   0  20   0   0   0   0   0   2   0   0   0   0   0   0   0   0
     4   0   0   0   0  21   0   0   0   0   0   0   1   0   1   0   0   0   0   0   0   0   0   0   0   0   0
     5   0   0   0   0   0   1   1   0  16   0   0   0   0   0  17   0   0   0   0   0   1   0   1   0   2   0
     6   0   0   9   0   0   0 207   1  77   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
     7   0   0   1   0   0   0   3   8  13   0   0   0   0

In [21]:
# # Call the random forest classifier constructor and make the expected list of all the parameters.
# rfc = RandomForestClassifier()
# parameters = {
#   "n_estimators":[5,10,50,100,150,200,250],
#   "max_depth":[2,4,8,16,32,None]
# }

# # Use the GridSearchCV model selection for cross-validation
# from sklearn.model_selection import GridSearchCV
# cv = GridSearchCV(rfc,parameters,cv=2)
# cv.fit(X_train,y_train)

# # Print the best Parameters.
# def display(results):
#   print(f'Best parameters are: {results.best_params_}')
#   print("\n")
#   mean_score = results.cv_results_['mean_test_score']
#   std_score = results.cv_results_['std_test_score']
#   params = results.cv_results_['params']
#   for mean,std,params in zip(mean_score,std_score,params):
#     print(f'{round(mean,3)} + or - {round(std,3)} for the {params}')

# display(cv)


Predict Dealer Listing Price

In [22]:
# separate predictors and outcome for price
price_predictors = ['zSellerRating','zSellerRevCnt','zVehListdays','zVehMileage','zVehAge','SellerCity','SellerIsPriv','SellerListSrc','SellerName','SellerState','SellerZip','VehBodystyle','VehCertified','VehColorExt','VehColorInt','VehDriveTrain','VehEngine','VehFeats','VehFuel','VehHistory','VehMake','VehModel','VehPriceLabel','VehType','VehTransmission']
price_outcome = 'Dealer_Listing_Price'

In [23]:
# partition the data
X = train_data.drop([price_outcome, 'Vehicle_Trim'], axis=1)
y = train_data[price_outcome]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.4, random_state = 1)

In [24]:
# run a decision tree model
dtr = DecisionTreeRegressor(random_state = 1)
dtr.fit(X_train, y_train)

DecisionTreeRegressor(random_state=1)

In [25]:
# evaluate the model
regressionSummary(y_valid, dtr.predict(X_valid))


Regression statistics

                      Mean Error (ME) : 262.1896
       Root Mean Squared Error (RMSE) : 3844.0444
            Mean Absolute Error (MAE) : 2374.1964
          Mean Percentage Error (MPE) : 0.0655
Mean Absolute Percentage Error (MAPE) : 7.2733


In [26]:
# run a random forest model
rfr = RandomForestRegressor(random_state = 1)
rfr.fit(X_train, y_train)

RandomForestRegressor(random_state=1)

In [27]:
# evaluate the model
regressionSummary(y_valid, rfr.predict(X_valid))


Regression statistics

                      Mean Error (ME) : 148.4482
       Root Mean Squared Error (RMSE) : 3126.2443
            Mean Absolute Error (MAE) : 1846.4083
          Mean Percentage Error (MPE) : -0.3556
Mean Absolute Percentage Error (MAPE) : 5.6268


In [28]:
# prepare test data for predictions
X = test_data.drop(['Vehicle_Trim', 'Dealer_Listing_Price'], axis=1)

In [29]:
# form trim predictions
predicted_trim = pd.DataFrame(rfc.predict(X))

In [30]:
# form price predictions
predicted_price = pd.DataFrame(rfr.predict(X))

In [31]:
# get the listing ids
id_df = pd.DataFrame(test_df['ListingID'])

In [32]:
# combine listing ids with the results of the predictions
results_df = pd.concat([id_df, predicted_trim, predicted_price], axis = 1)

In [33]:
# rename the columns
results_df.columns =['ListingID', 'Vehicle_Trime', 'Dealer_Listing_Price']

In [34]:
# export to csv
results_df.to_csv('daniel_wang.csv')