title | author | date | output |
---|---|---|---|
ML_Predicting-Vehicle-Fuel-Efficiency | Reinp | 2020-06-07 | |
# Global chunk options applied to every chunk in this document:
#   echo    = TRUE  -> each chunk's source code is shown in the final document
#                      (echo = FALSE would hide the code but still display
#                      any results/output)
#   message = FALSE -> R messages are left out of the final document
#   warning = FALSE -> R warnings are left out of the final document
# For reference: include = FALSE evaluates a chunk but displays neither its
# code nor its output.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
library(tidyverse)
## tidyverse includes readr, ggplot2, dplyr, forcats, tibble, tidyr, purrr, stringr
## Reading our dataset
# The CSV is read relative to the current working directory instead of a
# machine-specific setwd() path, so the analysis runs on any checkout of the
# project. When knitting, the working directory defaults to the folder that
# holds this document.
# NOTE(review): assumes cars2020.csv sits next to this document -- confirm.
#
# fileEncoding = "UTF-8-BOM" strips the byte-order mark at the start of the
# file; without it the first header is read as "ï..make" instead of "make".
cars2020 <- read.csv("cars2020.csv", fileEncoding = "UTF-8-BOM")

# NOTE(review): attach() is kept only because code further down may refer to
# columns by bare name. It puts a copy of the data frame, as it is right now,
# on the search path; later changes to cars2020 are not reflected in that
# copy. Prefer cars2020$column or the data = cars2020 argument.
attach(cars2020)

# View() opens a spreadsheet-style data viewer, which only makes sense in an
# interactive session; skip it when the document is knitted or run in batch.
if (interactive()) {
  View(cars2020)
}
#cars2020 #The input data consists of 1,164 observations of 16 variables
head(cars2020)
## ï..make model mpg transmission gears drive displ cylinders
## 1 Toyota Corolla 34.2793 CVT 10 FWD 2.0 4
## 2 Toyota Corolla Hybrid 52.0000 CVT 1 FWD 1.8 4
## 3 Toyota Corolla 31.8162 Manual 6 FWD 2.0 4
## 4 Toyota Corolla XSE 33.6766 CVT 10 FWD 2.0 4
## 5 Toyota Corolla 33.0496 CVT 1 FWD 1.8 4
## 6 Toyota Corolla 33.1228 Manual 6 FWD 1.8 4
## class lv2 lv4 sidi aspiration fuelType1 atvType startStop
## 1 Compact 0 13 Y Natural Regular Gasoline None N
## 2 Compact 0 13 N Natural Regular Gasoline Hybrid Y
## 3 Compact 0 13 Y Natural Regular Gasoline None N
## 4 Compact 0 13 Y Natural Regular Gasoline None N
## 5 Compact 0 13 N Natural Regular Gasoline None N
## 6 Compact 0 13 N Natural Regular Gasoline None N
tail(cars2020)
## ï..make model mpg transmission gears drive displ cylinders
## 1159 Mercedes-Benz S560e 22.8263 Automatic 9 RWD 3 6
## 1160 BMW M2 CS Coupe 19.5165 Manual 6 RWD 3 6
## 1161 BMW M2 CS Coupe 18.7179 Manual 7 RWD 3 6
## 1162 Audi SQ7 17.1357 Automatic 8 AWD 4 8
## 1163 Audi SQ8 17.0474 Automatic 8 AWD 4 8
## 1164 Bentley Bentayga 18.7319 Automatic 8 AWD 3 6
## class lv2 lv4 sidi aspiration fuelType1 atvType
## 1159 Large 0 9 Y Turbo Premium Gasoline Plug-in Hybrid
## 1160 Subcompact 10 0 Y Turbo Premium Gasoline None
## 1161 Subcompact 10 0 Y Turbo Premium Gasoline None
## 1162 Std SUV 0 0 Y Turbo Premium Gasoline None
## 1163 Std SUV 0 0 Y Turbo Premium Gasoline None
## 1164 Std SUV 0 0 Y Turbo Premium Gasoline Plug-in Hybrid
## startStop
## 1159 Y
## 1160 Y
## 1161 Y
## 1162 Y
## 1163 Y
## 1164 Y
# How many variables and observations are there?
ncol(cars2020)
## [1] 16
nrow(cars2020)
## [1] 1164
#learn more about the dataset
# cars2020 is a data frame read from a local CSV file, not a dataset shipped
# with a package, so it has no help page: help(cars2020) only reports
# "No documentation for 'cars2020' in specified packages and libraries", and
# ??cars2020 merely launches a help search. Those lookups were therefore
# removed; the structure of the object is inspected with str() instead.
str(cars2020)
## 'data.frame': 1164 obs. of 16 variables:
## $ ï..make : chr "Toyota" "Toyota" "Toyota" "Toyota" ...
## $ model : chr "Corolla" "Corolla Hybrid" "Corolla" "Corolla XSE" ...
## $ mpg : num 34.3 52 31.8 33.7 33 ...
## $ transmission: chr "CVT" "CVT" "Manual" "CVT" ...
## $ gears : int 10 1 6 10 1 6 1 1 1 6 ...
## $ drive : chr "FWD" "FWD" "FWD" "FWD" ...
## $ displ : num 2 1.8 2 2 1.8 1.8 1.8 2 2 2 ...
## $ cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ class : chr "Compact" "Compact" "Compact" "Compact" ...
## $ lv2 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lv4 : int 13 13 13 13 13 13 13 24 24 24 ...
## $ sidi : chr "Y" "N" "Y" "Y" ...
## $ aspiration : chr "Natural" "Natural" "Natural" "Natural" ...
## $ fuelType1 : chr "Regular Gasoline" "Regular Gasoline" "Regular Gasoline" "Regular Gasoline" ...
## $ atvType : chr "None" "Hybrid" "None" "None" ...
## $ startStop : chr "N" "Y" "N" "N" ...
class(cars2020)
## [1] "data.frame"
typeof(cars2020)
## [1] "list"
length(cars2020)
## [1] 16
names(cars2020) #display variable names
## [1] "ï..make" "model" "mpg" "transmission" "gears"
## [6] "drive" "displ" "cylinders" "class" "lv2"
## [11] "lv4" "sidi" "aspiration" "fuelType1" "atvType"
## [16] "startStop"
#attributes(cars2020) #names(cars2020), class(cars2020), row.names(cars2020)
which(!complete.cases(cars2020))
## integer(0)
library(knitr)
library(mosaic)
library(psych)
# Give the first column a clean name. The column is addressed by position, so
# the rename does not depend on how its header was decoded when the file was
# read. By-name alternative, using base R only:
#   names(cars2020)[names(cars2020) == "ï..make"] <- "car_make"
colnames(cars2020)[1L] <- "car_make"
#summary statistics
summary(cars2020) ##summarizes the dataset
## car_make model mpg transmission
## Length:1164 Length:1164 Min. :10.59 Length:1164
## Class :character Class :character 1st Qu.:19.32 Class :character
## Mode :character Mode :character Median :22.63 Mode :character
## Mean :23.55
## 3rd Qu.:26.36
## Max. :57.78
## gears drive displ cylinders
## Min. : 1.000 Length:1164 Min. :1.000 Min. : 3.00
## 1st Qu.: 6.000 Class :character 1st Qu.:2.000 1st Qu.: 4.00
## Median : 8.000 Mode :character Median :3.000 Median : 6.00
## Mean : 7.303 Mean :3.077 Mean : 5.57
## 3rd Qu.: 8.000 3rd Qu.:3.600 3rd Qu.: 6.00
## Max. :10.000 Max. :8.000 Max. :16.00
## class lv2 lv4 sidi
## Length:1164 Min. : 0.000 Min. : 0.000 Length:1164
## Class :character 1st Qu.: 0.000 1st Qu.: 0.000 Class :character
## Mode :character Median : 0.000 Median : 0.000 Mode :character
## Mean : 1.587 Mean : 5.253
## 3rd Qu.: 0.000 3rd Qu.:13.000
## Max. :22.000 Max. :47.000
## aspiration fuelType1 atvType startStop
## Length:1164 Length:1164 Length:1164 Length:1164
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
#1. Dollar sign Syntax
table(cars2020$car_make)
##
## Acura Alfa Romeo Aston Martin Audi
## 18 7 4 40
## Bentley BMW Bugatti Buick
## 8 111 1 14
## Cadillac Chevrolet Chrysler Dodge
## 24 79 6 25
## Ferrari Fiat Ford Genesis
## 7 4 73 15
## GMC Honda Hyundai Infiniti
## 53 45 39 14
## Jaguar Jeep Karma Kia
## 29 36 1 37
## Lamborghini Land Rover Lexus Lincoln
## 7 23 37 23
## Lotus Maserati Mazda Mercedes-Benz
## 4 10 23 89
## MINI Mitsubishi Nissan Porsche
## 17 16 30 40
## Ram Rolls-Royce Roush Performance Subaru
## 14 8 4 24
## Toyota Volkswagen Volvo
## 64 20 21
table(cars2020$transmission)
##
## Automatic CVT Manual
## 812 128 224
table(cars2020$drive)
##
## 4WD AWD FWD PT 4WD RWD
## 164 378 298 30 294
table(cars2020$class)
##
## Compact Large Mid St Wagon Midsize
## 129 103 11 171
## Minicompact Minivan Passenger Van Sm Pickup Truck
## 32 8 2 21
## Sm St Wagon Sm SUV SPV Std Pickup Truck
## 30 237 24 87
## Std SUV Subcompact Two Seater
## 139 118 52
table(cars2020$sidi)
##
## N Y
## 217 947
table(cars2020$aspiration)
##
## Natural Super Super+Turbo Turbo
## 494 41 6 623
table(cars2020$fuelType1)
##
## Diesel Midgrade Gasoline Premium Gasoline Regular Gasoline
## 20 12 593 539
table(cars2020$atvType)
##
## Diesel FFV Hybrid None Plug-in Hybrid
## 20 24 78 1006 36
table(cars2020$startStop)
##
## N Y
## 492 672
summary(cars2020$mpg)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.59 19.32 22.63 23.55 26.36 57.78
summary(cars2020$gears)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.000 8.000 7.303 8.000 10.000
summary(cars2020$displ)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.077 3.600 8.000
summary(cars2020$cylinders)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.00 4.00 6.00 5.57 6.00 16.00
#2. Formula Syntax
## one categorical
tally(~car_make, data=cars2020)
## car_make
## Acura Alfa Romeo Aston Martin Audi
## 18 7 4 40
## Bentley BMW Bugatti Buick
## 8 111 1 14
## Cadillac Chevrolet Chrysler Dodge
## 24 79 6 25
## Ferrari Fiat Ford Genesis
## 7 4 73 15
## GMC Honda Hyundai Infiniti
## 53 45 39 14
## Jaguar Jeep Karma Kia
## 29 36 1 37
## Lamborghini Land Rover Lexus Lincoln
## 7 23 37 23
## Lotus Maserati Mazda Mercedes-Benz
## 4 10 23 89
## MINI Mitsubishi Nissan Porsche
## 17 16 30 40
## Ram Rolls-Royce Roush Performance Subaru
## 14 8 4 24
## Toyota Volkswagen Volvo
## 64 20 21
tally(~transmission, data=cars2020)
## transmission
## Automatic CVT Manual
## 812 128 224
tally(~drive, data=cars2020)
## drive
## 4WD AWD FWD PT 4WD RWD
## 164 378 298 30 294
tally(~class, data=cars2020)
## class
## Compact Large Mid St Wagon Midsize
## 129 103 11 171
## Minicompact Minivan Passenger Van Sm Pickup Truck
## 32 8 2 21
## Sm St Wagon Sm SUV SPV Std Pickup Truck
## 30 237 24 87
## Std SUV Subcompact Two Seater
## 139 118 52
tally(~sidi, data=cars2020)
## sidi
## N Y
## 217 947
tally(~aspiration, data=cars2020)
## aspiration
## Natural Super Super+Turbo Turbo
## 494 41 6 623
tally(~fuelType1, data=cars2020)
## fuelType1
## Diesel Midgrade Gasoline Premium Gasoline Regular Gasoline
## 20 12 593 539
tally(~atvType, data=cars2020)
## atvType
## Diesel FFV Hybrid None Plug-in Hybrid
## 20 24 78 1006 36
tally(~startStop, data=cars2020)
## startStop
## N Y
## 492 672
## Two categorical
tally(car_make~transmission, data=cars2020)
## transmission
## car_make Automatic CVT Manual
## Acura 12 0 6
## Alfa Romeo 6 0 1
## Aston Martin 4 0 0
## Audi 18 0 22
## Bentley 3 0 5
## BMW 91 0 20
## Bugatti 0 0 1
## Buick 12 2 0
## Cadillac 24 0 0
## Chevrolet 70 3 6
## Chrysler 5 1 0
## Dodge 20 0 5
## Ferrari 0 0 7
## Fiat 3 0 1
## Ford 58 6 9
## Genesis 14 0 1
## GMC 53 0 0
## Honda 11 25 9
## Hyundai 17 4 18
## Infiniti 10 4 0
## Jaguar 29 0 0
## Jeep 31 0 5
## Karma 1 0 0
## Kia 20 5 12
## Lamborghini 1 0 6
## Land Rover 23 0 0
## Lexus 27 10 0
## Lincoln 22 1 0
## Lotus 2 0 2
## Maserati 10 0 0
## Mazda 20 0 3
## Mercedes-Benz 75 0 14
## MINI 8 0 9
## Mitsubishi 2 12 2
## Nissan 7 20 3
## Porsche 6 0 34
## Ram 14 0 0
## Rolls-Royce 8 0 0
## Roush Performance 3 0 1
## Subaru 1 14 9
## Toyota 36 21 7
## Volkswagen 14 0 6
## Volvo 21 0 0
tally(car_make~drive, data=cars2020)
## drive
## car_make 4WD AWD FWD PT 4WD RWD
## Acura 0 9 9 0 0
## Alfa Romeo 0 3 0 0 4
## Aston Martin 0 0 0 0 4
## Audi 0 38 2 0 0
## Bentley 0 8 0 0 0
## BMW 0 65 2 0 44
## Bugatti 0 1 0 0 0
## Buick 0 8 6 0 0
## Cadillac 1 12 4 0 7
## Chevrolet 28 6 17 0 28
## Chrysler 0 1 3 0 2
## Dodge 0 5 2 0 18
## Ferrari 0 0 0 1 6
## Fiat 0 1 1 0 2
## Ford 0 8 18 16 31
## Genesis 0 7 0 0 8
## GMC 25 4 5 0 19
## Honda 2 6 37 0 0
## Hyundai 0 7 32 0 0
## Infiniti 1 6 2 0 5
## Jaguar 0 19 0 0 10
## Jeep 12 16 7 0 1
## Karma 0 0 0 0 1
## Kia 0 8 27 0 2
## Lamborghini 0 5 0 0 2
## Land Rover 14 9 0 0 0
## Lexus 2 15 8 0 12
## Lincoln 0 10 7 3 3
## Lotus 0 0 0 0 4
## Maserati 0 6 0 0 4
## Mazda 8 0 13 0 2
## Mercedes-Benz 48 0 4 0 37
## MINI 0 6 11 0 0
## Mitsubishi 6 1 9 0 0
## Nissan 3 6 14 2 5
## Porsche 6 27 0 0 7
## Ram 6 0 1 0 7
## Rolls-Royce 0 2 0 0 6
## Roush Performance 1 0 0 0 3
## Subaru 0 21 0 0 3
## Toyota 1 12 36 8 7
## Volkswagen 0 5 15 0 0
## Volvo 0 15 6 0 0
tally(car_make~class, data=cars2020)
## class
## car_make Compact Large Mid St Wagon Midsize Minicompact Minivan
## Acura 7 0 0 2 0 0
## Alfa Romeo 0 0 0 3 0 0
## Aston Martin 0 0 0 0 3 0
## Audi 3 4 0 8 0 0
## Bentley 0 0 0 1 2 0
## BMW 24 8 0 15 0 0
## Bugatti 0 0 0 0 0 0
## Buick 0 0 0 3 0 0
## Cadillac 6 2 0 6 0 0
## Chevrolet 1 2 0 2 0 0
## Chrysler 0 3 0 0 0 3
## Dodge 0 6 0 12 0 1
## Ferrari 0 0 0 0 3 0
## Fiat 0 0 0 0 0 0
## Ford 0 0 0 8 0 0
## Genesis 5 10 0 0 0 0
## GMC 0 0 0 0 0 0
## Honda 6 10 0 10 0 1
## Hyundai 7 10 0 8 0 0
## Infiniti 0 0 0 4 0 0
## Jaguar 4 0 2 5 0 0
## Jeep 0 0 0 0 0 0
## Karma 0 0 0 0 0 0
## Kia 1 6 0 11 0 1
## Lamborghini 0 0 0 0 0 0
## Land Rover 0 0 0 0 0 0
## Lexus 7 0 0 11 0 0
## Lincoln 0 6 0 5 0 0
## Lotus 0 0 0 0 4 0
## Maserati 0 3 0 3 0 0
## Mazda 7 0 0 5 0 0
## Mercedes-Benz 16 9 2 5 0 0
## MINI 0 0 0 9 3 0
## Mitsubishi 4 0 0 0 0 0
## Nissan 2 0 2 9 0 0
## Porsche 0 18 0 0 12 0
## Ram 0 0 0 0 0 0
## Rolls-Royce 1 4 2 1 0 0
## Roush Performance 0 0 0 0 0 0
## Subaru 3 2 0 4 3 0
## Toyota 14 0 0 18 2 2
## Volkswagen 8 0 0 1 0 0
## Volvo 3 0 3 2 0 0
## class
## car_make Passenger Van Sm Pickup Truck Sm St Wagon Sm SUV SPV
## Acura 0 0 0 8 0
## Alfa Romeo 0 0 0 3 0
## Aston Martin 0 0 0 0 0
## Audi 0 0 1 4 0
## Bentley 0 0 0 0 0
## BMW 0 0 0 10 0
## Bugatti 0 0 0 0 0
## Buick 0 0 1 8 0
## Cadillac 0 0 0 8 0
## Chevrolet 0 8 1 11 4
## Chrysler 0 0 0 0 0
## Dodge 0 0 0 1 0
## Ferrari 0 0 0 0 0
## Fiat 0 0 1 1 0
## Ford 2 0 0 11 8
## Genesis 0 0 0 0 0
## GMC 0 6 0 4 4
## Honda 0 1 7 9 0
## Hyundai 0 0 0 12 0
## Infiniti 0 0 0 4 0
## Jaguar 0 0 0 6 0
## Jeep 0 0 0 29 0
## Karma 0 0 0 0 0
## Kia 0 0 8 10 0
## Lamborghini 0 0 0 0 0
## Land Rover 0 0 0 9 0
## Lexus 0 0 0 8 0
## Lincoln 0 0 0 6 0
## Lotus 0 0 0 0 0
## Maserati 0 0 0 0 0
## Mazda 0 0 0 9 0
## Mercedes-Benz 0 0 0 14 6
## MINI 0 0 0 0 0
## Mitsubishi 0 0 0 12 0
## Nissan 0 0 2 5 1
## Porsche 0 0 0 4 0
## Ram 0 0 0 0 1
## Rolls-Royce 0 0 0 0 0
## Roush Performance 0 0 0 0 0
## Subaru 0 0 4 6 0
## Toyota 0 6 0 11 0
## Volkswagen 0 0 2 8 0
## Volvo 0 0 3 6 0
## class
## car_make Std Pickup Truck Std SUV Subcompact Two Seater
## Acura 0 0 0 1
## Alfa Romeo 0 0 0 1
## Aston Martin 0 0 0 1
## Audi 0 5 12 3
## Bentley 0 3 2 0
## BMW 0 14 37 3
## Bugatti 0 0 0 1
## Buick 0 2 0 0
## Cadillac 0 2 0 0
## Chevrolet 23 14 12 1
## Chrysler 0 0 0 0
## Dodge 0 5 0 0
## Ferrari 0 0 0 4
## Fiat 0 0 0 2
## Ford 19 10 14 1
## Genesis 0 0 0 0
## GMC 22 17 0 0
## Honda 1 0 0 0
## Hyundai 0 2 0 0
## Infiniti 0 2 4 0
## Jaguar 0 0 0 12
## Jeep 2 5 0 0
## Karma 0 0 1 0
## Kia 0 0 0 0
## Lamborghini 0 1 0 6
## Land Rover 0 14 0 0
## Lexus 0 4 7 0
## Lincoln 0 6 0 0
## Lotus 0 0 0 0
## Maserati 0 4 0 0
## Mazda 0 0 0 2
## Mercedes-Benz 0 6 21 10
## MINI 0 0 5 0
## Mitsubishi 0 0 0 0
## Nissan 3 2 1 3
## Porsche 0 6 0 0
## Ram 13 0 0 0
## Rolls-Royce 0 0 0 0
## Roush Performance 2 0 2 0
## Subaru 0 2 0 0
## Toyota 2 8 0 1
## Volkswagen 0 1 0 0
## Volvo 0 4 0 0
tally(car_make~sidi, data=cars2020)
## sidi
## car_make N Y
## Acura 1 17
## Alfa Romeo 0 7
## Aston Martin 4 0
## Audi 0 40
## Bentley 0 8
## BMW 0 111
## Bugatti 1 0
## Buick 2 12
## Cadillac 0 24
## Chevrolet 13 66
## Chrysler 6 0
## Dodge 25 0
## Ferrari 0 7
## Fiat 3 1
## Ford 22 51
## Genesis 0 15
## GMC 5 48
## Honda 15 30
## Hyundai 9 30
## Infiniti 0 14
## Jaguar 0 29
## Jeep 25 11
## Karma 0 1
## Kia 7 30
## Lamborghini 2 5
## Land Rover 3 20
## Lexus 3 34
## Lincoln 3 20
## Lotus 4 0
## Maserati 0 10
## Mazda 0 23
## Mercedes-Benz 2 87
## MINI 0 17
## Mitsubishi 12 4
## Nissan 13 17
## Porsche 0 40
## Ram 14 0
## Rolls-Royce 0 8
## Roush Performance 2 2
## Subaru 1 23
## Toyota 20 44
## Volkswagen 0 20
## Volvo 0 21
tally(car_make~aspiration, data=cars2020)
## aspiration
## car_make Natural Super Super+Turbo Turbo
## Acura 13 0 0 5
## Alfa Romeo 0 0 0 7
## Aston Martin 0 0 0 4
## Audi 2 0 0 38
## Bentley 0 0 0 8
## BMW 0 0 0 111
## Bugatti 0 0 0 1
## Buick 5 0 0 9
## Cadillac 7 0 0 17
## Chevrolet 54 2 0 23
## Chrysler 6 0 0 0
## Dodge 20 5 0 0
## Ferrari 2 0 0 5
## Fiat 0 0 0 4
## Ford 31 1 0 41
## Genesis 6 0 0 9
## GMC 38 0 0 15
## Honda 27 0 0 18
## Hyundai 27 0 0 12
## Infiniti 4 0 0 10
## Jaguar 0 14 0 15
## Jeep 23 1 0 12
## Karma 0 0 0 1
## Kia 25 0 0 12
## Lamborghini 6 0 0 1
## Land Rover 0 10 0 13
## Lexus 30 0 0 7
## Lincoln 3 0 0 20
## Lotus 0 4 0 0
## Maserati 0 0 0 10
## Mazda 19 0 0 4
## Mercedes-Benz 5 0 0 84
## MINI 0 0 0 17
## Mitsubishi 12 0 0 4
## Nissan 28 0 0 2
## Porsche 0 0 0 40
## Ram 12 0 0 2
## Rolls-Royce 0 0 0 8
## Roush Performance 0 4 0 0
## Subaru 17 0 0 7
## Toyota 63 0 0 1
## Volkswagen 4 0 0 16
## Volvo 5 0 6 10
tally(car_make~fuelType1, data=cars2020)
## fuelType1
## car_make Diesel Midgrade Gasoline Premium Gasoline Regular Gasoline
## Acura 0 0 18 0
## Alfa Romeo 0 0 7 0
## Aston Martin 0 0 4 0
## Audi 0 0 34 6
## Bentley 0 0 8 0
## BMW 0 0 111 0
## Bugatti 0 0 1 0
## Buick 0 0 4 10
## Cadillac 0 0 19 5
## Chevrolet 5 0 17 57
## Chrysler 0 1 0 5
## Dodge 0 4 13 8
## Ferrari 0 0 7 0
## Fiat 0 0 3 1
## Ford 4 0 3 66
## Genesis 0 0 13 2
## GMC 5 0 8 40
## Honda 0 0 5 40
## Hyundai 0 0 0 39
## Infiniti 0 0 14 0
## Jaguar 0 0 29 0
## Jeep 1 1 2 32
## Karma 0 0 1 0
## Kia 0 0 5 32
## Lamborghini 0 0 7 0
## Land Rover 3 0 20 0
## Lexus 0 0 26 11
## Lincoln 0 0 0 23
## Lotus 0 0 4 0
## Maserati 0 0 10 0
## Mazda 0 0 2 21
## Mercedes-Benz 0 0 89 0
## MINI 0 0 17 0
## Mitsubishi 0 0 1 15
## Nissan 0 0 8 22
## Porsche 0 0 40 0
## Ram 2 6 0 6
## Rolls-Royce 0 0 8 0
## Roush Performance 0 0 4 0
## Subaru 0 0 6 18
## Toyota 0 0 3 61
## Volkswagen 0 0 2 18
## Volvo 0 0 20 1
tally(car_make~startStop, data=cars2020)
## startStop
## car_make N Y
## Acura 9 9
## Alfa Romeo 1 6
## Aston Martin 4 0
## Audi 4 36
## Bentley 0 8
## BMW 0 111
## Bugatti 1 0
## Buick 5 9
## Cadillac 2 22
## Chevrolet 49 30
## Chrysler 3 3
## Dodge 23 2
## Ferrari 0 7
## Fiat 3 1
## Ford 23 50
## Genesis 13 2
## GMC 27 26
## Honda 38 7
## Hyundai 29 10
## Infiniti 14 0
## Jaguar 0 29
## Jeep 7 29
## Karma 0 1
## Kia 26 11
## Lamborghini 4 3
## Land Rover 0 23
## Lexus 28 9
## Lincoln 10 13
## Lotus 4 0
## Maserati 0 10
## Mazda 23 0
## Mercedes-Benz 3 86
## MINI 7 10
## Mitsubishi 15 1
## Nissan 30 0
## Porsche 0 40
## Ram 8 6
## Rolls-Royce 8 0
## Roush Performance 4 0
## Subaru 19 5
## Toyota 44 20
## Volkswagen 4 16
## Volvo 0 21
library(kableExtra)
# Side-by-side counts per car make for three categorical features:
# transmission (3 columns), sidi (2 columns) and startStop (2 columns).
# The three two-way tallies are bound column-wise, rendered with kable(), and
# then given a grouped header row that labels each set of columns.
cbind(
  tally(car_make ~ transmission, data = cars2020),
  tally(car_make ~ sidi, data = cars2020),
  tally(car_make ~ startStop, data = cars2020)
) %>%
  kable(align = "cccrrrr", caption = "Group Rows") %>%
  add_header_above(
    c(
      " ",
      "Transmission" = 3,
      "Spark Ignited Direct Ignition" = 2,
      "start-stop technology" = 2
    )
  )
| | Transmission | | | Spark Ignited Direct Ignition | | start-stop technology | |
|---|---|---|---|---|---|---|---|
| | Automatic | CVT | Manual | N | Y | N | Y |
Acura | 12 | 0 | 6 | 1 | 17 | 9 | 9 |
Alfa Romeo | 6 | 0 | 1 | 0 | 7 | 1 | 6 |
Aston Martin | 4 | 0 | 0 | 4 | 0 | 4 | 0 |
Audi | 18 | 0 | 22 | 0 | 40 | 4 | 36 |
Bentley | 3 | 0 | 5 | 0 | 8 | 0 | 8 |
BMW | 91 | 0 | 20 | 0 | 111 | 0 | 111 |
Bugatti | 0 | 0 | 1 | 1 | 0 | 1 | 0 |
Buick | 12 | 2 | 0 | 2 | 12 | 5 | 9 |
Cadillac | 24 | 0 | 0 | 0 | 24 | 2 | 22 |
Chevrolet | 70 | 3 | 6 | 13 | 66 | 49 | 30 |
Chrysler | 5 | 1 | 0 | 6 | 0 | 3 | 3 |
Dodge | 20 | 0 | 5 | 25 | 0 | 23 | 2 |
Ferrari | 0 | 0 | 7 | 0 | 7 | 0 | 7 |
Fiat | 3 | 0 | 1 | 3 | 1 | 3 | 1 |
Ford | 58 | 6 | 9 | 22 | 51 | 23 | 50 |
Genesis | 14 | 0 | 1 | 0 | 15 | 13 | 2 |
GMC | 53 | 0 | 0 | 5 | 48 | 27 | 26 |
Honda | 11 | 25 | 9 | 15 | 30 | 38 | 7 |
Hyundai | 17 | 4 | 18 | 9 | 30 | 29 | 10 |
Infiniti | 10 | 4 | 0 | 0 | 14 | 14 | 0 |
Jaguar | 29 | 0 | 0 | 0 | 29 | 0 | 29 |
Jeep | 31 | 0 | 5 | 25 | 11 | 7 | 29 |
Karma | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
Kia | 20 | 5 | 12 | 7 | 30 | 26 | 11 |
Lamborghini | 1 | 0 | 6 | 2 | 5 | 4 | 3 |
Land Rover | 23 | 0 | 0 | 3 | 20 | 0 | 23 |
Lexus | 27 | 10 | 0 | 3 | 34 | 28 | 9 |
Lincoln | 22 | 1 | 0 | 3 | 20 | 10 | 13 |
Lotus | 2 | 0 | 2 | 4 | 0 | 4 | 0 |
Maserati | 10 | 0 | 0 | 0 | 10 | 0 | 10 |
Mazda | 20 | 0 | 3 | 0 | 23 | 23 | 0 |
Mercedes-Benz | 75 | 0 | 14 | 2 | 87 | 3 | 86 |
MINI | 8 | 0 | 9 | 0 | 17 | 7 | 10 |
Mitsubishi | 2 | 12 | 2 | 12 | 4 | 15 | 1 |
Nissan | 7 | 20 | 3 | 13 | 17 | 30 | 0 |
Porsche | 6 | 0 | 34 | 0 | 40 | 0 | 40 |
Ram | 14 | 0 | 0 | 14 | 0 | 8 | 6 |
Rolls-Royce | 8 | 0 | 0 | 0 | 8 | 8 | 0 |
Roush Performance | 3 | 0 | 1 | 2 | 2 | 4 | 0 |
Subaru | 1 | 14 | 9 | 1 | 23 | 19 | 5 |
Toyota | 36 | 21 | 7 | 20 | 44 | 44 | 20 |
Volkswagen | 14 | 0 | 6 | 0 | 20 | 4 | 16 |
Volvo | 21 | 0 | 0 | 0 | 21 | 0 | 21 |
#kable() output formats: latex, html, markdown, pandoc, and rst
##one continuous variable
favstats(~mpg, data=cars2020)[c("max", "mean","sd", "n")]
## max mean sd n
## 57.7824 23.55156 6.40129 1164
favstats(~mpg, data=cars2020)
## min Q1 median Q3 max mean sd n missing
## 10.5921 19.3234 22.6291 26.3573 57.7824 23.55156 6.40129 1164 0
favstats(~gears, data=cars2020)
## min Q1 median Q3 max mean sd n missing
## 1 6 8 8 10 7.303265 1.972718 1164 0
favstats(~displ, data=cars2020)
## min Q1 median Q3 max mean sd n missing
## 1 2 3 3.6 8 3.076546 1.29394 1164 0
favstats(~cylinders, data=cars2020)
## min Q1 median Q3 max mean sd n missing
## 3 4 6 6 16 5.569588 1.826848 1164 0
##one continuous one categorical
favstats(mpg~ car_make, data=cars2020)
## car_make min Q1 median Q3 max mean
## 1 Acura 21.0000 22.94900 23.25380 25.75972 28.0000 24.07169
## 2 Alfa Romeo 19.1410 22.36360 25.32660 26.24905 27.9406 24.23336
## 3 Aston Martin 17.0315 17.54953 19.06770 20.41378 20.4155 18.89560
## 4 Audi 15.2575 20.38198 22.65465 25.99520 30.1793 22.71765
## 5 Bentley 13.6613 14.66160 16.16640 18.79032 18.9656 16.49755
## 6 BMW 15.0983 18.81510 22.69820 25.15485 29.6330 22.32187
## 7 Bugatti 10.5921 10.59210 10.59210 10.59210 10.5921 10.59210
## 8 Buick 20.0162 22.45845 24.54265 26.71392 30.7314 24.78502
## 9 Cadillac 16.5637 20.35020 22.20880 24.23393 27.1593 22.17932
## 10 Chevrolet 14.9623 17.00000 19.61160 23.30990 32.9630 20.72874
## 11 Chrysler 19.0620 21.61542 22.24520 22.64615 29.5207 22.87640
## 12 Dodge 15.0483 16.68870 17.76080 20.77540 22.7798 18.36674
## 13 Ferrari 13.3412 14.85070 16.61150 16.90515 18.1715 15.94799
## 14 Fiat 24.9382 25.71632 27.63775 29.39140 29.6662 27.46997
## 15 Ford 14.0853 18.86980 22.85180 24.81560 42.0000 23.24922
## 16 Genesis 17.7991 19.19875 20.09290 20.70120 24.8653 20.26011
## 17 GMC 14.9623 16.94260 18.15210 21.53040 27.4108 19.33331
## 18 Honda 20.5738 25.95250 30.25000 33.23830 51.9827 30.68666
## 19 Hyundai 21.1618 24.89410 29.50960 32.19735 57.7824 31.29421
## 20 Infiniti 15.3391 21.70990 21.97385 22.41270 26.1476 21.62992
## 21 Jaguar 18.1891 20.86350 23.42660 25.66260 28.4037 23.14366
## 22 Jeep 13.3061 20.44058 22.19925 24.75897 26.9515 22.08787
## 23 Karma 25.8000 25.80000 25.80000 25.80000 25.8000 25.80000
## 24 Kia 19.9185 22.95260 27.05040 31.34930 50.4101 28.96986
## 25 Lamborghini 10.7137 12.43430 14.98990 14.99780 15.0057 13.65336
## 26 Land Rover 15.1147 18.09170 20.24570 22.40790 24.3843 20.11077
## 27 Lexus 13.8684 21.00000 22.82060 25.78180 44.0943 24.77066
## 28 Lincoln 17.5802 19.45870 20.98220 23.06780 41.0000 21.87267
## 29 Lotus 18.9178 19.42592 19.71800 19.88577 20.0210 19.59370
## 30 Maserati 15.3607 17.11340 17.75225 19.16457 19.3234 17.71110
## 31 Mazda 22.6002 26.74620 28.32850 29.67095 34.9351 28.36609
## 32 Mercedes-Benz 13.8444 19.69810 21.72890 23.77550 28.4696 21.69211
## 33 MINI 25.8460 27.37610 28.90900 29.79170 31.3799 28.61675
## 34 Mitsubishi 22.0000 25.31330 26.07675 29.26265 38.6074 28.41024
## 35 Nissan 15.0000 20.92748 24.69735 29.75000 35.0880 25.06172
## 36 Porsche 16.5714 19.93535 20.05000 20.98852 22.5030 20.27934
## 37 Ram 16.8732 17.69468 19.36585 22.39708 26.0473 20.31741
## 38 Rolls-Royce 13.8915 13.89150 14.24320 14.32477 14.3910 14.15570
## 39 Roush Performance 12.6756 12.67560 13.45995 14.50355 15.2813 13.71920
## 40 Subaru 18.6117 23.61950 26.33855 29.86630 35.1000 26.60871
## 41 Toyota 14.2639 22.44510 29.04465 34.45507 55.7000 30.08026
## 42 Volkswagen 18.5818 21.28947 24.96860 27.80837 34.0525 25.16221
## 43 Volvo 20.7216 24.41490 25.04910 26.80000 30.3000 25.61402
## sd n missing
## 1 2.1752314 18 0
## 2 3.3010997 7 0
## 3 1.7762265 4 0
## 4 3.8987561 40 0
## 5 2.2336328 8 0
## 6 3.8143732 111 0
## 7 NA 1 0
## 8 2.9760769 14 0
## 9 2.9424019 24 0
## 10 4.7512070 79 0
## 11 3.5118677 6 0
## 12 2.4610517 25 0
## 13 1.8588942 7 0
## 14 2.3674463 4 0
## 15 6.3652742 73 0
## 16 1.8665636 15 0
## 17 3.1824577 53 0
## 18 7.0546366 45 0
## 19 9.2785626 39 0
## 20 2.8580110 14 0
## 21 3.1570395 29 0
## 22 3.1179285 36 0
## 23 NA 1 0
## 24 8.5169456 37 0
## 25 1.9721285 7 0
## 26 2.6820479 23 0
## 27 6.4340354 37 0
## 28 4.6757040 23 0
## 29 0.4831969 4 0
## 30 1.5327897 10 0
## 31 2.8371422 23 0
## 32 3.3337919 89 0
## 33 1.8526389 17 0
## 34 5.2112248 16 0
## 35 5.7281523 30 0
## 36 1.3217165 40 0
## 37 2.9959636 14 0
## 38 0.2280787 8 0
## 39 1.2772481 4 0
## 40 3.8769619 24 0
## 41 10.6627305 64 0
## 42 4.9693528 20 0
## 43 2.4995829 21 0
favstats(mpg~ transmission, data=cars2020)
## transmission min Q1 median Q3 max mean sd n
## 1 Automatic 12.6756 18.74643 21.83710 24.51057 35.0000 21.79283 3.955858 812
## 2 CVT 20.5416 28.21187 31.70975 35.66655 55.7000 33.04857 7.548859 128
## 3 Manual 10.5921 19.26342 23.07110 28.29508 57.7824 24.50010 7.771829 224
## missing
## 1 0
## 2 0
## 3 0
favstats(mpg~ drive, data=cars2020)
## drive min Q1 median Q3 max mean sd n
## 1 4WD 12.6756 17.16615 20.03900 22.56383 29.1658 20.21910 3.668326 164
## 2 AWD 10.5921 19.90560 22.39135 24.93400 40.1750 22.36506 4.331883 378
## 3 FWD 14.9383 24.82985 28.40920 32.44302 57.7824 29.81908 7.405999 298
## 4 PT 4WD 13.3849 17.63443 18.83200 21.48560 50.0000 20.77963 7.196986 30
## 5 RWD 12.6756 17.76080 20.22140 23.27003 29.7239 20.86606 3.779483 294
## missing
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
favstats(mpg~ class, data=cars2020)
## class min Q1 median Q3 max mean
## 1 Compact 13.8915 23.19480 27.01130 30.91110 52.0000 27.49157
## 2 Large 13.8915 19.28750 21.00000 25.06490 57.7824 23.37516
## 3 Mid St Wagon 14.3027 19.68190 22.90020 23.84745 26.0000 21.30856
## 4 Midsize 14.1837 22.58530 26.14910 30.37180 55.7000 27.71872
## 5 Minicompact 13.3849 19.43788 20.00000 21.08908 31.3799 20.92383
## 6 Minivan 19.9421 20.43483 21.79360 22.29650 29.5207 22.30169
## 7 Passenger Van 15.8294 16.02777 16.22615 16.42453 16.6229 16.22615
## 8 Sm Pickup Truck 16.6991 19.41600 20.64570 21.85340 23.3948 20.57527
## 9 Sm St Wagon 23.4414 26.55780 28.81495 30.94125 50.4101 30.88478
## 10 Sm SUV 16.1587 21.99030 23.80190 25.94230 40.5365 24.16257
## 11 SPV 14.9623 16.56610 21.83010 22.79840 25.6715 20.45617
## 12 Std Pickup Truck 12.6756 17.00000 18.63500 20.30245 26.5885 19.04348
## 13 Std SUV 13.3061 16.68870 17.74250 20.93030 35.0205 18.94294
## 14 Subcompact 14.0853 19.70100 22.69175 24.98710 32.9630 22.74225
## 15 Two Seater 10.5921 16.72445 20.12650 23.74470 29.7239 20.30205
## sd n missing
## 1 5.9860021 129 0
## 2 7.9158575 103 0
## 3 3.9909745 11 0
## 4 8.3817478 171 0
## 5 3.9945048 32 0
## 6 3.0812029 8 0
## 7 0.5610892 2 0
## 8 1.6946128 21 0
## 9 7.1356223 30 0
## 10 3.7389130 237 0
## 11 3.9407017 24 0
## 12 2.9037640 87 0
## 13 3.7249644 139 0
## 14 4.1334848 118 0
## 15 5.1427132 52 0
favstats(mpg~ aspiration, data=cars2020)
## aspiration min Q1 median Q3 max mean sd n
## 1 Natural 10.7137 18.66238 22.6291 29.25325 57.7824 24.85538 8.365734 494
## 2 Super 12.6756 15.57740 18.1891 20.02100 22.6919 17.88819 2.823748 41
## 3 Super+Turbo 20.7216 23.05193 24.2162 24.74740 25.0491 23.63190 1.648968 6
## 4 Turbo 10.5921 20.04260 22.9512 25.69235 36.0339 22.88966 4.087568 623
## missing
## 1 0
## 2 0
## 3 0
## 4 0
favstats(mpg~ fuelType1, data=cars2020)
## fuelType1 min Q1 median Q3 max mean sd
## 1 Diesel 19.4160 23.21720 24.0249 24.46850 26.5885 23.82984 1.628788
## 2 Midgrade Gasoline 16.6887 16.91152 17.3101 19.02563 19.0620 17.81245 1.057577
## 3 Premium Gasoline 10.5921 18.48220 21.6894 24.70460 32.0168 21.58884 4.146422
## 4 Regular Gasoline 14.2639 20.54740 24.2002 29.63450 57.7824 25.82837 7.705009
## n missing
## 1 20 0
## 2 12 0
## 3 593 0
## 4 539 0
favstats(mpg~ atvType, data=cars2020)
## atvType min Q1 median Q3 max mean sd
## 1 Diesel 19.4160 23.21720 24.02490 24.46850 26.5885 23.82984 1.628788
## 2 FFV 15.7301 16.36830 17.56585 18.78630 25.6715 18.31084 2.724908
## 3 Hybrid 15.2575 21.50252 27.79440 42.89655 57.7824 32.35495 12.246403
## 4 None 10.5921 19.21265 22.45000 25.96990 38.6074 22.79603 4.936485
## 5 Plug-in Hybrid 18.7319 22.50300 26.57015 30.30000 54.4329 28.92968 9.372533
## n missing
## 1 20 0
## 2 24 0
## 3 78 0
## 4 1006 0
## 5 36 0
##one continuous two categorical
favstats(mpg~ car_make+transmission, data=cars2020)
## car_make.transmission min Q1 median Q3 max
## 1 Acura.Automatic 21.3495 22.91180 23.02550 23.39275 24.0000
## 2 Alfa Romeo.Automatic 19.1410 21.33900 24.86970 25.48912 26.9548
## 3 Aston Martin.Automatic 17.0315 17.54953 19.06770 20.41378 20.4155
## 4 Audi.Automatic 15.2575 18.02725 21.05285 22.41137 24.2823
## 5 Bentley.Automatic 13.6613 15.49950 17.33770 18.03480 18.7319
## 6 BMW.Automatic 15.0983 19.59150 24.38790 25.68145 29.6330
## 7 Bugatti.Automatic NA NA NA NA NA
## 8 Buick.Automatic 20.0162 21.96215 24.38220 25.88982 27.5574
## 9 Cadillac.Automatic 16.5637 20.35020 22.20880 24.23393 27.1593
## 10 Chevrolet.Automatic 14.9623 16.96363 19.32325 22.21872 28.8278
## 11 Chrysler.Automatic 19.0620 21.40550 22.24520 22.24520 22.7798
## 12 Dodge.Automatic 15.0483 16.86540 18.41140 21.20870 22.7798
## 13 Ferrari.Automatic NA NA NA NA NA
## 14 Fiat.Automatic 24.9382 25.45695 25.97570 27.63775 29.2998
## 15 Ford.Automatic 15.8294 19.00000 22.26840 24.33380 30.2418
## 16 Genesis.Automatic 17.7991 19.00538 20.04385 20.23080 24.8653
## 17 GMC.Automatic 14.9623 16.94260 18.15210 21.53040 27.4108
## 18 Honda.Automatic 20.5738 21.50430 22.05810 22.57630 27.0324
## 19 Hyundai.Automatic 21.1618 23.25740 24.63670 29.74900 31.7911
## 20 Infiniti.Automatic 15.3391 21.19285 21.90640 22.02340 23.0508
## 21 Jaguar.Automatic 18.1891 20.86350 23.42660 25.66260 28.4037
## 22 Jeep.Automatic 13.3061 20.73180 22.20860 24.45115 26.9515
## 23 Karma.Automatic 25.8000 25.80000 25.80000 25.80000 25.8000
## 24 Kia.Automatic 19.9185 20.88543 23.00380 24.27717 28.7747
## 25 Lamborghini.Automatic 13.9329 13.93290 13.93290 13.93290 13.9329
## 26 Land Rover.Automatic 15.1147 18.09170 20.24570 22.40790 24.3843
## 27 Lexus.Automatic 13.8684 21.00000 21.82850 23.28730 25.5162
## 28 Lincoln.Automatic 17.5802 19.35515 20.94720 22.95000 24.6965
## 29 Lotus.Automatic 19.5953 19.65665 19.71800 19.77935 19.8407
## 30 Maserati.Automatic 15.3607 17.11340 17.75225 19.16457 19.3234
## 31 Mazda.Automatic 22.6002 26.47297 28.11020 29.64447 34.9351
## 32 Mercedes-Benz.Automatic 13.8444 20.05020 21.71460 23.23305 28.3488
## 33 MINI.Automatic 25.8460 26.12343 26.88320 28.38115 29.4601
## 34 Mitsubishi.Automatic 22.0000 22.75000 23.50000 24.25000 25.0000
## 35 Nissan.Automatic 15.0000 16.53575 17.79710 19.46065 21.9494
## 36 Porsche.Automatic 16.5714 17.51053 19.43030 20.12097 20.4628
## 37 Ram.Automatic 16.8732 17.69468 19.36585 22.39708 26.0473
## 38 Rolls-Royce.Automatic 13.8915 13.89150 14.24320 14.32477 14.3910
## 39 Roush Performance.Automatic 12.6756 12.67560 12.67560 13.45995 14.2443
## 40 Subaru.Automatic 27.4825 27.48250 27.48250 27.48250 27.4825
## 41 Toyota.Automatic 14.2639 20.00207 25.02840 28.21743 35.0000
## 42 Volkswagen.Automatic 18.5818 19.43052 22.56705 24.99790 34.0525
## 43 Volvo.Automatic 20.7216 24.41490 25.04910 26.80000 30.3000
## 44 Acura.CVT NA NA NA NA NA
## 45 Alfa Romeo.CVT NA NA NA NA NA
## 46 Aston Martin.CVT NA NA NA NA NA
## 47 Audi.CVT NA NA NA NA NA
## 48 Bentley.CVT NA NA NA NA NA
## 49 BMW.CVT NA NA NA NA NA
## 50 Bugatti.CVT NA NA NA NA NA
## 51 Buick.CVT 27.9466 28.64280 29.33900 30.03520 30.7314
## 52 Cadillac.CVT NA NA NA NA NA
## 53 Chevrolet.CVT 31.5018 32.23240 32.96300 32.96300 32.9630
## 54 Chrysler.CVT 29.5207 29.52070 29.52070 29.52070 29.5207
## 55 Dodge.CVT NA NA NA NA NA
## 56 Ferrari.CVT NA NA NA NA NA
## 57 Fiat.CVT NA NA NA NA NA
## 58 Ford.CVT 39.7753 40.65237 41.25000 41.65000 42.0000
## 59 Genesis.CVT NA NA NA NA NA
## 60 GMC.CVT NA NA NA NA NA
## 61 Honda.CVT 28.2825 31.34820 33.02740 35.60820 51.9827
## 62 Hyundai.CVT 31.5397 33.38493 34.61895 35.48525 36.2273
## 63 Infiniti.CVT 21.6894 22.14885 23.69655 25.35523 26.1476
## 64 Jaguar.CVT NA NA NA NA NA
## 65 Jeep.CVT NA NA NA NA NA
## 66 Karma.CVT NA NA NA NA NA
## 67 Kia.CVT 29.6510 31.34930 33.33950 34.70780 35.8416
## 68 Lamborghini.CVT NA NA NA NA NA
## 69 Land Rover.CVT NA NA NA NA NA
## 70 Lexus.CVT 25.7818 29.15452 30.37870 37.38065 44.0943
## 71 Lincoln.CVT 41.0000 41.00000 41.00000 41.00000 41.0000
## 72 Lotus.CVT NA NA NA NA NA
## 73 Maserati.CVT NA NA NA NA NA
## 74 Mazda.CVT NA NA NA NA NA
## 75 Mercedes-Benz.CVT NA NA NA NA NA
## 76 MINI.CVT NA NA NA NA NA
## 77 Mitsubishi.CVT 25.0000 25.53288 26.07675 27.08755 38.6074
## 78 Nissan.CVT 21.4539 23.97770 28.28915 31.16633 35.0880
## 79 Porsche.CVT NA NA NA NA NA
## 80 Ram.CVT NA NA NA NA NA
## 81 Rolls-Royce.CVT NA NA NA NA NA
## 82 Roush Performance.CVT NA NA NA NA NA
## 83 Subaru.CVT 20.5416 26.08980 29.29480 30.39447 35.1000
## 84 Toyota.CVT 28.8126 34.27930 40.17500 50.00000 55.7000
## 85 Volkswagen.CVT NA NA NA NA NA
## 86 Volvo.CVT NA NA NA NA NA
## 87 Acura.Manual 21.0000 26.47780 26.92955 27.69037 28.0000
## 88 Alfa Romeo.Manual 27.9406 27.94060 27.94060 27.94060 27.9406
## 89 Aston Martin.Manual NA NA NA NA NA
## 90 Audi.Manual 15.9153 24.24600 25.51915 26.74235 30.1793
## 91 Bentley.Manual 14.6616 14.66160 14.99510 18.96560 18.9656
## 92 BMW.Manual 18.5025 18.81510 19.99745 21.65480 25.3226
## 93 Bugatti.Manual 10.5921 10.59210 10.59210 10.59210 10.5921
## 94 Buick.Manual NA NA NA NA NA
## 95 Cadillac.Manual NA NA NA NA NA
## 96 Chevrolet.Manual 15.7384 18.89895 21.53390 29.83953 32.5457
## 97 Chrysler.Manual NA NA NA NA NA
## 98 Dodge.Manual 15.5775 15.57750 16.91490 16.91490 17.9612
## 99 Ferrari.Manual 13.3412 14.85070 16.61150 16.90515 18.1715
## 100 Fiat.Manual 29.6662 29.66620 29.66620 29.66620 29.6662
## 101 Ford.Manual 14.0853 16.19760 18.34570 22.85180 24.3461
## 102 Genesis.Manual 21.6779 21.67790 21.67790 21.67790 21.6779
## 103 GMC.Manual NA NA NA NA NA
## 104 Honda.Manual 24.5785 29.07740 29.71430 30.00000 32.0168
## 105 Hyundai.Manual 24.7112 28.41645 30.05905 44.42865 57.7824
## 106 Infiniti.Manual NA NA NA NA NA
## 107 Jaguar.Manual NA NA NA NA NA
## 108 Jeep.Manual 18.7070 19.33980 19.90110 25.03660 26.1675
## 109 Karma.Manual NA NA NA NA NA
## 110 Kia.Manual 27.0504 29.92597 36.05195 43.98442 50.4101
## 111 Lamborghini.Manual 10.7137 11.94925 14.98990 15.00175 15.0057
## 112 Land Rover.Manual NA NA NA NA NA
## 113 Lexus.Manual NA NA NA NA NA
## 114 Lincoln.Manual NA NA NA NA NA
## 115 Lotus.Manual 18.9178 19.19360 19.46940 19.74520 20.0210
## 116 Maserati.Manual NA NA NA NA NA
## 117 Mazda.Manual 29.0373 29.11010 29.18290 31.63065 34.0784
## 118 Mercedes-Benz.Manual 16.7621 17.61758 25.27310 26.44533 28.4696
## 119 MINI.Manual 27.5119 29.03130 29.79170 31.37990 31.3799
## 120 Mitsubishi.Manual 35.0000 35.34950 35.69900 36.04850 36.3980
## 121 Nissan.Manual 18.3595 19.15845 19.95740 24.97870 30.0000
## 122 Porsche.Manual 18.2114 20.00000 20.13870 21.00000 22.5030
## 123 Ram.Manual NA NA NA NA NA
## 124 Rolls-Royce.Manual NA NA NA NA NA
## 125 Roush Performance.Manual 15.2813 15.28130 15.28130 15.28130 15.2813
## 126 Subaru.Manual 18.6117 23.14220 24.93740 26.22350 26.4104
## 127 Toyota.Manual 18.3106 21.10525 31.46640 32.46950 34.2683
## 128 Volkswagen.Manual 27.0113 27.21503 27.89105 30.19742 34.0383
## 129 Volvo.Manual NA NA NA NA NA
## mean sd n missing
## 1 23.01335 0.7240967 12 0
## 2 23.61548 3.1416333 6 0
## 3 18.89560 1.7762265 4 0
## 4 20.16287 2.6788769 18 0
## 5 16.57697 2.6195004 3 0
## 6 22.73364 4.0014768 91 0
## 7 NaN NA 0 0
## 8 24.02603 2.3907689 12 0
## 9 22.17932 2.9424019 24 0
## 10 19.97375 3.8098039 70 0
## 11 21.54754 1.4739410 5 0
## 12 18.81113 2.5285027 20 0
## 13 NaN NA 0 0
## 14 26.73790 2.2785083 3 0
## 15 22.04350 3.3541903 58 0
## 16 20.15884 1.8937776 14 0
## 17 19.33331 3.1824577 53 0
## 18 22.62464 2.0395735 11 0
## 19 25.85100 3.5502762 17 0
## 20 20.75888 2.7029904 10 0
## 21 23.14366 3.1570395 29 0
## 22 22.12939 3.1149947 31 0
## 23 25.80000 NA 1 0
## 24 23.00007 2.4033674 20 0
## 25 13.93290 NA 1 0
## 26 20.11077 2.6820479 23 0
## 27 21.69416 2.6580654 27 0
## 28 21.00324 2.1655734 22 0
## 29 19.71800 0.1735240 2 0
## 30 17.71110 1.5327897 10 0
## 31 28.00607 2.7217048 20 0
## 32 21.52698 2.9986884 75 0
## 33 27.30197 1.4057506 8 0
## 34 23.50000 2.1213203 2 0
## 35 18.10561 2.4793187 7 0
## 36 18.85277 1.6811553 6 0
## 37 20.31741 2.9959636 14 0
## 38 14.15570 0.2280787 8 0
## 39 13.19850 0.9056894 3 0
## 40 27.48250 NA 1 0
## 41 23.93826 5.9176532 36 0
## 42 23.46119 4.7631624 14 0
## 43 25.61402 2.4995829 21 0
## 44 NaN NA 0 0
## 45 NaN NA 0 0
## 46 NaN NA 0 0
## 47 NaN NA 0 0
## 48 NaN NA 0 0
## 49 NaN NA 0 0
## 50 NaN NA 0 0
## 51 29.33900 1.9691510 2 0
## 52 NaN NA 0 0
## 53 32.47593 0.8436242 3 0
## 54 29.52070 NA 1 0
## 55 NaN NA 0 0
## 56 NaN NA 0 0
## 57 NaN NA 0 0
## 58 41.08530 0.8259620 6 0
## 59 NaN NA 0 0
## 60 NaN NA 0 0
## 61 34.82176 6.2429633 25 0
## 62 34.25122 2.0243420 4 0
## 63 23.80752 2.1507034 4 0
## 64 NaN NA 0 0
## 65 NaN NA 0 0
## 66 NaN NA 0 0
## 67 32.97784 2.5022334 5 0
## 68 NaN NA 0 0
## 69 NaN NA 0 0
## 70 33.07721 6.3337566 10 0
## 71 41.00000 NA 1 0
## 72 NaN NA 0 0
## 73 NaN NA 0 0
## 74 NaN NA 0 0
## 75 NaN NA 0 0
## 76 NaN NA 0 0
## 77 28.01383 4.7243765 12 0
## 78 27.83977 4.1436982 20 0
## 79 NaN NA 0 0
## 80 NaN NA 0 0
## 81 NaN NA 0 0
## 82 NaN NA 0 0
## 83 28.10805 4.0325070 14 0
## 84 41.53163 8.6027520 21 0
## 85 NaN NA 0 0
## 86 NaN NA 0 0
## 87 26.18838 2.6209508 6 0
## 88 27.94060 NA 1 0
## 89 NaN NA 0 0
## 90 24.80793 3.4996936 22 0
## 91 16.44990 2.3005418 5 0
## 92 20.44831 1.9700537 20 0
## 93 10.59210 NA 1 0
## 94 NaN NA 0 0
## 95 NaN NA 0 0
## 96 23.66343 7.1031392 6 0
## 97 NaN NA 0 0
## 98 16.58920 1.0175484 5 0
## 99 15.94799 1.8588942 7 0
## 100 29.66620 NA 1 0
## 101 19.12871 3.9785122 9 0
## 102 21.67790 NA 1 0
## 103 NaN NA 0 0
## 104 29.05384 2.4074971 9 0
## 105 35.77791 11.2641938 18 0
## 106 NaN NA 0 0
## 107 NaN NA 0 0
## 108 21.83040 3.4918143 5 0
## 109 NaN NA 0 0
## 110 37.24952 8.8465150 12 0
## 111 13.60677 2.1561343 6 0
## 112 NaN NA 0 0
## 113 NaN NA 0 0
## 114 NaN NA 0 0
## 115 19.46940 0.7800802 2 0
## 116 NaN NA 0 0
## 117 30.76620 2.8693730 3 0
## 118 22.57678 4.8008565 14 0
## 119 29.78544 1.3675572 9 0
## 120 35.69900 0.9885353 2 0
## 121 22.77230 6.3101550 3 0
## 122 20.53109 1.0959089 34 0
## 123 NaN NA 0 0
## 124 NaN NA 0 0
## 125 15.28130 NA 1 0
## 126 24.17932 2.4739036 9 0
## 127 27.31354 6.9695383 7 0
## 128 29.13127 2.8009049 6 0
## 129 NaN NA 0 0
#favstats(mpg~ car_make+aspiration, data=cars2020)
#favstats(mpg~ car_make+fuelType1, data=cars2020)
library(rsample)
# Fix the RNG state so that the random partition below is reproducible.
set.seed(1729)
# Split the data 80:20 (training:testing), stratifying on the outcome mpg so
# that its distribution is comparable in both partitions.
# (initial_time_split() would instead take the first `prop` share of the rows
# for training rather than sampling at random.)
split <- cars2020 %>% initial_split(prop = 0.8, strata = mpg)
train <- split %>% training()
test <- split %>% testing()
# Tag every row with the partition it came from and stack the two partitions
# so that both can be drawn on the same plot.
cars_recon <- bind_rows(
  train %>% mutate(Data = "Training"),
  test %>% mutate(Data = "Testing")
)
# MPG density per partition, distinguished by fill.
ggplot(cars_recon) +
  aes(x = mpg, fill = Data) +
  geom_density(alpha = 0.4) +
  labs(title = "Comparing MPG distributions in train and test data sets")
# Same comparison, distinguished by line colour instead of fill.
ggplot(cars_recon) +
  aes(x = mpg, colour = Data) +
  geom_density(alpha = 0.4) +
  labs(title = "Comparing MPG distributions in train and test data sets")
library(dlookr)
# 1. provides descriptive statistics for numerical data
describe(cars2020)
## # A tibble: 6 x 26
## variable n na mean sd se_mean IQR skewness kurtosis p00 p01
## <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mpg 1164 0 23.6 6.40 0.188 7.03 1.64 4.78 10.6 13.9
## 2 gears 1164 0 7.30 1.97 0.0578 2 -1.62 3.39 1 1
## 3 displ 1164 0 3.08 1.29 0.0379 1.6 0.970 0.260 1 1.36
## 4 cylinde~ 1164 0 5.57 1.83 0.0535 2 1.14 1.79 3 3
## 5 lv2 1164 0 1.59 3.94 0.115 0 2.34 4.22 0 0
## 6 lv4 1164 0 5.25 8.25 0.242 13 1.52 2.31 0 0
## # ... with 15 more variables: p05 <dbl>, p10 <dbl>, p20 <dbl>, p25 <dbl>,
## # p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>, p70 <dbl>, p75 <dbl>,
## # p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>, p100 <dbl>
# 2. eda_report()
# eda_report() performs EDA on all variables of the data frame or object
#(tbl_df,tbl, etc.) that inherits the data frame.
# eda_report() creates an EDA report in two forms: pdf file based on Latex
# and html file
##eda_report(cars2020, target = mpg , output_file = "EDACars2020.pdf") #pdf
##eda_report(cars2020, target=mpg, output_format="html", output_file="EDACars2020.html")
# Declare mpg as the target variable for dlookr's relate().
num <- cars2020 %>% target_by(mpg)
# Relate the target (fuel efficiency, mpg) to the predictor engine size
# (displ); the printed result is a simple regression of target ~ predictor.
num_num <- num %>% relate(displ)
num_num
##
## Call:
## lm(formula = formula_str, data = data)
##
## Coefficients:
## (Intercept) displ
## 34.052 -3.413
summary(num_num)
##
## Call:
## lm(formula = formula_str, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9928 -2.7259 -0.9033 1.4828 29.1912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.0522 0.3506 97.12 <2e-16 ***
## displ -3.4131 0.1051 -32.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.636 on 1162 degrees of freedom
## Multiple R-squared: 0.476, Adjusted R-squared: 0.4755
## F-statistic: 1056 on 1 and 1162 DF, p-value: < 2.2e-16
# visualize the relationship between the target variable and the predictor
plot(num_num)
#The relationship between 'mpg' and 'displ' is represented as a scatter plot.
#The plot on the left represents the scatter plot of 'mpg' and 'displ' and the
#confidence interval of the regression line and the regression line.
#The plot on the right represents the relationship between the original data and
#the predicted value of the linear model as a scatter plot. If there is a linear
#relationship between the two variables, the observations will converge on the
#red diagonal in the scatter plot.
#A scatter plot of data with a large number of observations is drawn as
#overlapping points. This makes it difficult to judge the relationship between the two variables. It also takes a long time to render.
#In this case, the problem can be solved with a hexbin plot.
#In plot(), the hex_thres argument sets the threshold for drawing hexbin plots: for data with more than this number of observations, a hexbin plot is drawn.
# Training-set scatter plot of fuel efficiency against engine displacement,
# with a smoothed trend (geom_smooth() defaults) drawn over the points.
ggplot(train, aes(displ, mpg)) +
  geom_point(alpha = 0.25) +
  geom_smooth() +
  labs(
    x = "Engine displace (L)",
    y = "Miles per gallon",
    title = "Fuel-efficiency vs Engine Size (displacement)"
  )
# Does the distribution of fuel efficiency differ between transmission types?
# For this target ~ predictor pair relate() reports a one-way ANOVA.
# The predictor is converted to a factor first.
cars2020[["transmission"]] <- factor(cars2020[["transmission"]])
num1 <- cars2020 %>% target_by(mpg)
num_cat <- num1 %>% relate(transmission)
num_cat
## Analysis of Variance Table
##
## Response: mpg
## Df Sum Sq Mean Sq F value Pr(>F)
## transmission 2 14258 7128.9 247.82 < 2.2e-16 ***
## Residuals 1161 33398 28.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(num_cat)
##
## Call:
## lm(formula = formula(formula_str), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.908 -3.788 -0.086 2.905 33.282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.7928 0.1882 115.784 < 2e-16 ***
## transmissionCVT 11.2557 0.5101 22.067 < 2e-16 ***
## transmissionManual 2.7073 0.4048 6.688 3.5e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.363 on 1161 degrees of freedom
## Multiple R-squared: 0.2992, Adjusted R-squared: 0.298
## F-statistic: 247.8 on 2 and 1161 DF, p-value: < 2.2e-16
plot(num_cat)
#object of type 'closure' is not subsettable error happens when you’re trying to
#treat a function like a list, vector, or data frame.
#To fix it, start treating the function like a function.
# Box plot of training-set fuel efficiency for each transmission type.
ggplot(train, aes(transmission, mpg)) +
  geom_boxplot() +
  labs(
    x = "Transmission Type",
    y = "Miles per gallon",
    title = "Transmission type and fuel efficiency of 2020 cars"
  )
# Switch roles: transmission is now the target variable.
categ <- cars2020 %>% target_by(transmission)
# Relating it to mpg yields descriptive statistics of mpg for each level of
# the target variable.
cat_num <- categ %>% relate(mpg)
cat_num
## # A tibble: 4 x 27
## variable transmission n na mean sd se_mean IQR skewness kurtosis
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 mpg Automatic 812 0 21.8 3.96 0.139 5.76 0.189 -0.262
## 2 mpg CVT 128 0 33.0 7.55 0.667 7.45 0.981 0.779
## 3 mpg Manual 224 0 24.5 7.77 0.519 9.03 1.50 3.61
## 4 mpg total 1164 0 23.6 6.40 0.188 7.03 1.64 4.78
## # ... with 17 more variables: p00 <dbl>, p01 <dbl>, p05 <dbl>, p10 <dbl>,
## # p20 <dbl>, p25 <dbl>, p30 <dbl>, p40 <dbl>, p50 <dbl>, p60 <dbl>,
## # p70 <dbl>, p75 <dbl>, p80 <dbl>, p90 <dbl>, p95 <dbl>, p99 <dbl>,
## # p100 <dbl>
summary(cat_num)
## variable transmission n na mean
## Length:4 Automatic:1 Min. : 128 Min. :0 Min. :21.79
## Class :character CVT :1 1st Qu.: 200 1st Qu.:0 1st Qu.:23.11
## Mode :character Manual :1 Median : 518 Median :0 Median :24.03
## total :1 Mean : 582 Mean :0 Mean :25.72
## 3rd Qu.: 900 3rd Qu.:0 3rd Qu.:26.64
## Max. :1164 Max. :0 Max. :33.05
## sd se_mean IQR skewness
## Min. :3.956 Min. :0.1388 Min. :5.764 Min. :0.1887
## 1st Qu.:5.790 1st Qu.:0.1754 1st Qu.:6.716 1st Qu.:0.7830
## Median :6.975 Median :0.3535 Median :7.244 Median :1.2404
## Mean :6.419 Mean :0.3782 Mean :7.321 Mean :1.0785
## 3rd Qu.:7.605 3rd Qu.:0.5563 3rd Qu.:7.849 3rd Qu.:1.5359
## Max. :7.772 Max. :0.6672 Max. :9.032 Max. :1.6444
## kurtosis p00 p01 p05
## Min. :-0.2616 Min. :10.59 Min. :11.49 Min. :15.00
## 1st Qu.: 0.5186 1st Qu.:10.59 1st Qu.:13.29 1st Qu.:15.27
## Median : 2.1928 Median :11.63 Median :13.89 Median :15.44
## Mean : 2.2253 Mean :13.60 Mean :15.20 Mean :17.15
## 3rd Qu.: 3.8995 3rd Qu.:14.64 3rd Qu.:15.80 3rd Qu.:17.33
## Max. : 4.7772 Max. :20.54 Max. :21.52 Max. :22.73
## p10 p20 p25 p30
## Min. :16.66 Min. :18.00 Min. :18.75 Min. :19.62
## 1st Qu.:16.76 1st Qu.:18.44 1st Qu.:19.13 1st Qu.:19.90
## Median :16.87 Median :18.63 Median :19.29 Median :20.02
## Mean :18.89 Mean :20.56 Mean :21.39 Mean :22.12
## 3rd Qu.:18.99 3rd Qu.:20.76 3rd Qu.:21.55 3rd Qu.:22.25
## Max. :25.15 Max. :27.00 Max. :28.21 Max. :28.83
## p40 p50 p60 p70
## Min. :20.77 Min. :21.84 Min. :22.84 Min. :23.96
## 1st Qu.:20.94 1st Qu.:22.43 1st Qu.:23.72 1st Qu.:25.09
## Median :21.20 Median :22.85 Median :24.83 Median :26.28
## Mean :23.36 Mean :24.81 Mean :26.41 Mean :27.87
## 3rd Qu.:23.62 3rd Qu.:25.23 3rd Qu.:27.52 3rd Qu.:29.06
## Max. :30.25 Max. :31.71 Max. :33.12 Max. :34.95
## p75 p80 p90 p95
## Min. :24.51 Min. :25.04 Min. :27.00 Min. :28.23
## 1st Qu.:25.90 1st Qu.:26.77 1st Qu.:29.57 1st Qu.:32.75
## Median :27.33 Median :28.40 Median :31.07 Median :35.30
## Mean :28.71 Mean :30.07 Mean :33.12 Mean :37.03
## 3rd Qu.:30.14 3rd Qu.:31.70 3rd Qu.:34.62 3rd Qu.:39.58
## Max. :35.67 Max. :38.43 Max. :43.35 Max. :49.29
## p99 p100
## Min. :30.68 Min. :35.00
## 1st Qu.:44.56 1st Qu.:50.52
## Median :50.70 Median :56.74
## Mean :46.48 Mean :51.57
## 3rd Qu.:52.62 3rd Qu.:57.78
## Max. :53.84 Max. :57.78
plot(cat_num)
# Overlaid training-set MPG densities, one per transmission type.
ggplot(train) +
  aes(x = mpg, fill = transmission) +
  geom_density(alpha = 0.4) +
  labs(title = "Comparing Transmission type distributions and MPG")
# Convert the drivetrain to a factor, then relate it to the transmission
# target: the result is the contingency table of the two variables, on which
# summary() reports an independence test.
cars2020[["drive"]] <- factor(cars2020[["drive"]])
categ1 <- cars2020 %>% target_by(transmission)
cat_cat <- categ1 %>% relate(drive)
cat_cat
## drive
## transmission 4WD AWD FWD PT 4WD RWD
## Automatic 141 282 141 25 223
## CVT 9 31 84 2 2
## Manual 14 65 73 3 69
summary(cat_cat)
## Call: xtabs(formula = formula_str, data = data, addNA = TRUE)
## Number of cases in table: 1164
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 166.52, df = 8, p-value = 6.904e-32
## Chi-squared approximation may be incorrect
plot(cat_cat)
#visualizes the relationship between the target variable and the predictor by mosaic plot
library(ggmosaic)
# Mosaic plot: transmission type within each drivetrain.
ggplot(train) +
  geom_mosaic(aes(x = product(transmission, drive), fill = transmission)) +
  labs(
    title = "Transmission type by drivetrain - 1 ~ W(fill=Y) + Y + X",
    subtitle = "f(Transmission type , drivetrain)",
    x = "Drivetrain",
    y = "Transmission Type"
  )
# Mosaic plot: drivetrain alone, filled by drivetrain.
ggplot(train) +
  geom_mosaic(aes(x = product(drive), fill = drive)) +
  labs(
    title = "drivetrain - 1 ~ W(fill=X) + X",
    subtitle = "f(drivetrain)",
    x = "Drivetrain"
  )
# Mosaic plot: drivetrain on the axis, filled by transmission type.
ggplot(train) +
  geom_mosaic(aes(x = product(drive), fill = transmission)) +
  labs(
    title = "drivetrain - 1 ~ W(fill=Y) + X",
    subtitle = "f(drivetrain)",
    x = "Drivetrain"
  )
# Mosaic plot: transmission by drivetrain, conditioned on sidi, with one
# facet row per sidi value and flipped coordinates.
ggplot(train) +
  geom_mosaic(
    aes(
      x = product(transmission, drive),
      fill = transmission,
      conds = product(sidi)
    )
  ) +
  labs(
    title = "Transmission type by drivetrain - 1 ~ W(fill=Y) + Y|Z + X",
    subtitle = "f(Transmission type|sidi , drivetrain)",
    x = "y",
    y = "drivetrain"
  ) +
  facet_grid(sidi ~ .) +
  coord_flip()
# Fuel efficiency (mpg) against engine size (displ) on the training set, drawn
# in a separate panel per transmission type so the relationships can be
# compared.
ggplot(train, aes(displ, mpg)) +
  geom_point(alpha = 0.25) +
  geom_smooth() +
  facet_wrap(~transmission) +
  labs(
    x = "Engine displace (L)",
    y = "Miles per gallon",
    title = "Fuel-efficiency vs Engine Size (displacement)"
  )
library(car)
library(MASS) #So that distributions that must be non-zero can make sense of my data
# Q-Q plots of mpg + 1 against three reference distributions that need no
# separately estimated parameters: normal, log-normal ("lnorm") and exponential.
qqp(cars2020$mpg + 1,
  distribution = "norm",
  main = "Q-Q Plot ~ mpg+1 Normal model"
)
## [1] 933 621
qqp(cars2020$mpg + 1,
  distribution = "lnorm",
  main = "Q-Q Plot ~ mpg+1 LogNormal model"
)
## [1] 933 621
qqp(cars2020$mpg + 1,
  distribution = "exp",
  main = "Q-Q Plot ~ mpg+1 Exponential model"
)
## [1] 933 621
# qqp() needs parameter estimates for the negative binomial, Poisson and gamma
# distributions; fitdistr() can generate them.
# The negative binomial and gamma distributions can only handle positive
# numbers, and the Poisson distribution only positive whole numbers.
# The binomial and Poisson distributions differ from the others in being
# discrete rather than continuous: they quantify distinct, countable events or
# the probability of these events.
pois <- fitdistr(cars2020$mpg + 1, densfun = "Poisson")
qqp(cars2020$mpg + 1,
  distribution = "pois",
  lambda = pois$estimate,
  main = "Q-Q Plot ~ mpg+1 Poisson model"
)
## [1] 933 621
gamma <- fitdistr(
  cars2020$mpg + 1,
  densfun = "gamma",
  start = list(shape = 1, rate = 0.1),
  lower = 0.4
)
qqp(cars2020$mpg + 1,
  distribution = "gamma",
  shape = gamma$estimate[["shape"]],
  rate = gamma$estimate[["rate"]],
  main = "Q-Q Plot ~ mpg+1 Gamma model"
)
## [1] 933 621
weibull <- fitdistr(cars2020$mpg + 1, densfun = "weibull")
qqp(cars2020$mpg + 1,
  distribution = "weibull",
  shape = weibull$estimate[["shape"]],
  scale = weibull$estimate[["scale"]],
  main = "Q-Q Plot ~ mpg+1 Weibull model"
)
## [1] 933 621
# Drop the two identifier columns so that the `mpg ~ .` formula below does not
# pick them up as predictors.
id_cols <- c("car_make", "model")
train1 <- train[, setdiff(names(train), id_cols)]
# Ordinary least squares fit of mpg on every remaining column.
ols_model <- lm(formula = mpg ~ ., data = train1)
ols_model
##
## Call:
## lm(formula = mpg ~ ., data = train1)
##
## Coefficients:
## (Intercept) transmissionCVT
## 39.33753 3.20564
## transmissionManual gears
## 0.04928 -0.28849
## driveAWD driveFWD
## 0.18354 3.71183
## drivePT 4WD driveRWD
## 1.04471 1.15595
## displ cylinders
## -1.80544 -0.46562
## classLarge classMid St Wagon
## -0.25964 -0.01333
## classMidsize classMinicompact
## 0.44561 -1.50638
## classMinivan classPassenger Van
## -6.40468 -6.76966
## classSm Pickup Truck classSm St Wagon
## -6.12654 0.09396
## classSm SUV classSPV
## -3.29938 -6.18484
## classStd Pickup Truck classStd SUV
## -4.06379 -3.60436
## classSubcompact classTwo Seater
## -0.38888 -2.03608
## lv2 lv4
## -0.07743 -0.05236
## sidiY aspirationSuper
## 0.72650 -1.79538
## aspirationSuper+Turbo aspirationTurbo
## -2.27298 -2.97902
## fuelType1Midgrade Gasoline fuelType1Premium Gasoline
## -1.14707 -2.74374
## fuelType1Regular Gasoline atvTypeFFV
## -2.11444 -3.62581
## atvTypeHybrid atvTypeNone
## 2.60985 -3.01049
## atvTypePlug-in Hybrid startStopY
## NA 1.76579
summary(ols_model)
##
## Call:
## lm(formula = mpg ~ ., data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4451 -1.5039 0.0256 1.4984 18.4379
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.33753 1.05957 37.126 < 2e-16 ***
## transmissionCVT 3.20564 0.39999 8.014 3.44e-15 ***
## transmissionManual 0.04928 0.27904 0.177 0.859867
## gears -0.28849 0.06629 -4.352 1.50e-05 ***
## driveAWD 0.18354 0.31012 0.592 0.554118
## driveFWD 3.71183 0.36504 10.168 < 2e-16 ***
## drivePT 4WD 1.04471 0.62608 1.669 0.095536 .
## driveRWD 1.15595 0.32103 3.601 0.000335 ***
## displ -1.80544 0.26539 -6.803 1.87e-11 ***
## cylinders -0.46562 0.16171 -2.879 0.004080 **
## classLarge -0.25964 0.44778 -0.580 0.562169
## classMid St Wagon -0.01333 0.98020 -0.014 0.989151
## classMidsize 0.44561 0.37282 1.195 0.232314
## classMinicompact -1.50638 0.65828 -2.288 0.022348 *
## classMinivan -6.40468 1.45903 -4.390 1.27e-05 ***
## classPassenger Van -6.76966 2.12311 -3.189 0.001479 **
## classSm Pickup Truck -6.12654 0.79912 -7.667 4.59e-14 ***
## classSm St Wagon 0.09396 0.65624 0.143 0.886182
## classSm SUV -3.29938 0.41144 -8.019 3.32e-15 ***
## classSPV -6.18484 0.75260 -8.218 7.21e-16 ***
## classStd Pickup Truck -4.06379 0.59454 -6.835 1.51e-11 ***
## classStd SUV -3.60436 0.48867 -7.376 3.72e-13 ***
## classSubcompact -0.38888 0.45717 -0.851 0.395201
## classTwo Seater -2.03608 0.61029 -3.336 0.000884 ***
## lv2 -0.07743 0.03857 -2.007 0.045028 *
## lv4 -0.05236 0.01864 -2.809 0.005081 **
## sidiY 0.72650 0.29891 2.430 0.015274 *
## aspirationSuper -1.79538 0.55081 -3.260 0.001158 **
## aspirationSuper+Turbo -2.27298 1.26201 -1.801 0.072028 .
## aspirationTurbo -2.97902 0.31597 -9.428 < 2e-16 ***
## fuelType1Midgrade Gasoline -1.14707 1.27397 -0.900 0.368153
## fuelType1Premium Gasoline -2.74374 0.95625 -2.869 0.004211 **
## fuelType1Regular Gasoline -2.11444 0.96458 -2.192 0.028630 *
## atvTypeFFV -3.62581 0.90125 -4.023 6.23e-05 ***
## atvTypeHybrid 2.60985 0.64928 4.020 6.32e-05 ***
## atvTypeNone -3.01049 0.56948 -5.286 1.57e-07 ***
## atvTypePlug-in Hybrid NA NA NA NA
## startStopY 1.76579 0.24750 7.135 2.00e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.765 on 896 degrees of freedom
## Multiple R-squared: 0.826, Adjusted R-squared: 0.8191
## F-statistic: 118.2 on 36 and 896 DF, p-value: < 2.2e-16
plot(ols_model) #shows several diagnostic graphs
# Alternative OLS model: same predictors, but with log(mpg) as the outcome.
# The log outcome is suggested by the lognormal nature of the mpg distribution.
ols_log_model <- lm(formula = log(mpg) ~ ., data = train1)
summary(ols_log_model)
##
## Call:
## lm(formula = log(mpg) ~ ., data = train1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31347 -0.05100 -0.00161 0.06017 0.38168
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8220213 0.0351682 108.678 < 2e-16 ***
## transmissionCVT 0.1109602 0.0132760 8.358 2.42e-16 ***
## transmissionManual -0.0106554 0.0092615 -1.151 0.250240
## gears -0.0048106 0.0022002 -2.186 0.029043 *
## driveAWD 0.0072863 0.0102932 0.708 0.479203
## driveFWD 0.1238549 0.0121160 10.222 < 2e-16 ***
## drivePT 4WD 0.0300485 0.0207803 1.446 0.148524
## driveRWD 0.0470704 0.0106552 4.418 1.12e-05 ***
## displ -0.0827641 0.0088084 -9.396 < 2e-16 ***
## cylinders -0.0278904 0.0053673 -5.196 2.52e-07 ***
## classLarge -0.0093175 0.0148624 -0.627 0.530872
## classMid St Wagon -0.0068622 0.0325338 -0.211 0.832993
## classMidsize 0.0112245 0.0123743 0.907 0.364608
## classMinicompact -0.0463353 0.0218488 -2.121 0.034219 *
## classMinivan -0.1803056 0.0484266 -3.723 0.000209 ***
## classPassenger Van -0.3492118 0.0704682 -4.956 8.62e-07 ***
## classSm Pickup Truck -0.2367447 0.0265234 -8.926 < 2e-16 ***
## classSm St Wagon -0.0149604 0.0217811 -0.687 0.492355
## classSm SUV -0.1123641 0.0136562 -8.228 6.67e-16 ***
## classSPV -0.2506623 0.0249796 -10.035 < 2e-16 ***
## classStd Pickup Truck -0.1538307 0.0197333 -7.795 1.78e-14 ***
## classStd SUV -0.1404778 0.0162194 -8.661 < 2e-16 ***
## classSubcompact -0.0104613 0.0151738 -0.689 0.490730
## classTwo Seater -0.0749875 0.0202562 -3.702 0.000227 ***
## lv2 -0.0017083 0.0012803 -1.334 0.182452
## lv4 -0.0011513 0.0006187 -1.861 0.063076 .
## sidiY 0.0348726 0.0099211 3.515 0.000462 ***
## aspirationSuper -0.0928291 0.0182818 -5.078 4.65e-07 ***
## aspirationSuper+Turbo -0.1203417 0.0418875 -2.873 0.004162 **
## aspirationTurbo -0.1193375 0.0104872 -11.379 < 2e-16 ***
## fuelType1Midgrade Gasoline -0.0705227 0.0422843 -1.668 0.095700 .
## fuelType1Premium Gasoline -0.1555444 0.0317389 -4.901 1.13e-06 ***
## fuelType1Regular Gasoline -0.1407809 0.0320154 -4.397 1.23e-05 ***
## atvTypeFFV -0.1026318 0.0299132 -3.431 0.000629 ***
## atvTypeHybrid 0.0734123 0.0215504 3.407 0.000687 ***
## atvTypeNone -0.0913198 0.0189017 -4.831 1.60e-06 ***
## atvTypePlug-in Hybrid NA NA NA NA
## startStopY 0.0544453 0.0082147 6.628 5.88e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09178 on 896 degrees of freedom
## Multiple R-squared: 0.8718, Adjusted R-squared: 0.8667
## F-statistic: 169.3 on 36 and 896 DF, p-value: < 2.2e-16
#fitting decision tree models. Each decision tree model will be formulated with an
#equivalent outcome to one of the OLS models
library(rpart)
library(rpart.plot)
# Regression tree for mpg on all columns of train1, and a plot of the tree.
dt_model <- rpart(formula = mpg ~ ., data = train1)
rpart.plot(dt_model)
# Regression tree for log(mpg) on the same predictors.
dt_log_model <- rpart(formula = log(mpg) ~ ., data = train1)
rpart.plot(dt_log_model)
# The values shown at the nodes of this second tree are predicted log(mpg);
# apply exp() to convert them to interpretable fuel-efficiency values.
#we formulate two random forest models (one for each form of the outcome: mpg and log(mpg))
library(randomForest)
# Each forest is fitted immediately after its own set.seed() call so that the
# fit is reproducible independently of the other.
set.seed(2001)
# Random forest for mpg on all remaining columns of train1.
rf_model <- randomForest(mpg~., data = train1)
set.seed(99)
# Random forest for log(mpg) on the same columns.
rf_log_model <- randomForest(log(mpg)~., data = train1)
# Append the six models' estimates as new columns next to the actual mpg values
# and the predictors. Estimates from the log-outcome models are mapped back to
# the mpg scale with exp().
# Note: these are predictions on train1, the same rows the models were fitted on.
train_results <- train1 %>%
  mutate(
    ols = predict(ols_model, train1),
    ols_log = exp(predict(ols_log_model, train1)),
    dt = predict(dt_model, train1),
    dt_log = exp(predict(dt_log_model, train1)),
    rf = predict(rf_model, train1),
    rf_log = exp(predict(rf_log_model, train1))
  )
#visualize the model performance by graphing the model estimates vs the actual values
#of the fuel efficiency
#We reshape the data to be used easily with 'ggplot2'
library(tidyr)
# Long format: one row per (car, model) pair, with the model name in `method`
# and its prediction in `estimate`.
train_results_long <- train_results %>%
  pivot_longer(
    cols = ols:rf_log,
    names_to = "method",
    values_to = "estimate"
  )
#train_results_long1 <- train_results_long[,c('mpg', 'method','estimate')]
head(train_results_long)
## # A tibble: 6 x 16
## mpg transmission gears drive displ cylinders class lv2 lv4 sidi
## <dbl> <chr> <int> <chr> <dbl> <int> <chr> <int> <int> <chr>
## 1 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## 2 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## 3 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## 4 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## 5 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## 6 34.3 CVT 10 FWD 2 4 Comp~ 0 13 Y
## # ... with 6 more variables: aspiration <chr>, fuelType1 <chr>, atvType <chr>,
## # startStop <chr>, method <chr>, estimate <dbl>
tail(train_results_long)
## # A tibble: 6 x 16
## mpg transmission gears drive displ cylinders class lv2 lv4 sidi
## <dbl> <chr> <int> <chr> <dbl> <int> <chr> <int> <int> <chr>
## 1 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## 2 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## 3 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## 4 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## 5 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## 6 18.7 Automatic 8 AWD 3 6 Std ~ 0 0 Y
## # ... with 6 more variables: aspiration <chr>, fuelType1 <chr>, atvType <chr>,
## # startStop <chr>, method <chr>, estimate <dbl>
# Model estimates against the actual MPG values on the training data, one
# panel per model; the diagonal marks estimate == actual.
ggplot(train_results_long, aes(mpg, estimate)) +
  geom_point(shape = 21, colour = "blue") +
  facet_wrap(~method, ncol = 2) +
  geom_abline(slope = 1, intercept = 0) +
  lims(x = c(0, 60), y = c(0, 60)) +
  theme_minimal()
#Points on the diagonal line correspond to cars for which the model estimated value
#and the actual mpg value are very close. Points that are above or below the line
#correspond to cars for which the model overestimates or underestimates the mpg
#respectively.
#Inspecting these plots visually, it seems that the random forest models fit more
#closely than the OLS models, which fit more closely than the decision tree models.
#The yardstick::metrics function can be used to get summary statistics for a model.
#It requires that the model estimates be available as a column of a data frame
#that also contains a column of corresponding truth values.
#The three reported metrics are:
#1. Root mean squared error
# 2. R2
# 3. Mean absolute error
library(yardstick)
# Training-set rmse, rsq and mae for each of the six models, one model at a
# time. The printed output of each call is kept below it.
metric_ols <- train_results %>% metrics(truth = mpg, estimate = ols)
metric_ols
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.71
## 2 rsq standard 0.826
## 3 mae standard 1.94
metric_ols_log <- train_results %>% metrics(truth = mpg, estimate = ols_log)
metric_ols_log
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.42
## 2 rsq standard 0.865
## 3 mae standard 1.67
metric_dt <- train_results %>% metrics(truth = mpg, estimate = dt)
metric_dt
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.69
## 2 rsq standard 0.829
## 3 mae standard 1.99
metric_dt_log <- train_results %>% metrics(truth = mpg, estimate = dt_log)
metric_dt_log
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.97
## 2 rsq standard 0.791
## 3 mae standard 2.10
metric_rf <- train_results %>% metrics(truth = mpg, estimate = rf)
metric_rf
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.10
## 2 rsq standard 0.973
## 3 mae standard 0.782
metric_rf_log <- train_results %>% metrics(truth = mpg, estimate = rf_log)
metric_rf_log
## # A tibble: 3 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 1.13
## 2 rsq standard 0.972
## 3 mae standard 0.784
#' Compute yardstick's default metrics for one model's estimates.
#'
#' @param fit_name Single string: name of the column of `in_df` holding the
#'   model estimates. It is also stored in the `fit` column of the result.
#' @param in_df Data frame containing both the truth and the estimate columns.
#' @param truth Single string: name of the column of `in_df` holding the
#'   observed (true) values.
#' @return The tibble returned by `yardstick::metrics()` with an added `fit`
#'   column identifying the model.
metrics2 <- function(fit_name, in_df, truth) {
  stopifnot(
    is.character(fit_name), length(fit_name) == 1L,
    is.character(truth), length(truth) == 1L
  )
  # Pass truth/estimate by name: the signature is metrics(data, truth,
  # estimate), so the former positional call (in_df, fit_name, truth) bound the
  # two columns the wrong way round.
  # The column names are injected as symbols so that they always refer to the
  # columns named by the strings, even if `in_df` itself has a column called
  # "truth" or "fit_name".
  out_df <- yardstick::metrics(
    in_df,
    truth = !!rlang::sym(truth),
    estimate = !!rlang::sym(fit_name)
  )
  out_df$fit <- fit_name
  out_df
}
#' Collect one metric for several models into a single table.
#'
#' @param in_df Data frame holding the truth column and one estimate column
#'   per model.
#' @param fit_names Character vector of estimate column names (one per model).
#' @param in_truth Single string: name of the truth column in `in_df`.
#' @param metric Single string: value of `.metric` to keep (e.g. "rmse").
#' @param prefix Label put in front of the metric name when naming the value
#'   column (e.g. "train" gives "train rmse").
#' @return A tibble with one row per model, sorted by `fit`, in which the
#'   numeric metric value is stored in a column named `"<prefix> <metric>"`.
comp_models <- function(in_df, fit_names, in_truth, metric, prefix = "") {
  out_df <- purrr::map_df(fit_names, metrics2, in_df = in_df, truth = in_truth)
  # .env$metric refers to the function argument; .data$ refers to columns.
  out_df <- out_df %>%
    dplyr::filter(.data$.metric == .env$metric) %>%
    dplyr::arrange(.data$fit, .data$.estimate)
  # Label the numeric value column. Renaming by position (column 2) relabelled
  # the `.estimator` column instead and left the values under `.estimate`.
  # trimws() avoids a leading space in the name when `prefix` is empty.
  value_label <- trimws(paste(prefix, metric))
  names(out_df)[names(out_df) == ".estimate"] <- value_label
  out_df
}
# Names of the six estimate columns, one per fitted model.
model_names <- c("ols", "ols_log", "dt", "dt_log", "rf", "rf_log")
# RMSE of every model on the training data.
train_rmse <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rmse",
  prefix = "train"
)
train_rmse
## # A tibble: 6 x 4
## .metric `train rmse` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.69 dt
## 2 rmse standard 2.97 dt_log
## 3 rmse standard 2.71 ols
## 4 rmse standard 2.42 ols_log
## 5 rmse standard 1.10 rf
## 6 rmse standard 1.13 rf_log
# R-squared of every model on the training data.
train_rsq <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "rsq",
  prefix = "train"
)
train_rsq
## # A tibble: 6 x 4
## .metric `train rsq` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 rsq standard 0.829 dt
## 2 rsq standard 0.791 dt_log
## 3 rsq standard 0.826 ols
## 4 rsq standard 0.865 ols_log
## 5 rsq standard 0.973 rf
## 6 rsq standard 0.972 rf_log
# Mean absolute error of every model on the training data.
train_mae <- comp_models(
  in_df = train_results,
  fit_names = model_names,
  in_truth = "mpg",
  metric = "mae",
  prefix = "train"
)
train_mae
## # A tibble: 6 x 4
## .metric `train mae` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 mae standard 1.99 dt
## 2 mae standard 2.10 dt_log
## 3 mae standard 1.94 ols
## 4 mae standard 1.67 ols_log
## 5 mae standard 0.782 rf
## 6 mae standard 0.784 rf_log
# Evaluate the models on the unseen testing data: append each of the six
# models' estimates to the test set in the same way as for the training data.
# Estimates from the log-outcome models are mapped back to the mpg scale with
# exp().
test_results <- test %>%
  mutate(
    ols = predict(ols_model, test),
    ols_log = exp(predict(ols_log_model, test)),
    dt = predict(dt_model, test),
    dt_log = exp(predict(dt_log_model, test)),
    rf = predict(rf_model, test),
    rf_log = exp(predict(rf_log_model, test))
  )
test_results_long <- pivot_longer(test_results,
ols:rf_log,
names_to = "method",
values_to = "estimate")
# Model estimates against the actual mpg values, one panel per model.
# The 45-degree reference line marks estimate == mpg.
test_results_long %>%
  ggplot(aes(x = mpg, y = estimate)) +
  geom_point(shape = 21, colour = "red") +
  geom_abline(intercept = 0, slope = 1) +
  facet_wrap(~ method, ncol = 2) +
  lims(x = c(0, 60), y = c(0, 60)) +
  theme_minimal()
# RMSE of each model on the testing data.
test_rmse <- test_results %>%
  comp_models(model_names, in_truth = "mpg", metric = "rmse", prefix = "test")
test_rmse
## # A tibble: 6 x 4
## .metric `test rmse` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 2.76 dt
## 2 rmse standard 2.45 dt_log
## 3 rmse standard 2.69 ols
## 4 rmse standard 2.24 ols_log
## 5 rmse standard 1.82 rf
## 6 rmse standard 1.81 rf_log
# R-squared of each model on the testing data.
test_rsq <- test_results %>%
  comp_models(model_names, in_truth = "mpg", metric = "rsq", prefix = "test")
test_rsq
## # A tibble: 6 x 4
## .metric `test rsq` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 rsq standard 0.804 dt
## 2 rsq standard 0.835 dt_log
## 3 rsq standard 0.801 ols
## 4 rsq standard 0.860 ols_log
## 5 rsq standard 0.910 rf
## 6 rsq standard 0.910 rf_log
# Mean absolute error of each model on the testing data.
test_mae <- test_results %>%
  comp_models(model_names, in_truth = "mpg", metric = "mae", prefix = "test")
test_mae
## # A tibble: 6 x 4
## .metric `test mae` .estimate fit
## <chr> <chr> <dbl> <chr>
## 1 mae standard 1.86 dt
## 2 mae standard 1.77 dt_log
## 3 mae standard 1.95 ols
## 4 mae standard 1.58 ols_log
## 5 mae standard 1.33 rf
## 6 mae standard 1.33 rf_log
# Put the training and testing metrics side by side, matched on the model
# name (fit), to see how each metric changed on unseen data.
train_rmse %>%
  inner_join(test_rmse, by = "fit")
## # A tibble: 6 x 7
## .metric.x `train rmse` .estimate.x fit .metric.y `test rmse` .estimate.y
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 rmse standard 2.69 dt rmse standard 2.76
## 2 rmse standard 2.97 dt_log rmse standard 2.45
## 3 rmse standard 2.71 ols rmse standard 2.69
## 4 rmse standard 2.42 ols_log rmse standard 2.24
## 5 rmse standard 1.10 rf rmse standard 1.82
## 6 rmse standard 1.13 rf_log rmse standard 1.81
# Training vs. testing R-squared, matched on the model name (fit).
train_rsq %>%
  inner_join(test_rsq, by = "fit")
## # A tibble: 6 x 7
## .metric.x `train rsq` .estimate.x fit .metric.y `test rsq` .estimate.y
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 rsq standard 0.829 dt rsq standard 0.804
## 2 rsq standard 0.791 dt_log rsq standard 0.835
## 3 rsq standard 0.826 ols rsq standard 0.801
## 4 rsq standard 0.865 ols_log rsq standard 0.860
## 5 rsq standard 0.973 rf rsq standard 0.910
## 6 rsq standard 0.972 rf_log rsq standard 0.910
# Training vs. testing MAE, matched on the model name (fit).
train_mae %>%
  inner_join(test_mae, by = "fit")
## # A tibble: 6 x 7
## .metric.x `train mae` .estimate.x fit .metric.y `test mae` .estimate.y
## <chr> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 mae standard 1.99 dt mae standard 1.86
## 2 mae standard 2.10 dt_log mae standard 1.77
## 3 mae standard 1.94 ols mae standard 1.95
## 4 mae standard 1.67 ols_log mae standard 1.58
## 5 mae standard 0.782 rf mae standard 1.33
## 6 mae standard 0.784 rf_log mae standard 1.33
# Residual diagnostics for the six models on the training and testing data.
#
# Residuals use the standard convention, observed - predicted (the same sign
# as stats::residuals() for an lm fit): a positive residual means the model
# under-predicted mpg, a negative one means it over-predicted.
# Both data sets use the same convention and the same axis limits so that the
# two figures can be compared directly.

# Training data: append one residual column per model to the data frame that
# already holds the actual mpg values and the model estimates.
train_residuals <- mutate(
  train_results,
  ols_residual = mpg - ols,
  ols_log_residual = mpg - ols_log,
  dt_residual = mpg - dt,
  dt_log_residual = mpg - dt_log,
  rf_residual = mpg - rf,
  rf_log_residual = mpg - rf_log
)
# Long format: one row per (observation, model) pair, for faceted plotting.
train_residuals_long <- pivot_longer(
  train_residuals,
  cols = ols_residual:rf_log_residual,
  names_to = "method1",
  values_to = "residual"
)
# Training residuals against the actual mpg values, one panel per model.
# Points outside the axis limits are dropped by ggplot2.
ggplot(data = train_residuals_long, aes(x = mpg, y = residual)) +
  geom_point(shape = 21, colour = "green") +
  facet_wrap(~method1, ncol = 2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlim(c(0, 60)) +
  ylim(c(-25, 30)) +
  theme_minimal()

# Testing data: the same residual columns, computed from the test estimates.
test_residuals <- mutate(
  test_results,
  ols_residual = mpg - ols,
  ols_log_residual = mpg - ols_log,
  dt_residual = mpg - dt,
  dt_log_residual = mpg - dt_log,
  rf_residual = mpg - rf,
  rf_log_residual = mpg - rf_log
)
test_residuals_long <- pivot_longer(
  test_residuals,
  cols = ols_residual:rf_log_residual,
  names_to = "method1",
  values_to = "residual"
)
# Testing residuals against the actual mpg values, one panel per model.
# Points outside the axis limits are dropped by ggplot2.
ggplot(data = test_residuals_long, aes(x = mpg, y = residual)) +
  geom_point(shape = 21, colour = "orange") +
  facet_wrap(~method1, ncol = 2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlim(c(0, 60)) +
  ylim(c(-25, 30)) +
  theme_minimal()