In [None]:
# Lets load some libraries and car data and precalculated features
library(dplyr)
library(ggplot2)
library(glmnet)
library(corrplot)

# Restore the prepared workspace objects from disk.
load(file = "data/trafi.RData")
# Keep the class vector under a second name; `class` (presumably restored by the
# load() above) otherwise shares its name with base::class().
kori.class <- class

In [None]:
# Let's look at the first 20 rows of the car data, identifying columns only
autodata %>%
  select(
    ryhma, merkki, malli, mallimerkinta, kori.orig, kayttoonottoVuosi
  ) %>%
  head(n = 20)

In [None]:
# Let's see the car type per year (kayttoonottoVuosi, 1980 onwards);
# there's plenty of missing data

autodata %>%
  filter(kayttoonottoVuosi > 1979) %>%
  count(kori.orig, kayttoonottoVuosi) %>%
  ggplot(aes(x = kayttoonottoVuosi, y = n, fill = kori.orig)) +
  geom_bar(stat = "identity")

# Exercise 1: change geom_bar(stat = "identity") into
#             geom_bar(stat = "identity", position = "fill")
# Exercise 2: switch kori.orig to kori (which contains a ready-made prediction
#             for the missing values)

# Observations?

In [None]:

# Let's demonstrate LASSO-regularized regression and cross-validation.

# X is a sparse matrix in which the text "merkki+mallimerkinta" has been split
# on spaces and some punctuation, e.g.
#   5D PASSAT VARIANT 1.8 CL-351-C/263 => [5D, PASSAT, VARIANT, 1.8, CL, 351, C/263]
# and then one-hot coded; the matching class vector is `class`.
#
# That preprocessing was done elsewhere, and rows whose class (kori.orig) is
# missing have been left out.

head(X)
head(class)

# Make the problem a bit smaller: every 20th row is a 5% sample, and only the
# columns whose column sum exceeds 10 are kept.
sample.idx <- seq(from = 1, to = nrow(X), by = 20)
select.vars <- colSums(X) > 10

# Size before and after the reduction
dim(X)
dim(X[sample.idx, select.vars])

In [None]:
# Multinomial classification, ten folds, LASSO 

# Because we use L1-norm regularization (LASSO, alpha=1 in glmnet), most coefficients are
# pushed to exactly zero. You'll see this later.
# If you set alpha=0 you get ridge regression (L2 norm), where (almost) all coefficients
# usually have some value other than exactly zero.

# Fit on the sampled rows and the retained columns only; the response is the
# class vector restricted to the same rows.
classifier <- cv.glmnet(
  x = X[sample.idx, select.vars],
  y = as.factor(kori.class[sample.idx]),
  family = "multinomial",
  alpha = 1,
  nfolds = 10,
  type.measure = "deviance"
)


In [None]:
# Let's see some statistics: plot the cross-validated fit.
# Note: the fit above was cross-validated with type.measure="deviance", and deviance is
# not the same thing as classification error.
plot(classifier)

In [None]:
# Let's predict with the cross-validated model and see the accuracy (in percent)!
# s = "lambda.1se" is what predict() uses for a cv.glmnet object when s is not given.
# Note that every row of X is scored here, including the 5% sample used for fitting.

100 * mean(
  kori.class ==
    predict(classifier, newx = X[, select.vars], s = "lambda.1se", type = "class")
)


In [None]:
# Let's see the confusion matrix!
# Rows: the true car type, upper-cased so it is easy to tell apart from the
# predicted type in the columns.
# margin = 2 normalizes each column to sum to 1, i.e. a cell is the share of
# that predicted type which truly belongs to the row's type.

# Prefer fixed notation over scientific notation when printing small shares.
options(scipen = 999)

# Use the kori.class alias, like the fitting and accuracy cells do, rather than
# the raw `class` object that shares its name with base::class().
table(
  toupper(kori.class),
  predict(classifier, newx = X[, select.vars], s = "lambda.1se", type = "class")
) %>%
  prop.table(margin = 2)


In [None]:
# Let's see the coefficients of the Sedan class
# (s = "lambda.1se" is the default selection for a cv.glmnet object)

coef(object = classifier, s = "lambda.1se")$Sedan

In [None]:
# The same coefficients as a plain table, largest first.
# Change Sedan to another body type if you wish to see other results.

cf <- coef(classifier, s = "lambda.1se")$Sedan
data.frame(attr = rownames(cf), coef = as.numeric(cf)) %>%
  arrange(desc(coef))

In [None]:
# For reference, there is a ready-made prediction in the data:
# autodata has a field kori.orig, the original car type, and a field
# kori.est, a prediction of the car type (sedan, wagon, ...).
# It is also made from the brand and model information string, using a
# gradient boosted tree classifier (XGBoost).

# Prediction accuracy (percentage); rows where the comparison is NA are ignored
100 * mean(autodata$kori.est == autodata$kori.orig, na.rm = TRUE)