In [None]:
# remember to install some packages (see github repo readme!)

# Let's load some libraries, the car data and the precalculated features
library(dplyr)
library(ggplot2)
library(tidyr)
library(corrplot)

# Restore the saved workspace objects. Later cells use `autodata`, which is
# presumably stored in this file -- confirm.
load(file = "data/trafi.RData")


In [None]:
# Let's look at the first 15 rows of a few columns, including the chassis
# type (kori).
# Original note: kori.orig is the chassis in the Trafi data.

autodata %>%
  select(
    ryhma, merkki, mallimerkinta, kuntanimi, kori,
    omamassa, iskutilavuus, kayttoonottoVuosi, matkamittarilukema
  ) %>%
  head(n = 15)



In [None]:
# If we want to cluster the cars, what would happen if we clustered by
# technical data (mass, kW, age, mileage)?
# Instead of that, let's cluster the brands according to county; this is a
# kind of social clustering...
# Let's check the number of cars per brand: n is the count and k the rank
# of the brand when sorted from most to least common.

autodata %>%
  group_by(merkki) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  mutate(k = row_number())

In [None]:
# Count the brands in each commune. For practical (and statistical) reasons
# we limit the analysis to common brands:
#   1) compute N.merkki, the number of cars of each brand,
#   2) relabel brands with fewer than 300 cars as "other",
#   3) count the commune-brand combinations,
#   4) spread the counts into one row per commune, one column per brand.

kunnat.merkit <- autodata %>%
  group_by(merkki) %>%
  mutate(N.merkki = n()) %>%
  ungroup() %>%
  # as.character() keeps the brand names intact if merkki is a factor;
  # ifelse() would otherwise return the factor's integer codes.
  mutate(merkki = ifelse(N.merkki < 300, "other", as.character(merkki))) %>%
  count(merkki, kuntanimi)

# Commune-brand combinations that never occur get a count of 0.
merkki.profile <-
  spread(kunnat.merkit, merkki, n, fill = 0)


In [None]:
# Let's check the data: first its dimensions (rows, columns),
# then the table itself.
dim(merkki.profile)
merkki.profile

In [None]:
# Next we compute a distance (well, actually a similarity) between the brand
# vectors by using ordinary linear correlation, which lies in [-1, 1].
# The `use` option is spelled out in full rather than relying on partial
# matching of the abbreviated string.

cor.matrix <- merkki.profile %>%
  select(-kuntanimi) %>%
  cor(use = "na.or.complete")


In [None]:
# Show the correlation matrix. Look at it first in alphabetical order, then
# reorder it by hierarchical clustering (order = "hclust", as below).
# Try also hclust.method = "single" or "complete" (centroid, average, ...).
# The matrix contains the brand A to brand B similarities r(A, B):
#   r =  1  brands A and B appear in a similar fashion in the same communes
#   r =  0  no relation
#   r = -1  brands A and B appear in opposite fashion in communes
#           (A high, B low and vice versa)
#
# You can show explicit clusters by setting addrect = <number of clusters>.

corrplot(
  cor.matrix,
  method = "shade",
  order = "hclust",
  hclust.method = "ward.D",
  tl.cex = 0.6,
  tl.col = "black"
  # , addrect = 10
)

In [None]:
# But why is everything blue?
# The number of cars differs between the communes!

autodata %>%
  count(kuntanimi) %>%
  arrange(desc(n))

In [None]:
# We'll rescale the data so that it reflects the brand distribution inside
# each commune (relative frequencies) instead of raw frequencies (which are
# obviously correlated). The relative frequencies are on the same scale.
#
# Base-R column arithmetic is used here instead of mutate_at() + funs(),
# because funs() is deprecated in dplyr.

# Every column except the commune name holds the counts of one brand.
brand.cols <- setdiff(names(merkki.profile), "kuntanimi")

merkki.profile.norm <- merkki.profile

# N is the number of cars in a commune:
merkki.profile.norm$N <- rowSums(merkki.profile[brand.cols])

# Divide each brand column by the commune total (row-wise).
merkki.profile.norm[brand.cols] <-
  merkki.profile[brand.cols] / merkki.profile.norm$N

head(merkki.profile.norm, 10)

# Now each vector shows the proportion of every brand in a commune and is
# more comparable between big and small communes.

In [None]:
# Recompute the correlation, taking all communes first.

# Some of the communes are really small, so they add noise to the results;
# set a higher limit for N to see the effect.
# Maybe we should have a weighted / more statistically based model for this...

# Only communes with more cars than this are used. 0 keeps every commune;
# change it to e.g. 1000 to get better results.
min.commune.size <- 0

# The `use` option is spelled out in full rather than relying on partial
# matching of the abbreviated string.
cor.matrix <- merkki.profile.norm %>%
  filter(N > min.commune.size) %>%
  select(-kuntanimi, -N) %>%
  cor(use = "na.or.complete")

corrplot(
  cor.matrix,
  method = "shade",
  order = "hclust",
  hclust.method = "complete",
  tl.cex = 0.6,
  tl.col = "black"
  # , addrect = 10
)


In [None]:
# Some of the communes are really small, so they add noise to the results;
# set a higher limit for N to see the effect.
# Maybe we should have a weighted / more statistically based model for this...

autodata %>%
  count(kuntanimi) %>%
  arrange(desc(n))

# Go back to the previous cell and change the N limit to filter out the
# small communes.
# Change also the clustering.

In [None]:
# Factoring... well, sort of: we'll plot the brand correlations as a scatterplot
# with a (non-linear, distance-preserving) t-SNE projection
library(tsne)

In [None]:
# Turn the correlation (a similarity) into a distance: 1 - r.
dist.cor <- as.dist(1 - cor.matrix)

In [None]:
# k sets the output dimension (it should be 2 here for plotting); a bigger
# perplexity makes the map more global.
# Try perplexity values 2...20.

X.cor <- tsne(
  dist.cor,
  k = 2,
  perplexity = 10,
  max_iter = 3000,
  whiten = TRUE
)

In [None]:
# One row per brand: the brand name plus its projected coordinates
# (the next cell refers to them as X1 and X2).
D <- data.frame(
  merkki = rownames(cor.matrix),
  X.cor
)

In [None]:
# Scatterplot of the projected brands, each point labelled with its brand name.
ggplot(D, aes(x = X1, y = X2, label = merkki)) +
  geom_point(size = 1, color = "yellow") +
  geom_text(size = 2)