In [3]:
## 분석 목적 : Deep Learning 을 이용한 무인 자동차 핵심 기술별 논문 분류 분석

## 본 Page는 R을 이용한 Deep Learning 연습용 페이지입니다.
## 위 분석 중 일부의 분석을 가공하여 공개합니다.

## 본 Page 설명
## Data : 핵심 기술별 특허 정보 한국 DB from WIPS
## 분석 : 특허 DB의 초록을 Deep Learning을 이용하여 텍스트 기반 특허 기술 분류 모형 학습

## 실제 최종 분석에는 특허 DB 중 미국 DB를 사용하였습니다.
## 한국 DATA는 기술 별 수가 적어, 결과 해석에 주의할 필요가 있습니다.

## 배포 및 수정, 재배포를 삼가주세요.

## 2015.04.28 by 김형준(Hyung-jun, Kim)
## soeque1@gmail.com
## http://soeque1.github.io/r_slide/

In [2]:
# Clear every object from the current workspace before the analysis starts.
rm(list = ls())

# Base directory under which the input data and exported files live.
save_dir <- "/Users/kimhyungjun/Dropbox/h2o/prac/"

In [3]:
# Package repository used by install.packages() for the rest of the session.
options(repos = "http://cran.nexr.com")

In [4]:
#' Attach packages, installing any that are not yet available.
#'
#' @param x Character vector of package names.
#' @return `NULL`, invisibly. Called for its side effect of attaching every
#'   package named in `x` to the search path.
install_lib <- function(x) {
  for (pkg in x) {
    # requireNamespace() only probes availability; attaching is done once,
    # below, instead of once in require() and again in library().
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, dependencies = TRUE)
    }
    # library() raises an error if the package is still unavailable, so a
    # failed installation is not silently ignored.
    library(pkg, character.only = TRUE)
  }
  invisible(NULL)
}

In [5]:
# Packages used by this notebook, attached (and installed when missing) via
# install_lib(); start-up messages are suppressed.
required_packages <- c(
  "readxl",
  "dplyr", "stringr",
  "tm", "lsa",
  "KoNLP",
  "h2o"
)
suppressMessages(install_lib(required_packages))

In [6]:
# Base directory for the input data and exported files (same value as the
# assignment made earlier in this notebook).
save_dir <- "/Users/kimhyungjun/Dropbox/h2o/prac/"

In [7]:
# Directory holding the WIPS Excel exports, and the files whose name contains
# "kr".
wips_dir <- paste0(save_dir, "data/wips")
data_list <- list.files(wips_dir)
data_list_kr <- data_list[grep("kr", data_list)]

# Read every matching workbook and prepend a "기술" column holding the first
# character of the file name. The pieces are collected in a list and bound
# once, rather than growing `data` with rbind() on every iteration, and
# iterating over the names themselves avoids the 1:0 trap of 1:length() when
# no file matches.
data_parts <- lapply(data_list_kr, function(file_name) {
  data_temp <- read_excel(file.path(wips_dir, file_name))
  data_temp <- cbind(rep(substr(file_name, 1, 1), nrow(data_temp)), data_temp)
  colnames(data_temp)[1] <- "기술"
  data_temp
})
data <- do.call(rbind, data_parts)
if (is.null(data)) {
  # do.call(rbind, list()) yields NULL; keep the original empty-frame default.
  data <- data.frame()
}
rm(data_parts)


In [8]:
# Columns kept for the analysis; "기술" is the label column added at load time.
keep_cols <- c(
  "기술", "Original IPC Main", "출원일",
  "발명의 명칭", "요약", "국가코드"
)
dt_kr <- data[, keep_cols]
dim(dt_kr)

[1] 126   6

In [9]:
# Pull the "요약" column out as a plain vector.
dt_text <- dt_kr[["요약"]]

In [10]:
# Run extractNoun() on every abstract and join each result into one
# space-separated string, giving a one-column data frame with one row per
# document.
# lapply()/vapply() are used instead of sapply(): sapply() would simplify the
# per-document results to a matrix whenever every abstract yields the same
# number of nouns, and the joining step would then run per cell instead of per
# document.
DT <- lapply(dt_text, extractNoun) %>%
  vapply(function(nouns) paste(nouns, collapse = " "), character(1),
         USE.NAMES = FALSE) %>%
  as.data.frame

In [11]:
# Build a term-document matrix from the noun strings in DT.
# Control options passed to tm: numbers and punctuation are removed,
# wordLengths = c(2, Inf) is given as the term-length bounds, and weighting is
# weightSMART with spec "nnn" (per the tm documentation: natural term
# frequency, no document-frequency weighting, no normalization).
# NOTE(review): tm may apply control options in list order — keep the order
# as-is unless that is confirmed not to matter.
tdm <- TermDocumentMatrix(Corpus(DataframeSource(DT)),
                                control = list(
                                    removeNumbers = TRUE,
                                    wordLengths = c(2,Inf),
                                    removePunctuation = TRUE,
                                    weighting = function(x)
                                        weightSMART(x, spec = "nnn")))
# Convert to a base matrix (terms in rows, documents in columns).
tdm <- as.matrix(tdm)
# Show the top-left 5 x 5 corner as a sanity check.
print(tdm[1:5,1:5])

              Docs
Terms          1 2 3 4 5
  가능         0 0 1 0 0
  가능성       0 0 0 0 0
  가능하다동일 0 0 0 0 0
  가변         0 0 0 0 0
  가변하는     0 0 0 0 0


In [12]:
# Re-weight the matrix: element-wise product of the lsa local weight
# (lw_logtf) and the lsa global weight (gw_entropy), both computed from the
# same input matrix.
local_weight <- lw_logtf(tdm)
global_weight <- gw_entropy(tdm)
tdm <- local_weight * global_weight

In [13]:
# Show the same 5 x 5 corner again, now holding the re-weighted values.
print(tdm[1:5,1:5])

              Docs
Terms          1 2        3 4 5
  가능         0 0 1.126579 0 0
  가능성       0 0 0.000000 0 0
  가능하다동일 0 0 0.000000 0 0
  가변         0 0 0.000000 0 0
  가변하는     0 0 0.000000 0 0


In [14]:
# Drop the intermediate noun data frame; it is not referenced again below.
rm(DT)

In [15]:
# Class label of every document, taken from the "기술" column.
actual_y <- dt_kr %>% dplyr::select(기술) %>% .[[1]]

# One row per document: the term weights followed by the label as the last
# column. The data frame is built column-wise instead of cbind()-ing the
# numeric matrix with the label vector, because cbind() coerces everything to
# a single type (all weights become strings, or factor labels become integer
# codes).
save_data <- data.frame(t(tdm), actual_y = as.character(actual_y))

# Written without row or column names; the file is read back positionally.
write.table(save_data, paste0(save_dir, "data/patent_kr.csv"),
            row.names = FALSE, col.names = FALSE)

In [16]:
## Deep Learning

In [17]:
# Connect to (or start) a local H2O instance and keep the connection handle.
# NOTE(review): nthreads = -1 presumably means "use all available cores" —
# confirm against the H2O documentation for the installed version.
h2oServer <- h2o.init(nthreads=-1, max_mem_size = "6g")

Successfully connected to http://127.0.0.1:54321 

R is connected to H2O cluster:
    H2O cluster uptime:         1 hours 2 minutes 
    H2O cluster version:        2.8.4.4 
    H2O cluster name:           H2O_started_from_R 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   5.33 GB 
    H2O cluster total cores:    4 
    H2O cluster allowed cores:  4 
    H2O cluster healthy:        TRUE 



In [18]:
# Load the exported file into the H2O cluster; the last column is the label.
data_hex <- h2o.importFile(h2oServer, path = paste(save_dir,"data/patent_kr.csv", sep=""))



In [19]:
# Draw one seeded random value per row with h2o.runif(); rows with a value
# <= 0.8 are stored as "train.hex" and the remaining rows as "test.hex".
random <- h2o.runif(data_hex, seed = 654321)
train.hex <- h2o.assign(data_hex[random <= .8,], "train.hex")
test.hex  <- h2o.assign(data_hex[random > .8,], "test.hex")

In [20]:
# Frequency table of the label (last) column of the test split.
# select() is namespace-qualified, as elsewhere in this notebook, so that a
# select() from another attached package cannot mask the dplyr verb.
label_t <- test.hex %>%
  as.data.frame %>%
  dplyr::select(ncol(test.hex)) %>%
  table

In [21]:
# Display the class counts of the test split.
label_t

test.hex %>% as.data.frame %>% select(ncol(test.hex))
 A  B  C  E  H 
 4  2 23  2  1 

In [22]:
# Share of the most frequent class in the test split, i.e. the accuracy
# obtained by always predicting that class.
label_t[label_t == max(label_t)] / sum(label_t)

      C 
0.71875 

In [23]:
# Train a deep-learning classifier: every column except the last is a
# predictor and the last column is the response. test.hex is supplied as the
# validation frame, so the reported validation error is measured on it.
# Logical arguments are spelled TRUE rather than T, which is an ordinary
# variable that can be reassigned.
my.dl <- h2o.deeplearning(
  x = 1:(ncol(train.hex) - 1),
  y = ncol(train.hex),
  data = train.hex,
  validation = test.hex,
  variable_importances = TRUE,
  activation = "RectifierWithDropout",
  input_dropout_ratio = 0.25,
  hidden_dropout_ratios = c(0.5, 0.5, 0.5),
  adaptive_rate = TRUE,
  balance_classes = TRUE,
  train_samples_per_iteration = 1500,
  hidden = c(250, 250, 250),
  epochs = 15
)



In [24]:
# Display the fitted model: classification errors, confusion matrix, hit
# ratios, and relative variable importance.
my.dl

IP Address: 127.0.0.1 
Port      : 54321 
Parsed Data Key: train.hex 

Deep Learning Model Key: DeepLearning_b1d4668517cab735bd3788d3a204df6

Training classification error: 0.01542416

Validation classification error: 0.15625

Confusion matrix:
Reported on test.hex 
        Predicted
Actual   A B  C D E G H   Error
  A      2 0  1 0 1 0 0 0.50000
  B      1 1  0 0 0 0 0 0.50000
  C      0 0 23 0 0 0 0 0.00000
  D      0 0  0 0 0 0 0     NaN
  E      1 0  0 0 1 0 0 0.50000
  G      0 0  0 0 0 0 0     NaN
  H      1 0  0 0 0 0 0 1.00000
  Totals 5 1 24 0 2 0 0 0.15625

Hit Ratios for Multi-class Classification:
  k hit_ratios
1 1    0.84375
2 2    0.84375
3 3    0.87500
4 4    0.96875
5 5    1.00000
6 6    1.00000
7 7    1.00000

Relative Variable Importance:
  C966     C1129      C590     C326     C1002      C496     C686       C64
1    1 0.8975929 0.8479354 0.846368 0.8334404 0.8257619 0.821659 0.8195269
      C1307      C509      C618      C922      C388     C1032      C399
1 0.815695

In [25]:
# Positional access into the model list. In the recorded run, element 5
# printed the confusion matrix on test.hex and element 7 held the validation
# classification error, so the second line gives the accuracy.
# NOTE(review): these indices depend on the H2O version — confirm before
# reusing them.
my.dl@model[[5]]
1 - my.dl@model[[7]]  ##  ACC

        Predicted
Actual   A B  C D E G H   Error
  A      2 0  1 0 1 0 0 0.50000
  B      1 1  0 0 0 0 0 0.50000
  C      0 0 23 0 0 0 0 0.00000
  D      0 0  0 0 0 0 0     NaN
  E      1 0  0 0 1 0 0 0.50000
  G      0 0  0 0 0 0 0     NaN
  H      1 0  0 0 0 0 0 1.00000
  Totals 5 1 24 0 2 0 0 0.15625

[1] 0.84375

In [26]:
# Score the test frame and bring the result into R as a data frame: the
# predicted label followed by one score column per class.
h2o.predict(my.dl, test.hex)%>%as.data.frame

   predict            A            B            C            D            E
1        A 8.910353e-01 7.370919e-02 4.151384e-04 2.001426e-02 1.469966e-02
2        C 6.575533e-02 4.006533e-03 6.922491e-01 1.994550e-03 2.357033e-01
3        A 9.916058e-01 3.679702e-05 4.273191e-03 1.494910e-03 2.100528e-03
4        E 8.663132e-04 2.727302e-04 1.563110e-02 2.535046e-05 9.809678e-01
5        B 6.141900e-03 8.492556e-01 7.596204e-02 8.158414e-04 6.774489e-02
6        A 4.749292e-01 3.193418e-02 3.677447e-01 6.163182e-02 5.019460e-02
7        C 1.811038e-01 3.085864e-06 8.183260e-01 2.069992e-05 5.096302e-04
8        C 5.756019e-03 2.522375e-03 9.888914e-01 1.087247e-03 1.734982e-03
9        C 5.930470e-02 3.130129e-05 9.375899e-01 3.312651e-04 2.671929e-03
10       C 1.825703e-03 4.023292e-06 9.747251e-01 1.472770e-05 2.341670e-02
11       C 1.510467e-01 9.470678e-06 8.479494e-01 2.856549e-04 6.926252e-04
12       C 8.396786e-04 9.822395e-07 9.987714e-01 2.709155e-06 3.833199e-04
13       C 1

In [27]:
# Map the first ten entries of model element 9 back to terms. Their names
# have the form "C<index>"; stripping the "C" gives the column position in
# save_data.
importance_names <- names(my.dl@model[[9]])
importance_index <- as.numeric(str_replace_all(importance_names, "C", ""))
top_index <- importance_index[1:10]
colnames(save_data)[top_index]

 [1] "주차"   "통신"   "안전"   "목적지" "지도"   "센서"   "운전자" "경고"  
 [9] "lf"     "속도"  

In [28]:
# Random forest on the same predictors and response as the deep-learning
# model. Two depth values are supplied; the recorded output shows one model
# per depth.
my.rf <- h2o.randomForest(
  x = 1:(ncol(train.hex) - 1),
  y = ncol(train.hex),
  data = train.hex,
  validation = test.hex,
  type = "fast",
  importance = TRUE,
  ntree = c(5),
  depth = c(5, 10)
)



In [29]:
# Print the grid-search summary (one row per trained forest).
print(my.rf)

IP Address: 127.0.0.1 
Port      : 54321 
Parsed Data Key: train.hex 

Grid Search Model Key: GridSearch_854db4fb4e907d8477a4ddd9d52d429b 

Summary
                                 model_key ntrees max_depth nbins
1 SpeeDRF_8e1207d25a80c2ab78a9f42b87084dec      5         5  1024
2 SpeeDRF_a5396d0c751d639fd2bf68a9c0224c25      5        10  1024
  prediction_error run_time
1             0.25     5893
2          0.34375     6311


In [30]:
# Confusion matrix of the first model in the grid result.
my.rf@model[[1]]@model$confusion

        Predicted
Actual   A B  C D E G H Error
  A      1 0  2 0 1 0 0  0.75
  B      0 0  2 0 0 0 0  1.00
  C      0 0 23 0 0 0 0  0.00
  D      0 0  0 0 0 0 0   NaN
  E      0 0  2 0 0 0 0  1.00
  G      0 0  0 0 0 0 0   NaN
  H      0 0  1 0 0 0 0  1.00
  Totals 1 0 30 0 1 0 0  0.25

In [31]:
# Accuracy (1 - prediction error) of the first two models in the summary.
for (model_summary in my.rf@sumtable[1:2]) {
  print(1 - model_summary$prediction_error)
}

[1] 0.75
[1] 0.65625


In [32]:
# Shut down the H2O instance. As the recorded output shows, this call asks
# for a Y/N confirmation before stopping the server.
h2o.shutdown(h2oServer)

Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:54321 (Y/N)? 
