## 1. IRIS 데이터에 대해서 5겹 교차검증(K-fold cross validation)을 사용하여 분류하기

In [14]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy

In [15]:
def create_model():
    """Build and compile the small dense classifier used for the Iris data.

    The network takes 4 input features, has one hidden ReLU layer with 16
    units and a 3-way softmax output. It is compiled with categorical
    cross-entropy, the Adam optimizer and accuracy as the tracked metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    network.add(Dense(16, input_shape=(4,), activation='relu'))
    network.add(Dense(3, activation='softmax'))
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [16]:
# Fix the NumPy and TensorFlow random seeds so repeated runs are comparable.
# Both libraries are imported here under the aliases used below, because the
# import cell above binds only the name `numpy` and never imports tensorflow.
import numpy as np
import tensorflow as tf

seed = 2020
np.random.seed(seed)
tf.random.set_seed(seed)

In [49]:
import pandas as pd
# Load the Iris CSV, naming the four measurement columns and the species label.
# NOTE(review): five names are supplied and the displayed frame is indexed
# 1, 2, 3, ..., so the file presumably has a leading Id column that pandas uses
# as the index, with the file's own header row read in as a data row labelled
# 'Id' — confirm against data/iris.csv.
df = pd.read_csv('data/iris.csv', 
                 names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"])
# Drop the row whose index label is 'Id'.
df = df.drop(['Id'])
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


In [50]:
# Split the frame into the feature matrix and the label vector.
dataset = df.values
# df.values is an object array here because the species column holds strings,
# so cast the four measurement columns to float explicitly before training.
X = dataset[:, 0:4].astype(float)
Y_obj = dataset[:, 4]
# Show the labels found at positions 0, 50 and 100.
Y_obj[0], Y_obj[50], Y_obj[100]

('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')

In [51]:
from sklearn.preprocessing import LabelEncoder

# Encode the species strings as integer class ids; fitting and transforming
# are written as two explicit steps on the same label vector.
e = LabelEncoder()
e.fit(Y_obj)
Y = e.transform(Y_obj)

In [52]:
Y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [55]:
# Wrap the Keras model builder in a scikit-learn style classifier so that it
# can be scored with K-fold cross-validation.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
model = KerasClassifier(
    build_fn=create_model,
    epochs=150,
    batch_size=10,
    verbose=0,
)
results = cross_val_score(model, X, Y, cv=kfold)

In [56]:
results

array([0.96666664, 1.        , 1.        , 0.93333334, 0.93333334])

## 2. Fashion MNIST Dataset CNN으로 분류하기

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.callbacks import ModelCheckpoint, EarlyStopping

import os
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [3]:
# Fix the NumPy and TensorFlow random seeds to the same value.
seed = 2020
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
# Load the data.
# NOTE(review): the section heading mentions Fashion MNIST, but this call loads
# the `mnist` dataset imported above — confirm which dataset is intended.
(X_train, Y_train), (X_test, Y_test) = mnist.load_data()
# Reshape to (samples, 28, 28, 1), cast to float32 and divide by 255.
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255
# Convert the integer labels to class matrices with to_categorical.
Y_train = keras.utils.to_categorical(Y_train)
Y_test = keras.utils.to_categorical(Y_test)

In [5]:
# Define the convolutional network layer by layer: two convolutions, max
# pooling with dropout, then a dense layer with dropout and a 10-way softmax.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), input_shape=(28, 28, 1),
                 activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 24, 24, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 12, 12, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9216)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1179776   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)              

In [6]:
# Compile with the Adam optimizer, categorical cross-entropy loss and accuracy
# as the reported metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [7]:
# Directory that receives the model checkpoints.
MODEL_DIR = './model/'
# exist_ok=True makes the cell safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs(MODEL_DIR, exist_ok=True)

In [8]:
# Early stopping watches val_loss with a patience of 10 epochs.
early_stopping_callback = EarlyStopping(patience=10, monitor='val_loss')

# Checkpoint file name carries the epoch number and the validation loss; with
# save_best_only=True a file is written only when val_loss improves.
modelpath = MODEL_DIR + "mnist-cnn-{epoch:02d}-{val_loss:.4f}.hdf5"
checkpointer = ModelCheckpoint(
    filepath=modelpath,
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [9]:
# Train the model, holding out 20% of the training data for validation and
# running the early-stopping and checkpoint callbacks each epoch.
history = model.fit(X_train, Y_train, validation_split=0.2, epochs=30,
                    batch_size=200,  # (disabled alternative: epochs=5, verbose=2)
                    callbacks=[early_stopping_callback, checkpointer])

Train on 48000 samples, validate on 12000 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 0.06749, saving model to ./model/mnist-cnn-01-0.0675.hdf5
Epoch 2/30

Epoch 00002: val_loss improved from 0.06749 to 0.05081, saving model to ./model/mnist-cnn-02-0.0508.hdf5
Epoch 3/30

Epoch 00003: val_loss improved from 0.05081 to 0.04556, saving model to ./model/mnist-cnn-03-0.0456.hdf5
Epoch 4/30

Epoch 00004: val_loss improved from 0.04556 to 0.04252, saving model to ./model/mnist-cnn-04-0.0425.hdf5
Epoch 5/30

Epoch 00005: val_loss improved from 0.04252 to 0.03454, saving model to ./model/mnist-cnn-05-0.0345.hdf5
Epoch 6/30

Epoch 00006: val_loss did not improve from 0.03454
Epoch 7/30

Epoch 00007: val_loss did not improve from 0.03454
Epoch 8/30

Epoch 00008: val_loss improved from 0.03454 to 0.03254, saving model to ./model/mnist-cnn-08-0.0325.hdf5
Epoch 9/30

Epoch 00009: val_loss did not improve from 0.03254
Epoch 10/30

Epoch 00010: val_loss did not improve from 0.03254

In [12]:
from keras.models import load_model
# Discard the in-memory model and reload a saved checkpoint instead. The file
# name is hard-coded to the epoch-8 checkpoint reported in the training log
# above, so it has to be updated whenever training is re-run.
del model
model = load_model('model/mnist-cnn-08-0.0325.hdf5')

In [13]:
# Print the test accuracy; the second entry returned by evaluate() is the
# accuracy metric the model was compiled with.
evaluation = model.evaluate(X_test, Y_test, verbose=0)
print("\n Test Accuracy: %.4f" % evaluation[1])


 Test Accuracy: 0.9901


## 3. IMDB 영화 리뷰 데이터에 대하여 딥 러닝을 이용하여 감성분석 하기

In [93]:
import pandas as pd

# Both files are tab-separated with a header row; quoting=3 is the value of
# csv.QUOTE_NONE, so quote characters are kept as ordinary text.
tsv_options = dict(header=0, delimiter='\t', quoting=3)
train = pd.read_csv('data/labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('data/testData.tsv', **tsv_options)

train.head()
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [94]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [95]:
train['review'][0][:100]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching '

In [96]:
train.isnull().sum()

id           0
sentiment    0
review       0
dtype: int64

In [97]:
from bs4 import BeautifulSoup

# Parse the first training review with the "lxml" parser.
example1 = BeautifulSoup(train['review'][0], "lxml")

example1.get_text()[:100] # text content only, first 100 characters

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching '

In [98]:
# Regular-expression support from the standard library.
import re

# Inside a character class a leading ^ negates the set, so this replaces every
# character that is not an ASCII letter with a space. The upper bound must be
# 'Z': the range 'A-z' would also cover [ \ ] ^ _ ` and leave them in the text.
letters_only = re.sub('[^a-zA-Z]', ' ', example1.get_text())
letters_only[:100]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching '

In [99]:
# Lower-case the cleaned text, then tokenize it by splitting on whitespace.
lower_case = str.lower(letters_only)
words = str.split(lower_case)

# Report the token count and show the first ten tokens.
print(len(words))
words[:10]

437


['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

In [100]:
import nltk
# Fetch the NLTK stop-word corpus; this has to run before the corpus is read.
nltk.download('stopwords')

from nltk.corpus import stopwords
# Show the first ten English stop words.
stopwords.words('english')[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\10\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [101]:
# Build the stop-word set once; calling stopwords.words() inside the
# comprehension's condition would rebuild the list for every token.
english_stops = set(stopwords.words('english'))
words = [w for w in words if w not in english_stops]

In [109]:
train['review'].shape

(25000,)

In [112]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# English stop words as a set, built once so that every review reuses it and
# each membership test is a hash lookup instead of a list scan.
_ENGLISH_STOPS = frozenset(stopwords.words('english'))


def review_to_words(raw_review):
    """Clean one raw review and return it as a string of stemmed tokens.

    Args:
        raw_review: Review text, possibly containing HTML markup.

    Returns:
        The lower-cased, stop-word-free, stemmed tokens joined by single
        spaces.
    """
    # 1. Strip HTML markup.
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Replace every non-letter character with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. Lower-case and tokenize on whitespace.
    words = letters_only.lower().split()
    # 4. Drop stop words.
    meaningful_words = [w for w in words if w not in _ENGLISH_STOPS]
    # 5. Reduce each remaining word to its stem.
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # 6. Join the stems back into one space-separated string.
    return ' '.join(stemming_words)


# Apply the cleaner to the first review as an example.
clean_review = review_to_words(train['review'][0])
clean_review

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

In [113]:
# Clean every training review, printing progress every 5000 reviews.
# Iterating the column directly visits the rows in order and does not depend
# on the frame's index labels being 0..n-1.
num_reviews = len(train['review'])
clean_train_reviews = []
for count, raw_review in enumerate(train['review'], start=1):
    if count % 5000 == 0:
        print('Review {} of {}'.format(count, num_reviews))
    clean_train_reviews.append(review_to_words(raw_review))

Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Review 25000 of 25000


In [114]:
# Apply the same cleaning function through Series.apply and store the result
# as a new column; %time is the IPython magic that reports the elapsed time.
%time train['review_clean'] = train['review'].apply(review_to_words)

Wall time: 1min 13s


In [115]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Bag-of-words settings (parameter values changed from the tutorial's).
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    min_df=2,  # minimum number of documents a token must appear in
    ngram_range=(1, 3),
    max_features=20000,
)

# Wrap the vectorizer in a single-step pipeline.
pipeline = Pipeline(steps=[('vect', vectorizer)])

# Fit the vocabulary on the cleaned training reviews and vectorize them.
train_data_features = pipeline.fit_transform(clean_train_reviews)
train_data_features

<25000x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2757814 stored elements in Compressed Sparse Row format>

In [116]:
train_data_features.shape

(25000, 20000)

In [117]:
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides and
# keep `vocab` a plain list of Python strings in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = [str(term) for term in vectorizer.get_feature_names_out()]
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

20000


['aag',
 'aaron',
 'ab',
 'abandon',
 'abbey',
 'abbi',
 'abbot',
 'abbott',
 'abc',
 'abduct']

In [118]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier on the bag-of-words features; fit() trains
# the estimator in place, so the existing name keeps pointing at it.
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2020)
forest.fit(train_data_features, train['sentiment'])

In [120]:
from sklearn.model_selection import cross_val_score

# Mean ROC AUC of the forest across 10 cross-validation folds.
fold_scores = cross_val_score(
    forest,
    train_data_features,
    train['sentiment'],
    cv=10,
    scoring='roc_auc',
)
score = np.mean(fold_scores)

In [121]:
score

0.9273054079999999

In [123]:
# Clean every test review, printing progress every 5000 reviews.
# The count comes from the test set itself instead of reusing the training
# set's `num_reviews`, so the loop stays correct if the two sizes differ.
num_test_reviews = len(test['review'])
clean_test_reviews = []
for count, raw_review in enumerate(test['review'], start=1):
    if count % 5000 == 0:
        print('Review {} of {}'.format(count, num_test_reviews))
    clean_test_reviews.append(review_to_words(raw_review))

Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Review 25000 of 25000


In [124]:
# Vectorize the cleaned test reviews with the vocabulary fitted on the
# training set. The matrix is deliberately left sparse: the forest was fitted
# on the sparse training matrix, and a dense copy of this many rows and
# features would take several gigabytes.
test_data_features = pipeline.transform(clean_test_reviews)

# Predict the sentiment of each vectorized test review.
result = forest.predict(test_data_features)

In [125]:
# Pair each test review id with its predicted sentiment.
columns = {'id': test['id'], 'sentiment': result}
output = pd.DataFrame(data=columns)
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1
