# Week10 Learning from Text Data

---
p6

# 1. Preprocessing texts

### Load the IMDb movie review data 

In [1]:
# movie review data
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [2]:
df.shape

(50000, 2)

---
p7

### Cleaning text data

In [4]:
# cleaning texts using regular expression
import re
def preprocessor(text):
    text = re.sub("<[^>]*>", "", text) # remove <...> (tags)
    text = re.sub("[\W]+", " ", text)  # remove all non-words
    text = text.lower()                # change to lower cases
    return text

preprocessor("</a>This is a $100 TEST!!! ^^")

'this is a 100 test '

---
p8

In [5]:
# review 1 text
df.loc[1, 'review']

"OK... so... I really like Kris Kristofferson and his usual easy going delivery of lines in his movies. Age has helped him with his soft spoken low energy style and he will steal a scene effortlessly. But, Disappearance is his misstep. Holy Moly, this was a bad movie! <br /><br />I must give kudos to the cinematography and and the actors, including Kris, for trying their darndest to make sense from this goofy, confusing story! None of it made sense and Kris probably didn't understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about! <br /><br />I don't care that everyone on this movie was doing out of love for the project, or some such nonsense... I've seen low budget movies that had a plot for goodness sake! This had none, zilcho, nada, zippo, empty of reason... a complete waste of good talent, scenery and celluloid! <br /><br />I rented this piece of garbage for a buck, and I want my money back! I want my 2 hou

In [6]:
# cleaning review 1 text 
preprocessor(df.loc[1, 'review'])

'ok so i really like kris kristofferson and his usual easy going delivery of lines in his movies age has helped him with his soft spoken low energy style and he will steal a scene effortlessly but disappearance is his misstep holy moly this was a bad movie i must give kudos to the cinematography and and the actors including kris for trying their darndest to make sense from this goofy confusing story none of it made sense and kris probably didn t understand it either and he was just going through the motions hoping someone would come up to him and tell him what it was all about i don t care that everyone on this movie was doing out of love for the project or some such nonsense i ve seen low budget movies that had a plot for goodness sake this had none zilcho nada zippo empty of reason a complete waste of good talent scenery and celluloid i rented this piece of garbage for a buck and i want my money back i want my 2 hours back i invested on this grade f waste of my time don t watch this 

---
p9

### Processing documents into tokens(English)

In [7]:
text = 'The sun is shining, the weather is sweet, and she likes RUNNING!'
print(text)

# cleaning
text_prep = preprocessor(text)
print(text_prep)

The sun is shining, the weather is sweet, and she likes RUNNING!
the sun is shining the weather is sweet and she likes running 


In [11]:
# tokenizing
import nltk
nltk.download('punkt')   # tokenizer

text_tokens = nltk.word_tokenize(text)
print(text_tokens)

['The', 'sun', 'is', 'shining', ',', 'the', 'weather', 'is', 'sweet', ',', 'and', 'she', 'likes', 'RUNNING', '!']


[nltk_data] Downloading package punkt to /Users/seonjin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---
p10

In [12]:
# stemming
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer_porter(text):
    text_tokens = nltk.word_tokenize(text)
    return [porter.stem(word) for word in text_tokens]

text_stems = tokenizer_porter(text_prep)
print(text_stems)

['the', 'sun', 'is', 'shine', 'the', 'weather', 'is', 'sweet', 'and', 'she', 'like', 'run']


---
p11

In [13]:
# stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')

stop = stopwords.words("english")
print(stop)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seonjin/nltk_data...


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# removing stopwords
def remove_stopwords(text):
    return [w for w in text if w not in stop]

text_stems = remove_stopwords(tokenizer_porter(text_prep))
text_stems

['sun', 'shine', 'weather', 'sweet', 'like', 'run']

---
p12

### POS tagging(English)

In [18]:
# POS tagging
from nltk.tag import pos_tag
nltk.download('averaged_perception_tagger') # POS tagger

tagged_text = pos_tag(nltk.word_tokenize(text_prep))
tagged_text

[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - '/Users/seonjin/nltk_data'
    - '/Users/seonjin/anaconda3/nltk_data'
    - '/Users/seonjin/anaconda3/share/nltk_data'
    - '/Users/seonjin/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


---
p13

### Processing documents into tokens(Korean)

In [19]:
# Korean movie reviews
df_kor = pd.read_csv("kor_movie.csv", encoding='utf-8')
df_kor.head(3)

Unnamed: 0,review,sentiment
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0


In [20]:
df_kor.shape

(200000, 2)

In [21]:
# review 1 text 
df_kor.loc[1,'review']

'흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나'

In [22]:
# cleaning review 1 text 
preprocessor(df_kor.loc[1,'review'])

'흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나'

---
p14

In [24]:
# tokenizing - Okt
from konlpy.tag import Okt
okt = Okt()

text = '하늘을 나는 아름다운 꿈을 꾸었습니다!'

# simple split() method is not appropriate
print('<split() method>')
print(text.split())

print('<Okt word tokenizer>')
print(okt.morphs(text))

<split() method>
['하늘을', '나는', '아름다운', '꿈을', '꾸었습니다!']
<Okt word tokenizer>
['하늘', '을', '나', '는', '아름다운', '꿈', '을', '꾸었습니다', '!']


---
p15

In [25]:
# POS tagging (형태소 분석))
tagged_text = okt.pos(text)
tagged_text

[('하늘', 'Noun'),
 ('을', 'Josa'),
 ('나', 'Noun'),
 ('는', 'Josa'),
 ('아름다운', 'Adjective'),
 ('꿈', 'Noun'),
 ('을', 'Josa'),
 ('꾸었습니다', 'Verb'),
 ('!', 'Punctuation')]

---
p16

In [26]:
# tokenizing - Kkma
from konlpy.tag import Kkma
kkma = Kkma()

print('<Kkma word tokenizer>')
print(kkma.morphs(text))

<Kkma word tokenizer>
['하늘', '을', '날', '는', '아름답', 'ㄴ', '꿈', '을', '꾸', '었', '습니다', '!']


In [28]:
# POS tagging (형태소 분석) - Kkma
tagged_text = kkma.pos(text)
tagged_text

[('하늘', 'NNG'),
 ('을', 'JKO'),
 ('날', 'VV'),
 ('는', 'ETD'),
 ('아름답', 'VA'),
 ('ㄴ', 'ETD'),
 ('꿈', 'NNG'),
 ('을', 'JKO'),
 ('꾸', 'VV'),
 ('었', 'EPT'),
 ('습니다', 'EFN'),
 ('!', 'SF')]

---
p17

In [29]:
# Stemming
def tokenizer_porter_kor(text):
    return okt.morphs(text, norm=True, stem=True)

In [31]:
# tokenizing only
okt.morphs(text)

['하늘', '을', '나', '는', '아름다운', '꿈', '을', '꾸었습니다', '!']

In [23]:
# tokenizing + stemming
tokenizer_porter_kor(text)

['하늘', '을', '나', '는', '아름답다', '꿈', '을', '꾸다', '!']

In [33]:
# nouns only
okt.nouns(text)

['하늘', '나', '꿈']

In [34]:
# 띄어쓰기 오류인 경우도 가능
okt.nouns('아버지가방에들어가신다')

['아버지', '가방']

---
p20

# 2. Vectorization: the bag-of-words model

### Transforming documents into term frequency vectors 

In [38]:
# vectorize texts - Document-Term Matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()

docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and she likes RUNNING!'])
bag = count.fit_transform(docs)

In [48]:
# vocabulary
print(count.vocabulary_)

{'the': 8, 'sun': 6, 'is': 1, 'shining': 5, 'weather': 9, 'sweet': 7, 'and': 0, 'she': 4, 'likes': 2, 'running': 3}


In [42]:
# Document-Term Matrix
print(bag.toarray())

[[0 1 0 0 0 1 1 0 1 0]
 [0 1 0 0 0 0 0 1 1 1]
 [1 2 1 1 1 1 1 1 2 1]]


---
p23

### Transforming documents into TF-IDF vectors

In [29]:
np.set_printoptions(precision=2)

In [45]:
# vectorize texts - TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
docs_vector = tfidf.fit_transform(docs)

In [46]:
# vocabulary
print(tfidf.vocabulary_)

{'the': 8, 'sun': 6, 'is': 1, 'shining': 5, 'weather': 9, 'sweet': 7, 'and': 0, 'she': 4, 'likes': 2, 'running': 3}


In [47]:
# TF-IDF Matrix (normalized)
print(docs_vector.toarray())

[[0.         0.43370786 0.         0.         0.         0.55847784
  0.55847784 0.         0.43370786 0.        ]
 [0.         0.43370786 0.         0.         0.         0.
  0.         0.55847784 0.43370786 0.55847784]
 [0.33141999 0.39148397 0.33141999 0.33141999 0.33141999 0.25205345
  0.25205345 0.25205345 0.39148397 0.25205345]]


---
p24

In [49]:
# vectorize texts - TF-IDF Matrix (with preprocessing, stemming, stopwords)
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,   # preprocessing
                        tokenizer=tokenizer_porter,  # stemming
                        stop_words=stop              # removing stopwords
                       )
docs_vector = tfidf.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [51]:
# vocabulary
print(tfidf.vocabulary_)

{'sun': 3, 'shine': 2, 'weather': 5, 'sweet': 4, 'like': 0, 'run': 1}


In [52]:
# TF-IDF Matrix (normalized)
print(docs_vector.toarray())

[[0.         0.         0.70710678 0.70710678 0.         0.        ]
 [0.         0.         0.         0.         0.70710678 0.70710678]
 [0.48148213 0.48148213 0.36617957 0.36617957 0.36617957 0.36617957]]


---
p25

In [54]:
# TFIDF example
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining, the weather is sweet, and she likes RUNNING!'])
n_docs = 3

# Tf-Idf score of "is" in doc 1
tf = 1
df = 3
idf = np.log((n_docs+1)/(df+1))
tfidf = tf * (idf + 1)
print('tf-idf of term "is" = %.2f' % tfidf)

# Tf-Idf score of "sun" in doc 1
tf = 1
df = 2
idf = np.log((n_docs+1)/(df+1))
tfidf = tf * (idf+1)
print('tf-idf of term "sun" = %.2f' % tfidf)

tf-idf of term "is" = 1.00
tf-idf of term "sun" = 1.29


---
p27

# 3. Training a model for document classification

### Load the IMDb movie review data 

In [55]:
import pandas as pd

df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


---
p28

### Preprocessing

In [77]:
# use 1000 texts for training and test
X_train = df.loc[0:999, 'review'].values
y_train = df.loc[0:999, 'sentiment'].values
X_test = df.loc[49000:, 'review'].values
y_test = df.loc[49000:, 'sentiment'].values

X_train.shape

(1000,)

In [78]:
X_train[0]

'In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich famil

---
p29

In [79]:
# vectorize to TF-IDF Matrix 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,
                        # Below two steps are needed, but it takes long time,
                        # so, We'll skip this processes
                        #tokenizer=tokenizer_porter,
                        #stop_words=stop,
                        min_df=0.1,   # ignore terms occured in more than 10% of docs (stop words)
                       )

X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [80]:
# automatic stop words
print(tfidf.stop_words_)



---
p30

In [81]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(1000, 212)

---
p31

In [82]:
# vectorize to TF-IDF Matrix - remove rare terms too
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=preprocessor,
                        # Below two steps are needed, but it takes long time,
                        # so, We'll skip this processes
                        #tokenizer=tokenizer_porter,
                        #stop_words=stop,
                        max_df=0.1,   # ignore terms occured in more than 10% of docs (stop words)
                        min_df=0.1     # ignore terms occured in less than 10 docs
                       )

X_train_vector = tfidf.fit_transform(X_train)
X_test_vector = tfidf.transform(X_test)

In [83]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(1000, 1)

---
p32

In [84]:
# text 0 
print(X_train[0])

In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70's, they discover the criminal and a net of power and money to cover the murder.<br /><br />"Murder in Greenwich" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family 

In [85]:
# text 0 
print(X_train_vector[0])

[0.]


---
p33

### Logistic Regression

In [86]:
# train using Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', verbose=1)
lr.fit(X_train_vector, y_train)

[LibLinear]



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=1, warm_start=False)

In [87]:
# train score
lr.score(X_train_vector, y_train)

0.513

In [88]:
# test score
lr.score(X_test_vector, y_test)

0.508

In [89]:
# sentiment prediction example 
tweets = ["this movie is garbage", 
          "I loved it very much", 
          "what a fantastic film!"]

tweets_tfidf = tfidf.transform(tweets)
lr.predict(tweets_tfidf)

array([1, 1, 1])

---
p34

### Decision Tree

In [90]:
# train using Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=20)
tree.fit(X_train_vector, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [91]:
# train score
lr.score(X_train_vector, y_train)

0.513

In [92]:
# test score
lr.score(X_test_vector, y_test)

0.508

---
p35

In [93]:
# finding most important terms
importances = tree.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(10):
    print("%2d. %-30s %f" % (f+1, 
                             [w for w, n in tfidf.vocabulary_.items() if n == indices[f]],
                             importances[indices[f]]))

 1. ['music']                      1.000000


IndexError: index 1 is out of bounds for axis 0 with size 1

### Gaussian Naive Bayes

In [146]:
# train using Decision Tree
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train_vector, y_train)

GaussianNB(priors=None)

In [147]:
# train score
nb.score(X_train_vector, y_train)

0.95899999999999996

In [148]:
# test score
nb.score(X_test_vector, y_test)

0.73599999999999999

### KNN
- Not recommended

In [94]:
# train using K-NN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_vector, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [95]:
# train score
knn.score(X_train_vector, y_train)

0.513

In [96]:
# test score
knn.score(X_test_vector, y_test)

0.508

### Multi-layer Neural Network

In [97]:
# train using Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(learning_rate_init=0.01, max_iter=100, verbose=1)
mlp.fit(X_train_vector, y_train)

Iteration 1, loss = 0.69318005
Iteration 2, loss = 0.69389990
Iteration 3, loss = 0.69250087
Iteration 4, loss = 0.69246409
Iteration 5, loss = 0.69105734
Iteration 6, loss = 0.69144135
Iteration 7, loss = 0.69176423
Iteration 8, loss = 0.69199268
Iteration 9, loss = 0.69122007
Iteration 10, loss = 0.69154856
Iteration 11, loss = 0.69227361
Iteration 12, loss = 0.69125104
Iteration 13, loss = 0.69207852
Iteration 14, loss = 0.69119775
Iteration 15, loss = 0.69237249
Iteration 16, loss = 0.69154192
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=1, warm_start=False)

In [98]:
# train score
mlp.score(X_train_vector, y_train)

0.513

In [99]:
# test score
mlp.score(X_test_vector, y_test)

0.508

---
p37

# Quiz : Naver movie review classification(Korean)
- Use movie review dataset "kor_movie.csv"
- class : 0, 1 (neg, pos)
- data size : 200,000 - use first 1,000 texts
- use 70% as training set
1. Preprocess text using Okt to make TFIDF vectors - ignore terms occured in more than 10% of texts
2. Build model using Logistic Regression, Decision Tree, and Neural Network. Check the accuracies
3. Find most important 20 terms using DT

### Read dataset. Use first 1000 texts only

In [155]:
import pandas as pd
import numpy as np

# read dataset
df_kor = pd.read_csv("kor_movie.csv", encoding='utf-8')
df_kor.head(3)

Unnamed: 0,review,sentiment
0,아 더빙.. 진짜 짜증나네요 목소리,0
1,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,너무재밓었다그래서보는것을추천한다,0


In [156]:
# use 1000 reviews 
df_sample = df_kor.iloc[:1000]
np.bincount(df_sample.sentiment)

array([508, 492], dtype=int64)

### Get training and test set

In [157]:
# get X and y
X = df_sample["review"].values
y = df_sample['sentiment'].values

# number of data
X.shape

(1000,)

In [158]:
# Split Dataset into 70% train and 30% test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=1)
X_train.shape

(700,)

### Vectorize using Okt

In [159]:
# function for tokenizing + stemming using Okt
from konlpy.tag import Okt
okt = None

def tokenizer_kor(text):
    return None 

In [160]:
# vectorize to TF-IDF Matrix
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = None

X_train_vector = None
X_test_vector = None

In [161]:
# check automatic stop words
print(None)

{'하다', '에', '이', '은', '들', '가', '보다', '없다', '도', '을', '영화', '의'}


In [162]:
# data dimension
X_train_vector = X_train_vector.toarray()
X_test_vector = X_test_vector.toarray()
X_train_vector.shape

(700, 2654)

In [163]:
# text 0 
print(X_train[0])

진짜 잘 만든 수작


In [164]:
# vector of text 0 
print(X_train_vector[0])

[ 0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.

### Logistic Regression

In [165]:
from sklearn.linear_model import LogisticRegression
lr = None
None

[LibLinear]

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=1, warm_start=False)

In [166]:
lr.score(X_train_vector, y_train)

0.97285714285714286

In [167]:
lr.score(X_test_vector, y_test)

0.75666666666666671

### Decision Tree

In [168]:
# train using Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = None
None

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=30,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [169]:
tree.score(X_train_vector, y_train)

0.83857142857142852

In [170]:
tree.score(X_test_vector, y_test)

0.68000000000000005

In [171]:
# finding 20 most important terms
importances = None
indices = None

for f in range(20):
    print("%2d. %-30s %f" % None)

 1. ['재밌다']                        0.100997
 2. ['최고']                         0.074553
 3. ['적']                          0.056057
 4. ['있다']                         0.046053
 5. ['재미있다']                       0.040013
 6. ['재미없다']                       0.037542
 7. ['그']                          0.034809
 8. ['말']                          0.031189
 9. ['못']                          0.027528
10. ['굿']                          0.026359
11. ['내용']                         0.025618
12. ['좋다']                         0.025488
13. ['ㅎㅎ']                         0.021447
14. ['괜찮다']                        0.020102
15. ['명작']                         0.019655
16. ['와']                          0.019223
17. ['보고']                         0.018771
18. ['대단하다']                       0.018297
19. ['마지막']                        0.017175
20. ['인상']                         0.017158


### Multi-layer Nueral Network

In [172]:
# train using Neural Network
from sklearn.neural_network import MLPClassifier
mlp = None
None

Iteration 1, loss = 0.68455276
Iteration 2, loss = 0.52977602
Iteration 3, loss = 0.34821027
Iteration 4, loss = 0.19276600
Iteration 5, loss = 0.10139269
Iteration 6, loss = 0.05531005
Iteration 7, loss = 0.03423999
Iteration 8, loss = 0.02366867
Iteration 9, loss = 0.01726586
Iteration 10, loss = 0.01359902
Iteration 11, loss = 0.01101815
Iteration 12, loss = 0.00959464
Iteration 13, loss = 0.00815791
Iteration 14, loss = 0.00717363
Iteration 15, loss = 0.00660868
Iteration 16, loss = 0.00608317
Iteration 17, loss = 0.00565105
Iteration 18, loss = 0.00514797
Iteration 19, loss = 0.00494041
Iteration 20, loss = 0.00468230
Iteration 21, loss = 0.00451519
Iteration 22, loss = 0.00440235
Iteration 23, loss = 0.00426854
Iteration 24, loss = 0.00420126
Iteration 25, loss = 0.00409780
Iteration 26, loss = 0.00400663
Iteration 27, loss = 0.00395034
Iteration 28, loss = 0.00386498
Training loss did not improve more than tol=0.000100 for two consecutive epochs. Stopping.


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=1, warm_start=False)

In [173]:
mlp.score(X_train_vector, y_train)

0.99857142857142855

In [174]:
mlp.score(X_test_vector, y_test)

0.72666666666666668