### Random Forest Model Implementation

#### 1. Import Module and Data

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Settings for the train/validation split.
RANDOM_SEED = 42  # seed for the train/validation split
TEST_SIZE = 0.2   # fraction of the samples held out for validation

# Input/output directories and the name of the cleaned training file.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

In [None]:
# Load the cleaned training CSV from the input directory.
train_data = pd.read_csv(os.path.join(DATA_IN_PATH, TRAIN_CLEAN_DATA))

In [None]:
# Pull the review texts and their sentiment labels out of the training frame.
# NOTE(review): the test CSV is read with the column name 'review' (singular)
# further down in this notebook. Accept either spelling here so this cell does
# not raise a KeyError if the cleaned training file uses that same schema.
# Confirm the actual column name in train_clean.csv.
review_column = 'reviews' if 'reviews' in train_data.columns else 'review'
reviews = list(train_data[review_column])
sentiments = np.array(train_data['sentiment'])

#### 2. Vectorizing with CountVectorizer

In [None]:
# Bag-of-words encoder: features are built from words (analyzer='word') and
# the vocabulary is capped at 5000 entries, which bounds the length of each
# review vector.
vectorizer = CountVectorizer(max_features=5000, analyzer='word')

# fit_transform() assigns an index to every word that appears in the reviews
# and converts each review into its vector of word counts in one step.
train_data_features = vectorizer.fit_transform(reviews)

# Show the resulting feature matrix.
print(train_data_features)

#### 3. Split Train / Validation Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out TEST_SIZE of the vectorized reviews (and their labels) for
# evaluation; the fixed seed keeps the split identical from run to run.
train_input, eval_input, train_label, eval_label = train_test_split(
    train_data_features,
    sentiments,
    random_state=RANDOM_SEED,
    test_size=TEST_SIZE,
)

#### 4. Model Implementation and Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest made of 100 decision trees. Seeding it with RANDOM_SEED (the
# same seed used for the train/validation split) makes the fitted model, and
# therefore the reported accuracy, reproducible across runs.
forest = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

# Train on the bag-of-words vectors and their sentiment labels.
forest.fit(train_input, train_label)

In [None]:
# The training reviews are now vectors of (at most) 5000 dimensions, as
# configured through CountVectorizer's max_features.
print(train_input.shape)

In [None]:
# score() reports the mean accuracy of the forest on the held-out split.
accuracy = forest.score(eval_input, eval_label)
print('Accuracy : %f' % accuracy)

#### 5. Create Submission Data

In [None]:
# Name of the cleaned test CSV inside DATA_IN_PATH.
TEST_CLEAN_DATA = 'test_clean.csv'
test_data = pd.read_csv(os.path.join(DATA_IN_PATH, TEST_CLEAN_DATA))

# Keep the ids alongside the review texts so that predictions can be matched
# back to their rows when the output file is written.
ids = list(test_data['id'])
test_reviews = list(test_data['review'])

In [None]:
# Encode the test reviews with the vectorizer that was already fitted on the
# training reviews (transform only, no refit), so the feature columns line up
# with the ones the model was trained on.
test_data_features = vectorizer.transform(test_reviews)

In [None]:
# Make sure the output directory exists. exist_ok=True replaces the separate
# os.path.exists() check, which could fail if the directory were created
# between the check and the makedirs() call.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# Predict a sentiment label for every test review with the trained forest.
result = forest.predict(test_data_features)

# Write the id/sentiment pairs to the output CSV.
# quoting=3 is csv.QUOTE_NONE: fields are written without quote characters.
output = pd.DataFrame(data={'id': ids, 'sentiment': result})
output.to_csv(
    os.path.join(DATA_OUT_PATH, 'Bag_of_Words_model_RF.csv'),
    index=False,
    quoting=3,
)