# Random Forest

# ==== input 형태 만들기 ====

In [2]:
import pandas as pd
import numpy as np

data = pd.read_json("./labeled_data.json")
data.columns = ["label", "trigram"]

In [3]:
# data flatten 해줌 3 x 50 = 150 feature
data['trigram'] = data['trigram'].apply(lambda x: (np.array(x).reshape(-1)))

# 훈련데이터 테스트데이터 분리
from sklearn.model_selection import train_test_split

y = data.pop('label')
X = data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train = np.array(X_train['trigram'].tolist())
X_test = np.array(X_test['trigram'].tolist())

# ==== 모델링 ====

In [5]:
from sklearn.ensemble import RandomForestClassifier

# 기본모델로 돌려보기
rf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=1)

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=1, verbose=0,
                       warm_start=False)

In [6]:
# score 출력 역시 좋지않다!
print(rf.score(X_train, y_train), rf.score(X_test, y_test))

0.7163309131813069 0.7174898592221427


In [7]:
print(rf.get_params())

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}


In [9]:
# grid search를 위한 hyper parameter 범위 설정

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
 'max_depth': [10, 19, 28, 37, 46, 55, 64, 73, 82, 91, 100, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# scaler한다음 rf grid search
pipe_rf = Pipeline([('scl', StandardScaler()), ('rf', RandomForestClassifier(random_state=1))])

param_grid = {'rf__n_estimators': n_estimators,
               'rf__max_depth': max_depth,
               'rf__min_samples_split': min_samples_split,
               'rf__min_samples_leaf': min_samples_leaf}
#                'rf__bootstrap': bootstrap}

# 욕설아닌 것들을 잘 예측을 못하기 때문에 f1 score기준으로 search하였다.
gs = GridSearchCV(estimator=pipe_rf, param_grid=param_grid,
                  scoring='f1', cv=5, n_jobs=-1)
%time gs = gs.fit(X_train, y_train)

CPU times: user 18.4 s, sys: 1.64 s, total: 20.1 s
Wall time: 56min 20s


In [13]:
# grid search를 통해 나온 best score, parameters 보기
print(gs.best_score_)
print(gs.best_params_)

0.9041595093718033
{'rf__max_depth': 28, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 80}


In [20]:
# 베스트 파라미터 적용시켜서 다시하기
clf = RandomForestClassifier(max_depth=28, min_samples_leaf=1, min_samples_split=2, n_estimators=80)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=28, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix 
# print prediction results 
predictions = clf.predict(X_test) 
print(clf.score(X_train, y_train), clf.score(X_test, y_test))
print(classification_report(y_test, predictions)) 

0.9988751406074241 0.8618468146027202
              precision    recall  f1-score   support

           0       0.83      0.65      0.73      1205
           1       0.87      0.95      0.91      2986

    accuracy                           0.86      4191
   macro avg       0.85      0.80      0.82      4191
weighted avg       0.86      0.86      0.86      4191



In [22]:
tt = zip(list(range(len(clf.feature_importances_))), list(clf.feature_importances_))
sorted(tt, key=lambda x: x[1], reverse=True)[:20] # 50 부터 100까지가 욕설의 형태를 띈 단어인데 그주변이 대부분이다.

[(74, 0.04833944969431489),
 (81, 0.024481502486178877),
 (73, 0.02281956474931012),
 (78, 0.022678854591804846),
 (61, 0.020070450843119064),
 (79, 0.01811390039696703),
 (82, 0.017959499259192598),
 (67, 0.01770153025298927),
 (85, 0.01736047291538577),
 (96, 0.01606220945314773),
 (98, 0.01531237403515211),
 (124, 0.014949087173775319),
 (71, 0.014630017385657945),
 (72, 0.013871216070973913),
 (93, 0.012456008243748231),
 (75, 0.012148444677806035),
 (83, 0.011277990090811298),
 (50, 0.010752187493707616),
 (62, 0.010748594436991686),
 (86, 0.009882246027366728)]

In [26]:
# save the model to disk
import pickle
filename = 'rf_model'
pickle.dump(clf, open(filename, 'wb'))

In [27]:
# load the model from disk
from sklearn.externals import joblib
loaded_model = joblib.load(filename)
result = loaded_model.predict(X_train[0:1])
print(result)
clf.predict_proba(X_train[0:2])

[1]


array([[0.05  , 0.95  ],
       [0.1625, 0.8375]])