# NLP Basics: Building A Basic Random Forest Model On Top Of Vectorized Text

### Read In & Clean Text

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read in, clean, and vectorize data
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words('english')

messages = pd.read_csv('/content/drive/My Drive/Portfolio/dataset/data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(messages['text'])

X_features = pd.DataFrame(X_tfidf.toarray())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Explore RandomForestClassifier Attributes & Hyperparameters

In [None]:
# Import Random Forest for classification from sklearn
from sklearn.ensemble import RandomForestClassifier

In [None]:
# View the arguments (and default values) for RandomForestClassifier
print(RandomForestClassifier())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


### Explore RandomForestClassifier using different optimization libraries

In [None]:
# Import the methods that will be needed to evaluate a basic model
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features,
                                                    messages['label'],
                                                    test_size=0.2)

In [None]:
# Fit a basic Random Forest model
rc = RandomForestClassifier(n_jobs=-1)
rc.fit(X_train, y_train)
y_pred = rc.predict(X_test)

In [None]:
# Evalute model predictions using precision and recall
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy : {} / Precision: {} / Recall: {}'.format(round(accuracy, 3), round(precision, 3), round(recall, 3)))

Accuracy : 0.973 / Precision: 1.0 / Recall: 0.808


# GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    "n_estimators" : [100, 150, 200],
    "max_depth" : [None, 2,3,4],
    "criterion" : ["gini", "entropy"]
    }

model = GridSearchCV(
    estimator=rc,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 10,
    n_jobs = 1,
    cv=5
)

model.fit(X_features, messages['label'])

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] criterion=gini, max_depth=None, n_estimators=100 ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_depth=None, n_estimators=100, score=0.960, total=  27.6s
[CV] criterion=gini, max_depth=None, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   27.6s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=100, score=0.971, total=  27.1s
[CV] criterion=gini, max_depth=None, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   54.7s remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=100, score=0.975, total=  27.8s
[CV] criterion=gini, max_depth=None, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.4min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=100, score=0.971, total=  26.2s
[CV] criterion=gini, max_depth=None, n_estimators=100 ................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.8min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=100, score=0.971, total=  26.5s
[CV] criterion=gini, max_depth=None, n_estimators=150 ................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.3min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=150, score=0.964, total=  39.8s
[CV] criterion=gini, max_depth=None, n_estimators=150 ................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.9min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=150, score=0.966, total=  41.1s
[CV] criterion=gini, max_depth=None, n_estimators=150 ................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.6min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=150, score=0.974, total=  41.7s
[CV] criterion=gini, max_depth=None, n_estimators=150 ................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.3min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=150, score=0.970, total=  39.5s
[CV] criterion=gini, max_depth=None, n_estimators=150 ................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.0min remaining:    0.0s


[CV]  criterion=gini, max_depth=None, n_estimators=150, score=0.972, total=  41.6s
[CV] criterion=gini, max_depth=None, n_estimators=200 ................
[CV]  criterion=gini, max_depth=None, n_estimators=200, score=0.962, total=  54.2s
[CV] criterion=gini, max_depth=None, n_estimators=200 ................
[CV]  criterion=gini, max_depth=None, n_estimators=200, score=0.970, total=  55.4s
[CV] criterion=gini, max_depth=None, n_estimators=200 ................
[CV]  criterion=gini, max_depth=None, n_estimators=200, score=0.973, total=  55.9s
[CV] criterion=gini, max_depth=None, n_estimators=200 ................
[CV]  criterion=gini, max_depth=None, n_estimators=200, score=0.970, total=  53.7s
[CV] criterion=gini, max_depth=None, n_estimators=200 ................
[CV]  criterion=gini, max_depth=None, n_estimators=200, score=0.975, total= 1.0min
[CV] criterion=gini, max_depth=2, n_estimators=100 ...................
[CV]  criterion=gini, max_depth=2, n_estimators=100, score=0.868, total=   1

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 25.6min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False,
                                              random_

In [None]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.9699367869024073
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


# Randomize search 

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
param_grid = {
    "n_estimators" : np.arange(100, 1500, 100),
    "max_depth" : np.arange(1, 20),
    "criterion" : ["gini", "entropy"]
    }

model = RandomizedSearchCV(
    estimator=rc,
    param_distributions = param_grid,
    scoring = "accuracy",
    n_iter=15,
    verbose = 10,
    n_jobs = 1,
    cv=5
)

model.fit(X_features, messages['label'])

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] n_estimators=1200, max_depth=6, criterion=entropy ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1200, max_depth=6, criterion=entropy, score=0.871, total=  35.1s
[CV] n_estimators=1200, max_depth=6, criterion=entropy ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   35.1s remaining:    0.0s


[CV]  n_estimators=1200, max_depth=6, criterion=entropy, score=0.870, total=  35.1s
[CV] n_estimators=1200, max_depth=6, criterion=entropy ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.2min remaining:    0.0s


[CV]  n_estimators=1200, max_depth=6, criterion=entropy, score=0.872, total=  35.1s
[CV] n_estimators=1200, max_depth=6, criterion=entropy ...............


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.8min remaining:    0.0s


[CV]  n_estimators=1200, max_depth=6, criterion=entropy, score=0.871, total=  37.6s
[CV] n_estimators=1200, max_depth=6, criterion=entropy ...............


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.4min remaining:    0.0s


[CV]  n_estimators=1200, max_depth=6, criterion=entropy, score=0.872, total=  35.6s
[CV] n_estimators=900, max_depth=8, criterion=entropy ................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min remaining:    0.0s


[CV]  n_estimators=900, max_depth=8, criterion=entropy, score=0.878, total=  38.6s
[CV] n_estimators=900, max_depth=8, criterion=entropy ................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  3.6min remaining:    0.0s


[CV]  n_estimators=900, max_depth=8, criterion=entropy, score=0.883, total=  37.1s
[CV] n_estimators=900, max_depth=8, criterion=entropy ................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.2min remaining:    0.0s


[CV]  n_estimators=900, max_depth=8, criterion=entropy, score=0.880, total=  36.9s
[CV] n_estimators=900, max_depth=8, criterion=entropy ................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.9min remaining:    0.0s


[CV]  n_estimators=900, max_depth=8, criterion=entropy, score=0.875, total=  38.0s
[CV] n_estimators=900, max_depth=8, criterion=entropy ................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.5min remaining:    0.0s


[CV]  n_estimators=900, max_depth=8, criterion=entropy, score=0.877, total=  36.3s
[CV] n_estimators=1200, max_depth=2, criterion=entropy ...............
[CV]  n_estimators=1200, max_depth=2, criterion=entropy, score=0.868, total=  14.3s
[CV] n_estimators=1200, max_depth=2, criterion=entropy ...............
[CV]  n_estimators=1200, max_depth=2, criterion=entropy, score=0.867, total=  14.2s
[CV] n_estimators=1200, max_depth=2, criterion=entropy ...............
[CV]  n_estimators=1200, max_depth=2, criterion=entropy, score=0.868, total=  14.3s
[CV] n_estimators=1200, max_depth=2, criterion=entropy ...............
[CV]  n_estimators=1200, max_depth=2, criterion=entropy, score=0.868, total=  14.3s
[CV] n_estimators=1200, max_depth=2, criterion=entropy ...............
[CV]  n_estimators=1200, max_depth=2, criterion=entropy, score=0.868, total=  14.2s
[CV] n_estimators=1000, max_depth=13, criterion=gini .................
[CV]  n_estimators=1000, max_depth=13, criterion=gini, score=0.905, tot

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 50.9min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [None]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.9302242655755361
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 18, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 400, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


# Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

This is an example of pipeline. 

In [None]:
scaler = StandardScaler()
pca = PCA()
rf = RandomForestClassifier(n_jobs=-1)

classifier = Pipeline([("scaler", scaler), ("pca", pca), ("rf", rf)])


param_grid = {
    "pca__n_components" : np.arange(5, 10),
    "rf__n_estimators" : np.arange(100, 1500, 100),      # notice the parameters
    "rf__max_depth" : np.arange(1, 20),
    "rf__criterion" : ["gini", "entropy"]
    }

model = RandomizedSearchCV(
    estimator=classifier,
    param_distributions = param_grid,
    scoring = "accuracy",
    n_iter=15,
    verbose = 10,
    n_jobs = 1,
    cv=5
)

model.fit(X_features, messages['label'])

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8, score=0.944, total=  10.0s
[CV] rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.0s remaining:    0.0s


[CV]  rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8, score=0.953, total=   9.7s
[CV] rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.7s remaining:    0.0s


[CV]  rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8, score=0.961, total=   9.8s
[CV] rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   29.6s remaining:    0.0s


[CV]  rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8, score=0.945, total=   9.6s
[CV] rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   39.1s remaining:    0.0s


[CV]  rf__n_estimators=1400, rf__max_depth=6, rf__criterion=gini, pca__n_components=8, score=0.955, total=   9.5s
[CV] rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   48.6s remaining:    0.0s


[CV]  rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6, score=0.933, total=  10.9s
[CV] rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   59.5s remaining:    0.0s


[CV]  rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6, score=0.947, total=  10.4s
[CV] rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.2min remaining:    0.0s


[CV]  rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6, score=0.937, total=  10.8s
[CV] rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.3min remaining:    0.0s


[CV]  rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6, score=0.951, total=  10.9s
[CV] rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.5min remaining:    0.0s


[CV]  rf__n_estimators=1200, rf__max_depth=13, rf__criterion=gini, pca__n_components=6, score=0.956, total=  10.7s
[CV] rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6 
[CV]  rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6, score=0.941, total=   4.5s
[CV] rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6 
[CV]  rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6, score=0.943, total=   4.5s
[CV] rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6 
[CV]  rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6, score=0.958, total=   4.4s
[CV] rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6 
[CV]  rf__n_estimators=200, rf__max_depth=10, rf__criterion=entropy, pca__n_components=6, score=0.943, total=   4.5s
[CV] rf__n_estimators=200, rf__max_depth=10, rf__criterion=en

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed: 10.9min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('rf',
                                              RandomForestClassifier(bootstrap=True,
                  

In [None]:
print(model.best_score_)
print(model.best_estimator_.get_params())

0.9542346735919232
{'memory': None, 'steps': [('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)), ('rf', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=800,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False))], 'verbose': False, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'pca': PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='auto', tol=0.0, w

# SKOPT

In [None]:
from sklearn.model_selection import StratifiedKFold
from functools import partial
from skopt import space
from skopt import gp_minimize

In [None]:
def optimize(params, param_names, x, y):

  params = dict(zip(param_names, params))
  model = RandomForestClassifier(**params)
  Kfold = StratifiedKFold(n_splits =5)

  accuracy = []

  for idx in Kfold.split(x, y):
    xtrain = x[idx[0]] 
    ytrain = y[idx[0]]

    xtest = x[idx[1]]
    ytest = y[idx[1]]

    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    accuracy.append(acc)

  return np.mean(accuracy)


In [87]:
param_space = [
    space.Integer(3, 15, name="max_depth"),
    space.Integer(100, 600, name="n_estimators"),
    space.Categorical(["gini", "entropy"], name="criterion"),
    space.Real(0.01, 1 , prior="uniform", name="max_features")
]
param_names=[i.name for i in param_space]

optimizer_function = partial(
    optimize,
    param_names=param_names,
    x=X_tfidf, y=messages['label']
)
result = gp_minimize(
    optimizer_function,
    dimensions= param_space,
    n_calls=15,
    n_random_starts=10,
    verbose=10
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 120.8676
Function value obtained: 0.9659
Current minimum: 0.9659
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 68.0298
Function value obtained: 0.9630
Current minimum: 0.9630
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 283.3437
Function value obtained: 0.9628
Current minimum: 0.9628
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 62.2915
Function value obtained: 0.9630
Current minimum: 0.9628
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 87.9418
Function value obtained: 0.9539
Current minimum: 0.9539
Iteration No: 6 started. Eva

In [88]:
print(dict(zip(param_names, result.x)))

{'max_depth': 3, 'n_estimators': 600, 'criterion': 'gini', 'max_features': 0.01}


#Hyper opt

In [107]:
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope

In [108]:
def optimize(params, x, y):

  model = RandomForestClassifier(**params)
  Kfold = StratifiedKFold(n_splits =5)

  accuracy = []

  for idx in Kfold.split(x, y):
    xtrain = x[idx[0]] 
    ytrain = y[idx[0]]

    xtest = x[idx[1]]
    ytest = y[idx[1]]

    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    accuracy.append(acc)

  return np.mean(accuracy)

In [110]:
param_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 15, 1)),
    "n_estimators": scope.int(hp.quniform("n_estimators", 100, 600, 1)),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_features": hp.uniform("max_features", 0.01, 1)
}

optimizer_function = partial(
    optimize,
    x=X_tfidf, y=messages['label']
)
trails = Trials()

result = fmin(
    fn=optimizer_function,
    space= param_space,
    algo=tpe.suggest,
    max_evals=15,
    trials=trails
)
print(result)

100%|██████████| 15/15 [28:12<00:00, 112.85s/it, best loss: 0.928032943942163]
{'criterion': 0, 'max_depth': 3.0, 'max_features': 0.3277656197847996, 'n_estimators': 383.0}


# Optuna

In [114]:
import optuna

In [121]:
def optimize(trail, x, y):
  
  criterion = trail.suggest_categorical("criterion", ["gini", "entropy"])
  n_estimators = trail.suggest_int("n_estimators", 100, 1500)
  max_depth = trail.suggest_int("max_depth", 3, 15)
  max_features = trail.suggest_uniform("max_features", 0.01, 1.0)

  
  model = RandomForestClassifier(
      n_estimators=n_estimators, max_depth=max_depth,
      max_features=max_features, criterion=criterion
  )
  Kfold = StratifiedKFold(n_splits =5)

  accuracy = []

  for idx in Kfold.split(x, y):
    xtrain = x[idx[0]] 
    ytrain = y[idx[0]]

    xtest = x[idx[1]]
    ytest = y[idx[1]]

    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    acc = accuracy_score(ytest, ypred)
    accuracy.append(acc)

  return np.mean(accuracy)

In [123]:
optimizer_function = partial(
    optimize,
    x=X_tfidf, y=messages['label']
)

study = optuna.create_study(direction="minimize")
study.optimize(
    optimizer_function, n_trials=15
)


[I 2020-09-27 13:29:23,146] A new study created in memory with name: no-name-f9929c28-fc7a-4fc7-aaf4-fc1d88adc749
[I 2020-09-27 13:38:35,476] Trial 0 finished with value: 0.9635673169042999 and parameters: {'criterion': 'entropy', 'n_estimators': 1154, 'max_depth': 10, 'max_features': 0.8482278754692272}. Best is trial 0 with value: 0.9635673169042999.
[I 2020-09-27 13:49:25,764] Trial 1 finished with value: 0.967335904227484 and parameters: {'criterion': 'entropy', 'n_estimators': 1309, 'max_depth': 14, 'max_features': 0.5589951041763699}. Best is trial 0 with value: 0.9635673169042999.
[I 2020-09-27 13:59:21,088] Trial 2 finished with value: 0.967873859803077 and parameters: {'criterion': 'gini', 'n_estimators': 1486, 'max_depth': 14, 'max_features': 0.40803176454895196}. Best is trial 0 with value: 0.9635673169042999.
[I 2020-09-27 14:05:46,206] Trial 3 finished with value: 0.9664383991755964 and parameters: {'criterion': 'entropy', 'n_estimators': 984, 'max_depth': 13, 'max_feature