# Tutorial Sentiment Analysis
Pengajar :
1. Aris Budi Santoso
2. Prabu Kresna Putra

Tutorial ini merupakan panduan dalam melakukan sentiment analysis.

Data yang digunakan dalam sesi praktikum ini adalah data berlabel berformat CSV yang telah digunakan dalam penelitian Simanungkalit, Tiarma untuk Tugas Mata Kuliah Analitika Media Sosial dan Digital bertema Analisis Sentimen atas Kebijakan PPKM pada Program MTI Universitas Indonesia.

## 1. Persiapan

Install dan import library yang dibutuhkan

### Install Library

In [1]:
# !pip install nlp-id

In [2]:
# !pip install emoji

### Import Library

In [3]:
# # Library untuk akses Google Drive dari Colabs
# from google.colab import drive
# drive.mount("/content/gdrive")

In [None]:
# Library NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
#nltk.download('stopwords')
from nlp_id.stopword import StopWord
from nlp_id.postag import PosTag
from nlp_id.lemmatizer import Lemmatizer 

# Library Preprocessing
import pandas as pd
import re
import csv
import random
import string
import emoji
import operator
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Library untuk Melatih dan Evaluasi Model
from sklearn.model_selection import train_test_split
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from nltk.classify import ClassifierI
from statistics import mode
import pickle

[nltk_data] Downloading package punkt to C:\Users\Ade
[nltk_data]     Satya\AppData\Roaming\nltk_data...


In [None]:
#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Data Preprocessing

### Load Data

In [None]:
import csv
# URL of the sentiment dataset CSV (raw file served from GitHub).
link_file = 'https://raw.githubusercontent.com/audit-ti/pjj-pengolahan-data-python/main/05.%20Basic%20Machine%20Learning/use%20case%20ML/dataset-sentimen.csv'

In [None]:
# `link_file` is an HTTP(S) URL, and the built-in open() only accepts local
# paths, so a remote source is downloaded first. A local path is still
# supported through the else branch.
import io
import urllib.request

if str(link_file).lower().startswith(('http://', 'https://')):
    with urllib.request.urlopen(link_file) as response:
        # Decode the whole payload so csv.reader receives text, not bytes.
        csv_source = io.StringIO(response.read().decode('utf8'), newline='')
else:
    # newline='' is the mode the csv module documents for file objects.
    csv_source = open(link_file, 'r', encoding='utf8', newline='')

with csv_source as nodecsv:
    csvreader = csv.reader(nodecsv)
    # Materialise every row as a list and drop the header row.
    datacsv = list(csvreader)[1:]

In [None]:
# Preview the first 10 parsed rows.
print(datacsv[:10])

### Tokenisasi

#### Regex untuk tokenisasi

In [None]:
import re

In [None]:
# Emoticon pattern. It is compiled with re.VERBOSE, so the whitespace and the
# `#` comments inside the pattern are ignored by the regex engine.
# The mouth class now contains both `]` and `[`; the previous version listed
# `\]` twice, so emoticons such as ":[" were never recognised.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried in this order, so the more specific token types must
# come before the generic "other words" / "anything else" fallbacks.
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r'(?:&[\w_]+)',  # tokens starting with '&' (e.g. HTML entities)
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    # URLs. The class used to read `[$-_@.&amp;+]`, an HTML-escaped `&`;
    # the extra characters were already covered by other alternatives.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)',  # anything else
]

# Single capturing group around the alternation: findall() returns plain strings.
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
# Anchored variant that matches a string consisting of one emoticon only.
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

In [None]:
def tokenize(s):
    """Split the string `s` into tokens.

    Returns the list produced by `tokens_re.findall(s)`, i.e. every
    non-overlapping match of the module-level tokenizer pattern, in order.
    """
    return tokens_re.findall(s)

In [None]:
# Tokenize the text column (index 4) of every CSV row.
tokens = [tokenize(row[4]) for row in datacsv]

# Preview the token lists of the first five rows.
print(tokens[:5])

#### Stop Word Removal

In [None]:
# nlp_id stop-word provider; its word list is read in the next cell.
stopword = StopWord() 

In [None]:
# Every character of string.punctuation as a separate list item.
punctuation = list(string.punctuation)
# Terms to discard: the nlp_id stop words, punctuation, and a few extra
# tokens/symbols ('rt', 'via', ellipsis, bullet, opening curly quote).
stop = stopword.get_stopword() + punctuation + ['rt', 'via', '…','•','“']

In [None]:
# Stop-word removal helper.
def cleanTweet(token,regex):
    """Filter a token list and convert emoji to their text names.

    Args:
        token: iterable of token strings for one document.
        regex: compiled pattern; a token is dropped when `regex.match(token)`
            succeeds (match is anchored at the start of the token only).

    Returns:
        A list with `emoji.demojize(term)` for every token whose lower-cased
        form is not in the module-level `stop` collection and that the
        pattern does not match.
    """
    # Build the lookup set once per call: testing membership against the
    # `stop` list directly would rescan the whole list for every token.
    stop_terms = set(stop)
    return [
        emoji.demojize(term)
        for term in token
        if term.lower() not in stop_terms and not regex.match(term)
    ]

In [None]:
# Token types that must be removed from the cleaned text. The pattern is
# applied with .match(), so it is anchored at the start of each token.
exclude_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r'(?:&[\w_]+)',  # tokens starting with '&' (e.g. HTML entities)
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    # URLs. The class used to read `[$-_@.&amp;+]`, an HTML-escaped `&`;
    # the extra characters were already covered by other alternatives.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r'(\b[a-zA-Z0-9]{2,3}\b)',  # short alphanumeric tokens (2-3 characters)
]

exclude_re = re.compile(r'(' + '|'.join(exclude_str) + ')', re.VERBOSE | re.IGNORECASE)

In [None]:
# Count word frequencies over the cleaned, lower-cased tokens.
from collections import Counter

all_word = []
count_all = Counter()
for token in tokens:
    terms_all = [term.lower() for term in cleanTweet(token, exclude_re)]
    count_all.update(terms_all)
    # Keep a flat list of every retained term, in document order.
    all_word.extend(terms_all)
print(count_all.most_common(100))

In [None]:
# Number of terms kept after cleaning (shown as the cell output).
len(all_word)

#### Lemmatisasi (Mengambil kata dasar)

In [None]:
# nlp_id lemmatizer instance reused by the cells below.
lemmatizer = Lemmatizer()

In [None]:
# Lemmatize every retained term, preserving order.
all_stmword = list(map(lemmatizer.lemmatize, all_word))

In [None]:
# Preview the first 100 lemmatized terms.
print(all_stmword[:100])

In [None]:
# Lemmatize a single sample word to inspect the lemmatizer's output.
print(lemmatizer.lemmatize('menyejukan'))

In [None]:
# Number of lemmatized terms (shown as the cell output).
len(all_stmword)

#### POS Tagging

In [None]:
# nlp_id part-of-speech tagger instance reused by the cells below.
postagger = PosTag() 

In [None]:
# Tag every lemmatized term and keep the first tagged item of each result.
# The tagger is called once per term; the original expression called it
# twice (once for the length check and once for the value).
pos = []
for w in all_stmword:
    tagged = postagger.get_pos_tag(w)
    if len(tagged) > 0:
        pos.append(tagged[0])

In [None]:
# Preview the first 10 tagged items.
print(pos[:10])

In [None]:
# POS tags whose words are kept as candidate features.
allowed_word_types = ["ADV","JJ","ADJP","VB"]

# Each item of `pos` is indexed as item[0] = word, item[1] = tag.
allowed_words = []
for tagged in pos:
    if tagged[1] in allowed_word_types:
        allowed_words.append(tagged[0])

In [None]:
# Preview the first 100 words that passed the POS filter.
print(allowed_words[:100])

In [None]:
# Number of words that passed the POS filter (shown as the cell output).
len(allowed_words)

### Bag of Words

#### Frekuensi Kemunculan Kata

In [None]:
# Frequency distribution of the POS-filtered words.
allwords = nltk.FreqDist(allowed_words)

# The 1000 most frequent (word, count) pairs, most frequent first.
tuple_features = allwords.most_common(1000)
word_features = [word for word, _count in tuple_features]
# Inspect the first 100 feature words.
print(word_features[:100])

In [None]:
# Display the selected (word, count) pairs as the cell output.
tuple_features

#### Membentuk Featureset

In [None]:
# Build the list of (text, label) pairs: column 4 is the text and column 1
# is the label of each CSV row.
document = [(row[4], row[1]) for row in datacsv]

In [None]:
# Bag-of-words feature extractor.
def find_features(document):
    """Build the boolean bag-of-words feature dict for one document.

    Args:
        document: raw text of a single document.

    Returns:
        A dict mapping every word in the module-level `word_features` to
        True when that word occurs in the cleaned, lemmatized document and
        False otherwise.
    """
    words = cleanTweet(tokenize(document), exclude_re)
    # `word_features` was derived from lower-cased tokens, so lower-case here
    # as well; otherwise capitalised words could never match a feature.
    # A set makes each of the per-feature membership tests constant time.
    present = {lemmatizer.lemmatize(w.lower()) for w in words}
    return {w: (w in present) for w in word_features}

In [None]:
# NOTE(review): 'punkt' is already downloaded in the import cell, so this
# repeated call looks redundant — confirm before removing.
nltk.download('punkt')
# Pair the feature dict of every document with its label.
featuresets = [(find_features(rev), category) for (rev, category) in document]

In [None]:
# Inspect one (features, label) pair.
print(featuresets[3])

### Distribusi Dataset

In [None]:
# Collect the label of every document into a one-column DataFrame.
lsclass = [label for _text, label in document]
df_alltrain = pd.DataFrame(lsclass, columns=['class'])

In [None]:
# Count the rows per class once and reuse the result for the plot and the
# printed summary.
class_counts = df_alltrain['class'].value_counts()

# Take labels and heights from the same Series. Previously the labels came
# from unique() (order of first appearance) and the heights from
# value_counts() (descending count), so bars could carry the wrong label.
# x/y are passed by keyword because newer seaborn releases no longer accept
# them positionally.
sns.barplot(x=class_counts.index, y=class_counts.values)
plt.title('Class Label Distribution')
plt.xlabel('Class Label')
plt.ylabel('Count')
plt.show()

# .get(..., 0) reports zero instead of raising KeyError when a class is absent.
print('Jumlah baris kelas positif: ',class_counts.get('positive', 0))
print('Jumlah baris kelas negatif: ',class_counts.get('negative', 0))

### Membagi Data Training dan Testing

In [None]:
# Build the training and testing sets: shuffle in place, then take the first
# 70% for training and the remainder for testing.
# NOTE(review): only `featuresets` is shuffled; `document` keeps its original
# order, so slicing `document` with `trainsize` yields a different partition.
random.shuffle(featuresets)
trainsize = round(len(featuresets)*0.7)
training_set, testing_set = featuresets[:trainsize], featuresets[trainsize:]


In [None]:
# Size of the testing split (shown as the cell output).
len(testing_set)

### Melatih Model Klasifikasi dengan Library NLTK

#### Naive Bayes

##### Melatih Model

In [None]:
# Train NLTK's Naive Bayes classifier on the training split.
classifier = nltk.NaiveBayesClassifier.train(training_set)

# Accuracy on the testing split, expressed as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
# List the 25 most informative features of the trained model.
classifier.show_most_informative_features(25)

##### Mengukur kinerja model

In [None]:
from nltk.metrics.scores import (precision, recall)
import collections

# Sample-index sets keyed by label: `refsets` holds the gold labels and
# `testsets` the labels predicted by the classifier.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, sample in enumerate(testing_set):
    feats, label = sample
    observed = classifier.classify(feats)
    refsets[label].add(i)
    testsets[observed].add(i)

# Scores for the 'positive' class only.
positive_ref, positive_pred = refsets['positive'], testsets['positive']
print('Recall:', recall(positive_ref, positive_pred))
print('Precision:', precision(positive_ref, positive_pred))

In [None]:
from sklearn.model_selection import KFold
# Five consecutive folds; shuffle=False and random_state=None are KFold's
# defaults, so they are not spelled out.
cv = KFold(n_splits=5)

In [None]:
def _as_percent(score):
    """Scale a 0-1 score to a percentage, passing None through unchanged.

    nltk's precision()/recall() return None when the relevant set is empty;
    multiplying that by 100 would raise TypeError.
    """
    return None if score is None else score * 100


for k, (traincv, testcv) in enumerate(cv.split(featuresets), start=1):
    # cv.split() yields arrays of sample indices. Select exactly those
    # samples: slicing from the first to the last index would pull the test
    # fold into the training data for the middle folds and drop the last
    # sample of every fold.
    training_sets = [featuresets[idx] for idx in traincv]
    testing_sets = [featuresets[idx] for idx in testcv]

    # Fold-local names keep the hold-out `classifier`, `refsets` and
    # `testsets` intact for the cells that use them afterwards.
    fold_classifier = nltk.NaiveBayesClassifier.train(training_sets)
    fold_refsets = collections.defaultdict(set)
    fold_testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testing_sets):
        fold_refsets[label].add(i)
        fold_testsets[fold_classifier.classify(feats)].add(i)

    print ("Fold - "+str(k))
    print("Accuracy:",(nltk.classify.accuracy(fold_classifier, testing_sets))*100)
    print('Recall:', _as_percent(recall(fold_refsets['positive'], fold_testsets['positive'])))
    print('Precision:', _as_percent(precision(fold_refsets['positive'], fold_testsets['positive'])))

##### Menyimpan model ke dalam bentuk file

In [None]:
# Save the trained model to a .sav file.
filename = '/content/gdrive/My Drive/PRAKTIKUM_AMSD/model/ppkm_nb_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [None]:
# Load the Naive Bayes classifier back from the file.
# NOTE: unpickling executes code embedded in the file; only load model files
# that come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Predict with the reloaded Naive Bayes model; each entry is
# [text, predicted label, actual label].
lsPrediksi = [
    [rev, loaded_model.classify(find_features(rev)), category]
    for rev, category in document[trainsize:]
]

In [None]:
# Preview the first 10 prediction records.
print(lsPrediksi[:10])

In [None]:
# Actual and predicted labels of the testing split, in matching order.
act = [label for _feats, label in testing_set]
pred = [classifier.classify(feats) for feats, _label in testing_set]

##### Confusion Matrix

In [None]:
from nltk.metrics import ConfusionMatrix

In [None]:
# Confusion matrix of actual (reference) versus predicted labels.
matrix = ConfusionMatrix(act,pred)

In [None]:
# Print the confusion matrix as text.
print(matrix)

#### Support Vector Machine

In [None]:
# Wrap scikit-learn's LinearSVC in NLTK's classifier interface and train it
# on the same feature dicts used for Naive Bayes.
classifier2 = nltk.classify.SklearnClassifier(LinearSVC())
classifier2.train(training_set)

In [None]:
# Accuracy of the LinearSVC model on the testing split, as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier2, testing_set))*100)

#### Decision Tree

In [None]:
# Decision tree wrapped in NLTK's classifier interface, trained on the
# training split.
classifier3 = nltk.classify.SklearnClassifier(DecisionTreeClassifier())
classifier3.train(training_set)

In [None]:
# Accuracy of the decision tree on the testing split, as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier3, testing_set))*100)

#### AdaBoost


In [None]:
# AdaBoost wrapped in NLTK's classifier interface, trained on the training
# split.
classifier4 = nltk.classify.SklearnClassifier(AdaBoostClassifier())
classifier4.train(training_set)

In [None]:
# Accuracy of the AdaBoost model on the testing split, as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier4, testing_set))*100)

#### Random Forest


In [None]:
# Random forest wrapped in NLTK's classifier interface, trained on the
# training split.
classifier5 = nltk.classify.SklearnClassifier(RandomForestClassifier())
classifier5.train(training_set)

In [None]:
# Accuracy of the random forest on the testing split, as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier5, testing_set))*100)

#### Gradient Boosting

In [None]:
# Gradient boosting wrapped in NLTK's classifier interface, trained on the
# training split.
classifier6 = nltk.classify.SklearnClassifier(GradientBoostingClassifier())
classifier6.train(training_set)

In [None]:
# Accuracy of the gradient boosting model on the testing split, as a percentage.
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier6, testing_set))*100)

### Melatih Model dengan Sklearn dan Vektorisasi

In [None]:
#Pengolahan data
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier ,LogisticRegression, Ridge
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier

import joblib
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, KFold, cross_validate, LeaveOneOut
from sklearn.metrics import confusion_matrix,f1_score,log_loss,roc_curve,recall_score,precision_recall_curve,precision_score,fbeta_score,auc, roc_auc_score, accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.feature_selection import RFE, SelectKBest, SelectPercentile, chi2, SelectFromModel
from sklearn.decomposition import PCA

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def cleanText(text):
    """Lower-case `text` and blank out punctuation, digits and CR/TAB/LF.

    Every ASCII punctuation character, every digit 0-9 and each of the
    characters carriage return, tab and newline is replaced by one space,
    so the result has the same length as the lower-cased input.
    """
    unwanted = string.punctuation + string.digits + '\r\t\n'
    blank_out = str.maketrans(unwanted, ' ' * len(unwanted))
    return text.lower().translate(blank_out)

In [None]:
def get_cltext2(document):
    """Return the cleaned text of `document`, keeping only selected POS tags.

    The document is tokenized, filtered with `cleanTweet` and `exclude_re`,
    and lemmatized; a word is kept when the first tagged item returned by the
    POS tagger carries one of the tags ADV, JJ, ADJP or VB.

    Args:
        document: raw text of a single document.

    Returns:
        The kept words joined by single spaces.
    """
    allowed_word_types = ["ADV","JJ","ADJP","VB"]
    allowed_words = []
    for w in cleanTweet(tokenize(document), exclude_re):
        # Tag each word once; the original called the tagger twice per word
        # (once for the emptiness check and once for the value).
        tagged = postagger.get_pos_tag(lemmatizer.lemmatize(w))
        if len(tagged) > 0 and tagged[0][1] in allowed_word_types:
            allowed_words.append(tagged[0][0])
    return ' '.join(allowed_words)

In [None]:
def get_cltext(document):
    """Return the cleaned, lemmatized text of `document`.

    The document is tokenized, filtered with `cleanTweet` and `exclude_re`,
    each remaining token is lemmatized, and the results are joined by single
    spaces.
    """
    kept_tokens = cleanTweet(tokenize(document), exclude_re)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept_tokens)

In [None]:
# Prepare the training data (cleaned texts and their labels) from the first
# `trainsize` documents.
train_docs = document[:trainsize]
x_train = [get_cltext(rev) for rev, _category in train_docs]
y_train = [category for _rev, category in train_docs]

In [None]:
def set_numeric_label(y):
    """Map sentiment labels to integers: 'negative' -> 0, 'positive' -> 1.

    Args:
        y: iterable of labels.

    Returns:
        A list of the same length; labels other than 'negative'/'positive'
        are passed through unchanged. An empty input yields an empty list
        (the previous DataFrame-based version raised KeyError for it).
    """
    label_codes = {'negative': 0, 'positive': 1}
    return [label_codes.get(label, label) for label in y]

In [None]:
# Convert the training labels to their numeric codes.
y_train = set_numeric_label(y_train)

In [None]:
# Preview the first 100 numeric training labels.
print(y_train[:100])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# `stop_words` is documented as 'english', a list, or None; recent
# scikit-learn releases validate this and reject a set. Duplicates are
# removed and the result is sorted so the argument is a deterministic list.
bow_vectorizer = CountVectorizer(max_features=50000, stop_words=sorted(set(stop)))

In [None]:
# Learn the vocabulary from the training texts only.
bowVect = bow_vectorizer.fit(x_train)

In [None]:
# Encode the training texts with the fitted vectorizer.
bowTrain = bowVect.transform(x_train)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feat = bow_vectorizer.get_feature_names_out()
else:
    feat = bow_vectorizer.get_feature_names()
feat

In [None]:
# Cleaned texts of the test documents. get_cltext() already lemmatizes every
# token, so no second lemmatization pass is applied; the training texts are
# built with get_cltext() alone and both splits must be preprocessed the
# same way.
x_test = [get_cltext(rev) for rev, _category in document[trainsize:]]

In [None]:
# Encode the test texts with the vocabulary learned from the training texts.
bowTest = bowVect.transform(x_test)

In [None]:
# Actual labels of the test documents, converted to their numeric codes.
y_act = set_numeric_label(
    [category for _rev, category in document[trainsize:]]
)

In [None]:
# Helper that computes the evaluation metrics of several models at once.
def calculateMetric(lspred, y_test):
    """Compute hold-out metrics for a list of named predictions.

    Args:
        lspred: sequence of entries where entry[0] is the model name and
            entry[1] holds that model's predicted labels.
        y_test: the actual labels, aligned with every prediction vector.

    Returns:
        One row per entry:
        [name, accuracy, precision, recall, f1, roc_auc], with precision,
        recall and F1 computed for the positive label 1.
    """
    rows = []
    for entry in lspred:
        name, y_pred = entry[0], entry[1]
        binary_opts = {'pos_label': 1, 'average': 'binary'}
        rows.append([
            name,
            accuracy_score(y_test, y_pred, normalize = True),
            precision_score(y_test, y_pred, **binary_opts),
            recall_score(y_test, y_pred, **binary_opts),
            f1_score(y_test, y_pred, **binary_opts),
            roc_auc_score(y_test, y_pred),
        ])
    return rows

##### K Nearest Neighbor

In [None]:
# k-nearest-neighbours classifier (k = 3) fitted on the bag-of-words matrix.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(bowTrain, y_train )

In [None]:
# k-NN predictions for the test matrix.
knpred = knn.predict(bowTest)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# NOTE(review): this list is not passed to any classification_report call in
# the visible cells — confirm whether it is still needed.
target_names = [1, 0]

In [None]:
# Per-class precision/recall/F1 report for the k-NN predictions.
print(classification_report(y_act, knpred))

##### Decision Tree

In [None]:
# Decision tree with default settings.
dtree = DecisionTreeClassifier()

# Fit on the training matrix, then predict the test matrix.
dtclf = dtree.fit(bowTrain, y_train) 
dtpred = dtclf.predict(bowTest)


In [None]:
# Fraction of test documents the decision tree classified correctly.
print("Decission Tree accuracy : ",accuracy_score(y_act, dtpred, normalize = True))

##### Logistic Regression

In [None]:
# Logistic regression with default settings: fit on the training matrix,
# then predict the test matrix.
lr = LogisticRegression()
lrclf = lr.fit(bowTrain, y_train)
lrpred = lrclf.predict(bowTest)

In [None]:
# Fraction of test documents logistic regression classified correctly.
print("Logistic regression accuracy : ",accuracy_score(y_act, lrpred, normalize = True))

##### Naive Bayes

In [None]:
gnb = GaussianNB()
# Train the classifier on the training data and predict the test data; the
# sparse matrices are converted to dense arrays before being passed in.
nbclf = gnb.fit(bowTrain.toarray(), y_train) 
nbpred = nbclf.predict(bowTest.toarray())
print("Naive-Bayes accuracy : ",accuracy_score(y_act, nbpred, normalize = True))

##### Random Forest

In [None]:
# Random Forest: 1000 trees, depth capped at 100, fixed seed, all CPU cores.
rf = RandomForestClassifier(n_estimators=1000,max_depth=100,random_state=1, n_jobs=-1)
# Fit and predict on dense copies of the bag-of-words matrices.
rfclf = rf.fit(bowTrain.toarray(), y_train)
rf_pred = rfclf.predict(bowTest.toarray())
print("random forest accuracy : ",accuracy_score(y_act, rf_pred, normalize = True))

In [None]:
# Per-class report for the random forest predictions.
print(classification_report(y_act, rf_pred))

##### LGBM

In [None]:
# LightGBM classifier with default settings.
lgb = LGBMClassifier()

# Fit and predict on dense copies of the bag-of-words matrices.
lgbclf = lgb.fit(bowTrain.toarray(), y_train) 
lgb_pred = lgbclf.predict(bowTest.toarray())

In [None]:
# Per-class report for the LightGBM predictions.
print(classification_report(y_act, lgb_pred))

##### XGBoost

In [None]:
# XGBoost classifier with default settings.
xgb = XGBClassifier()

# Fit on the training matrix, then predict the test matrix.
xgbclf = xgb.fit(bowTrain, y_train) 
xgb_pred = xgbclf.predict(bowTest)

In [None]:
# Per-class report for the XGBoost predictions.
print(classification_report(y_act, xgb_pred))

##### AdaBoost

In [None]:
# AdaBoost with 10 estimators and a fixed seed: fit on the training matrix,
# then predict the test matrix.
ada = AdaBoostClassifier(n_estimators=10, random_state=1)
adaclf = ada.fit(bowTrain, y_train)
ada_pred = adaclf.predict(bowTest)

In [None]:
# Per-class report for the AdaBoost predictions.
print(classification_report(y_act, ada_pred))

##### Ensemble - Voting Classifier

In [None]:
# (name, estimator) pairs combined by the voting ensemble; the same list is
# reused by the cross-validation comparison further down.
models = [
    ('knn', knn),
    ('lr', lr),
    ('ada', ada),
    ('rf', rf)]

# Voting ensemble over the four estimators, fitted and evaluated on dense
# copies of the bag-of-words matrices.
model_voting = VotingClassifier(estimators=models, n_jobs=-1)
model_voting.fit(bowTrain.toarray(), y_train)
vot_pred = model_voting.predict(bowTest.toarray())

In [None]:
# Per-class report for the voting ensemble predictions.
print(classification_report(y_act, vot_pred))

### Evaluasi Kinerja Model

#### Hold Out

In [None]:
# Run the hold-out evaluation: one [name, predictions] entry per model.
lspred = [
    ['knn', knpred],
    ['dt', dtpred],
    ['lr', lrpred],
    ['rf', rf_pred],
    ['lgbm', lgb_pred],
    ['xgb', xgb_pred],
    ['adaboost', ada_pred],
    ['vot', vot_pred],
]

metric_columns = ['Classifier','Accuracy','Precision','Recall','F1-Score', 'AUC Score']
dfeval = pd.DataFrame(calculateMetric(lspred, y_act), columns=metric_columns)
# Best F1 score first (shown as the cell output).
dfeval.sort_values(['F1-Score'], ascending=False)

#### Cross Validation


In [None]:
# Cleaned texts and labels of ALL documents, used for cross-validation.
x_cvtrain = [get_cltext(rev) for rev, _category in document]
y_cvtrain = [category for _rev, category in document]

In [None]:
# Encode all documents with the vocabulary learned from the training texts.
bow_cvtrain = bowVect.transform(x_cvtrain)

In [None]:
# Helper that compares the cross-validation results of several classifiers.
def recCrossval(lsclf, xtrain, ytrain, k):
    """Cross-validate each classifier and collect its averaged scores.

    Args:
        lsclf: sequence of entries where entry[0] is the classifier name and
            entry[1] the estimator.
        xtrain: feature matrix.
        ytrain: labels aligned with `xtrain`.
        k: value passed as `cv` to `cross_validate`.

    Returns:
        One row per classifier:
        [name, mean fit time, mean test accuracy, mean test macro precision,
        mean test macro recall, mean test macro F1].
    """
    metrics = ['accuracy','precision_macro','recall_macro','f1_macro']
    lsCvResult = []
    for clf in lsclf:
        name, estimator = clf[0], clf[1]
        cvreport = cross_validate(
            estimator, xtrain, ytrain,
            scoring=metrics, cv=k, n_jobs= -1, return_train_score= True,
        )
        row = [name, np.average(cvreport['fit_time'])]
        # Test-fold averages, in the same order as `metrics`.
        row.extend(np.average(cvreport['test_' + metric]) for metric in metrics)
        lsCvResult.append(row)
    return lsCvResult

In [None]:
# 5-fold cross-validation of every estimator listed in `models`.
res = recCrossval(models,bow_cvtrain, y_cvtrain, 5)
dfcvresult = pd.DataFrame(res, columns=['classifier','avg fit time','avg accuracy','avg precision macro','avg recall macro','avg f1 macro'])
# Best macro F1 first (shown as the cell output).
dfcvresult.sort_values(['avg f1 macro'], ascending=False)

### Model Goodness of Fit
* Overfitting : Terjadi ketika model bekerja sangat baik pada data-train, tetapi performanya kurang baik pada data-validation dan data-test. Biasanya hal ini disebabkan oleh model yang terlalu kompleks sehingga ikut mempelajari noise pada data (bias rendah dan variance tinggi).

* Underfitting : Terjadi ketika model tidak dapat menangkap pola yang kompleks pada data sehingga performanya buruk pada data train, test, maupun validation. Biasanya terjadi pada model yang terlalu sederhana (bias tinggi dan variance rendah).

#### Kompleksitas Model

In [None]:
# Sweep k from 1 to 9 and record test/train accuracy for each value.
p = list(range(1, 10))
lst_test = []
lst_train = []
for i in p:
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(bowTrain, y_train)
    lst_test.append(knn.score(bowTest, y_act))
    lst_train.append(knn.score(bowTrain, y_train))

# Plot both accuracy curves against k.
for scores, colour, caption in (
    (lst_test, 'red', 'Test Accuracy'),
    (lst_train, 'b', 'Train Accuracy'),
):
    plt.plot(p, scores, color = colour, label = caption)
plt.xlabel('K VALUES --->')
plt.title('FINDING BEST VALUE FOR K')
plt.legend()

#### Model Tuning

##### Feature Reduction

###### Select From Model

In [None]:
# Feature selection driven by the logistic regression model; prefit=True
# tells SelectFromModel that `lr` has already been fitted.
feature_select = SelectFromModel(lr, prefit=True)
# Keep only the selected columns of the full bag-of-words matrix.
x_mdltrain = feature_select.transform(bow_cvtrain)
# Shape of the reduced matrix (shown as the cell output).
x_mdltrain.shape

In [None]:
# Boolean mask marking which vocabulary entries the selector kept.
mask = feature_select.get_support()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that lack it.
if hasattr(bowVect, 'get_feature_names_out'):
    vocabulary_terms = bowVect.get_feature_names_out()
else:
    vocabulary_terms = bowVect.get_feature_names()

# Names of the selected features. The loop variable is no longer called
# `bool`, which shadowed the builtin for the rest of the notebook session.
mdlfeat = [feature for keep, feature in zip(mask, vocabulary_terms) if keep]

In [None]:
# Try the reduced feature set: 10-fold cross-validation of logistic
# regression, keeping both train and test accuracy per fold.
score_dt = cross_validate(lr, x_mdltrain, y_cvtrain, scoring='accuracy', cv=10, return_train_score= True)
column = {'LR Train': score_dt['train_score'], 'LR Test': score_dt['test_score']}
df = pd.DataFrame(data=column)
# Box plot of the per-fold accuracies, then their means as the cell output.
plot = sns.boxplot(data = df, linewidth=2.5)
df.mean()

##### Hyper Parameter Tuning

In [None]:
# Helper that tunes a model with grid-search cross-validation.
def tuning( x , y , model , params , cv=10 ,verbose=10 ):
    """Run a grid search scored by macro F1 and return its outcome.

    Args:
        x: feature matrix.
        y: labels aligned with `x`.
        model: estimator to tune.
        params: parameter grid passed to GridSearchCV as `param_grid`.
        cv: value passed as `cv` to GridSearchCV (default 10).
        verbose: verbosity level passed to GridSearchCV (default 10).

    Returns:
        A tuple `(best_params_, best_score_, cv_results_)` taken from the
        fitted search object.
    """
    search = GridSearchCV(
        estimator= model,
        param_grid= params,
        scoring= 'f1_macro',
        cv= cv,
        n_jobs= -1,
        verbose= verbose,
        return_train_score=True,
    )
    search.fit( x , y )
    return search.best_params_ , search.best_score_, search.cv_results_

In [None]:
# Tune max_depth.
# Parameter grid for the search: odd depths from 1 to 99.
params =  {"max_depth": range(1,100,2)}

# Find the best hyperparameters for a decision tree with 10-fold CV.
best_params , best_score, cv_result = tuning(bow_cvtrain,
                                        y_cvtrain,
                                        DecisionTreeClassifier(),
                                        params,
                                        cv=10,
                                        verbose=2)


print(" Best Parameters:",best_params,"with score of:",best_score)

In [None]:
result = cv_result
# One column with the tested max_depth values plus one column per
# per-fold score array.
grid_result = {"params":[d['max_depth'] for d in result['params']]}
for key, value in result.items():
    # Per-fold score keys start with 'split' (e.g. 'split0_test_score').
    # A substring test would also pick up parameter columns whose name merely
    # contains 'split', such as 'param_min_samples_split'.
    if key.startswith('split'):
        grid_result[key] = value

plt.figure(figsize=(16, 6))
grids = pd.DataFrame(grid_result).melt(id_vars='params')
# Reduce 'split<N>_<train|test>_...' to 'train' / 'test'. A fixed slice only
# works while the fold number has a single digit.
grids['variable'] = grids['variable'].str.extract(r'^split\d+_(train|test)_', expand=False)
sns.boxplot(data=grids, x='params', y='value', hue='variable')