# PyCaret 2 NLP Example
This notebook was created using PyCaret 2.0. Last updated: 28-07-2020

In [1]:
# Check which PyCaret build is installed in this kernel.
# The recorded output for this run shows 'pycaret-nightly-0.38'.
from pycaret.utils import version
version()

pycaret-nightly-0.38


# 1. Loading Dataset

In [2]:
# Load the 'kiva' sample dataset through PyCaret's dataset helper.
# The preview below the cell shows a free-text column 'en' alongside
# country, gender, loan_amount, nonpayment, sector and status.
from pycaret.datasets import get_data
data = get_data('kiva')

Unnamed: 0,country,en,gender,loan_amount,nonpayment,sector,status
0,Dominican Republic,"""Banco Esperanza"" is a group of 10 women looki...",F,1225,partner,Retail,0
1,Dominican Republic,"""Caminemos Hacia Adelante"" or ""Walking Forward...",F,1975,lender,Clothing,0
2,Dominican Republic,"""Creciendo Por La Union"" is a group of 10 peop...",F,2175,partner,Clothing,0
3,Dominican Republic,"""Cristo Vive"" (""Christ lives"" is a group of 10...",F,1425,partner,Clothing,0
4,Dominican Republic,"""Cristo Vive"" is a large group of 35 people, 2...",F,4025,partner,Food,0


# 2. Initialize Setup

In [3]:
# Import only the pycaret.nlp functions this notebook calls, instead of
# `import *`, so every name used in later cells has a visible origin.
from pycaret.nlp import (
    assign_model,
    create_model,
    evaluate_model,
    models,
    plot_model,
    setup,
)

# Initialize the NLP experiment on the text column 'en' of the kiva data.
# - session_id=123 pins the session id reported in the setup summary
#   (per the PyCaret docs this is the seed used for reproducibility).
# - log_experiment / log_plots turn on experiment logging under the name
#   'kiva1'; presumably this is what the MLflow UI cell at the end of the
#   notebook displays -- confirm against the PyCaret logging docs.
nlp1 = setup(
    data,
    target='en',
    session_id=123,
    log_experiment=True,
    log_plots=True,
    experiment_name='kiva1',
)

Description,Value
session_id,123
Documents,6818
Vocab Size,10671
Custom Stopwords,False


# 3. Create Model

In [4]:
# List the available topic models and their IDs; the table below shows
# lda, lsi, hdp, rp and nmf. Two of these IDs ('lda', 'nmf') are passed to
# create_model() in the following cells.
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lda,Latent Dirichlet Allocation,gensim/models/ldamodel
lsi,Latent Semantic Indexing,gensim/models/lsimodel
hdp,Hierarchical Dirichlet Process,gensim/models/hdpmodel
rp,Random Projections,gensim/models/rpmodel
nmf,Non-Negative Matrix Factorization,sklearn.decomposition.NMF


In [6]:
# Train a model using the 'lda' ID (Latent Dirichlet Allocation in the
# models() table). No num_topics is passed here; the assign_model output
# further down shows four topic columns (Topic_0 .. Topic_3).
lda = create_model('lda')

In [7]:
# Train a model using the 'nmf' ID (Non-Negative Matrix Factorization in the
# models() table) with an explicit topic count of 6.
# NOTE(review): `nmf` is not referenced again in the visible cells.
nmf = create_model('nmf', num_topics = 6)

# 4. Assign Labels

In [8]:
# Attach the LDA topic assignments to the data and preview the first rows.
# Compared with the raw data preview, the output gains Topic_0 .. Topic_3
# weight columns plus Dominant_Topic and Perc_Dominant_Topic, and the 'en'
# text appears in a preprocessed form (lower-cased, reduced tokens).
lda_results = assign_model(lda)
lda_results.head()

Unnamed: 0,country,en,gender,loan_amount,nonpayment,sector,status,Topic_0,Topic_1,Topic_2,Topic_3,Dominant_Topic,Perc_Dominant_Topic
0,Dominican Republic,group woman look receive small loan take small...,F,1225,partner,Retail,0,0.443424,0.17063,0.001865,0.384082,Topic 0,0.44
1,Dominican Republic,walk forward group entrepreneur seek second lo...,F,1975,lender,Clothing,0,0.335674,0.416064,0.001612,0.24665,Topic 1,0.42
2,Dominican Republic,group people hope start business group look re...,F,2175,partner,Clothing,0,0.568597,0.153035,0.002369,0.275999,Topic 0,0.57
3,Dominican Republic,live group woman look receive first loan young...,F,1425,partner,Clothing,0,0.285612,0.225983,0.001858,0.486547,Topic 3,0.49
4,Dominican Republic,vive large group people hope take loan many se...,F,4025,partner,Food,0,0.383666,0.345023,0.001839,0.269472,Topic 0,0.38


# 5. Analyze Model

In [9]:
# Draw the default plot for the LDA model (no `plot` argument given).
# NOTE(review): the saved output for this cell is only an
# "IOPub message rate exceeded" notice, so the figure itself was not
# captured. If the plot is needed, re-run with a higher
# --NotebookApp.iopub_msg_rate_limit.
plot_model(lda)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
# Same model, this time requesting the 'bigram' plot type.
# No output was saved for this cell.
plot_model(lda, plot = 'bigram')

In [11]:
# Same model, this time requesting the 'tsne' plot type.
# No output was saved for this cell.
plot_model(lda, plot = 'tsne')

In [12]:
# Open the interactive evaluation widget for the LDA model. The recorded
# output is an ipywidgets `interactive` with a "Plot Type:" toggle, so it
# only renders in a live kernel with widgets enabled.
evaluate_model(lda)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Frequency Plot', 'freque…

# 6. MLflow UI

In [13]:
# Start the MLflow UI from the notebook via a shell escape.
# NOTE(review): the recorded output is just '^C', i.e. the command was
# interrupted by hand -- presumably it blocks this cell for as long as the
# server runs, so "Run All" will stall here. Consider running `mlflow ui`
# from a separate terminal instead.
!mlflow ui

^C


# End
Thank you. For more information / tutorials on PyCaret, please visit https://www.pycaret.org