# Playground for Topic Modeling slides - Simple Version
- Stephen W. Thomas
- Used for MMA 865; MMAI 891; Exec Ed

In [1]:
import datetime

# Timestamp this run so the saved outputs can be dated.
run_timestamp = datetime.datetime.now()
print(run_timestamp)

2020-12-10 07:57:37.075914


In [2]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# "all" makes IPython display the value of every top-level expression in a cell,
# not only the last one (e.g. the shape/head/tail results in the data-loading cell).
InteractiveShell.ast_node_interactivity = "all"

# Read in the Data

In [3]:
# Load the cleaned Kiva loan file and report its schema before any filtering.
kiva_df = pd.read_csv("../data/kiva_cleaned.csv")
kiva_df.info()

# Discard rows containing any missing value and renumber the index from 0.
kiva_df = kiva_df.dropna().reset_index(drop=True)

# Inspect the filtered frame: schema, dimensions, first and last rows.
kiva_df.info()
kiva_df.shape
kiva_df.head()
kiva_df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 8 columns):
loan_id        6818 non-null int64
status         6818 non-null object
sector         6818 non-null object
country        6818 non-null object
gender         6818 non-null object
loan_amount    6818 non-null int64
nonpayment     6818 non-null object
en_clean       6802 non-null object
dtypes: int64(2), object(6)
memory usage: 426.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6802 entries, 0 to 6801
Data columns (total 8 columns):
loan_id        6802 non-null int64
status         6802 non-null object
sector         6802 non-null object
country        6802 non-null object
gender         6802 non-null object
loan_amount    6802 non-null int64
nonpayment     6802 non-null object
en_clean       6802 non-null object
dtypes: int64(2), object(6)
memory usage: 425.2+ KB


(6802, 8)

Unnamed: 0,loan_id,status,sector,country,gender,loan_amount,nonpayment,en_clean
0,0,defaulted,Agriculture,Kenya,M,500,lender,"Robert, 40, is married and has 6 children. In ..."
1,1,defaulted,Food,Kenya,F,500,lender,"Petronilla, 30, was deserted by her husband an..."
2,2,defaulted,Food,Kenya,M,500,lender,"Tom Mung'ahu, 45, is married and has 6 childre..."
3,3,defaulted,Services,Kenya,F,500,lender,"Benedina, 42, is married and has 4 girls. In a..."
4,4,defaulted,Construction,Kenya,M,500,lender,"Vincent Ondego 40, is married and has 8 childr..."


Unnamed: 0,loan_id,status,sector,country,gender,loan_amount,nonpayment,en_clean
6797,7983,paid,Clothing,Ecuador,F,1050,lender,Rosa is a member of the 3 de Mayo Communal Ban...
6798,7984,paid,Agriculture,Ecuador,F,2650,partner,"The communal bank, ""Cumbe,"" is located on the ..."
6799,7985,paid,Food,Ecuador,F,775,partner,Gladys Burgos Macas is a member of the 3 de Ma...
6800,7986,paid,Retail,Ecuador,F,275,partner,The Communal Bank Las Caleras is located in th...
6801,7987,paid,Services,Kenya,M,275,partner,Eliud Ouma owns the Purple Hut salon located o...


# Text Preprocessing

In [4]:
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import re
import unidecode

#import warnings

#with warnings.catch_warnings():
#    warnings.filterwarnings("ignore",category=DeprecationWarning)

# Stop words for both languages. preprocess() transliterates text to ASCII
# *before* it filters stop words, so accented entries (e.g. Spanish "también")
# could never match a token ("tambien"). Keep the original forms and add their
# transliterated equivalents so those words are actually removed.
raw_stop_words = stopwords.words('english') + stopwords.words('spanish')
stop_words = set(raw_stop_words) | {unidecode.unidecode(w) for w in raw_stop_words}

# WordNet-based lemmatizer applied to every surviving token in preprocess().
lemmer = WordNetLemmatizer()

def preprocess(x):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip punctuation, transliterate to ASCII, re-normalize,
    strip digits, drop stop words, lemmatize.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens, lemmatized and joined by single spaces.
    """
    # Lower case
    x = x.lower()

    # Remove punctuation (anything that is neither a word character nor whitespace)
    x = re.sub(r'[^\w\s]', '', x)

    # Transliterate remaining non-ASCII characters to their closest ASCII form
    x = unidecode.unidecode(x)

    # Transliteration can itself emit capitals or punctuation (Unidecode's own
    # documentation maps CJK text to "Bei Jing "), so normalize once more.
    # This second pass is a no-op for text that was already ASCII.
    x = re.sub(r'[^\w\s]', '', x.lower())

    # Remove numbers
    x = re.sub(r'\d+', '', x)

    # Remove stopwords and lemmatize
    x = [lemmer.lemmatize(w) for w in x.split() if w not in stop_words]
    return ' '.join(x)

# Store the normalized form of every description next to the original text.
kiva_df['en_clean_pre'] = kiva_df['en_clean'].map(preprocess)

In [5]:
# Confirm the new en_clean_pre column sits alongside the original en_clean text.
kiva_df.head()

Unnamed: 0,loan_id,status,sector,country,gender,loan_amount,nonpayment,en_clean,en_clean_pre
0,0,defaulted,Agriculture,Kenya,M,500,lender,"Robert, 40, is married and has 6 children. In ...",robert married child addition family take care...
1,1,defaulted,Food,Kenya,F,500,lender,"Petronilla, 30, was deserted by her husband an...",petronilla deserted husband responsible upbrin...
2,2,defaulted,Food,Kenya,M,500,lender,"Tom Mung'ahu, 45, is married and has 6 childre...",tom mungahu married child child attending elem...
3,3,defaulted,Services,Kenya,F,500,lender,"Benedina, 42, is married and has 4 girls. In a...",benedina married girl addition family also tak...
4,4,defaulted,Construction,Kenya,M,500,lender,"Vincent Ondego 40, is married and has 8 childr...",vincent ondego married child beside family vin...


In [6]:
# Original text of the first row, before preprocessing.
kiva_df['en_clean'].iloc[0]

'Robert, 40, is married and has 6 children. In addition to his family of 8, he takes care of his mother and 5 brothers. Robert started by planting vegetables and selling at the local market. He then diversified with a tea nursery which is more profitable. Given a loan of $500, Robert will be able to improve his activities by buying fertilizers, pesticides, a pump and seedlings. The pump will remove the uncertainties of the weather. The current capital in his venture is Ksh 10500 or $150, which does not allow Robert to maximize his potential. Robert completed high school and has never got employment. He got apprentice training from a tea extension officer. He is a go getter, and his main hobby is teaching music.'

In [7]:
# The same first row after preprocessing, for side-by-side comparison.
kiva_df['en_clean_pre'].iloc[0]

'robert married child addition family take care mother brother robert started planting vegetable selling local market diversified tea nursery profitable given loan robert able improve activity buying fertilizer pesticide pump seedling pump remove uncertainty weather current capital venture ksh allow robert maximize potential robert completed high school never got employment got apprentice training tea extension officer go getter main hobby teaching music'

## Topic Modeling with scikit-learn

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF document-term matrix over unigrams through trigrams.
#   max_df=0.5   -> ignore terms appearing in more than 50% of documents
#   min_df=0.05  -> ignore terms appearing in fewer than 5% of documents
#   max_features -> keep at most the 1000 most frequent remaining terms
# ngram_range is passed as a tuple: scikit-learn documents it as
# (min_n, max_n), and releases with parameter validation reject a list.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.05,
                             max_features=1000, ngram_range=(1, 3))
dtm = vectorizer.fit_transform(kiva_df['en_clean_pre'])
print(dtm.shape)

(6802, 270)


In [9]:
# Vocabulary learned by the vectorizer. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# use whichever this installation provides and show a plain list either way.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

['able',
 'active',
 'active member',
 'activity',
 'add',
 'additional',
 'age',
 'ago',
 'allow',
 'also',
 'always',
 'amount',
 'another',
 'applied',
 'applied loan',
 'area',
 'attend',
 'back',
 'basic',
 'bean',
 'began',
 'belief',
 'better',
 'born',
 'bought',
 'boy',
 'business selling',
 'buy',
 'buying',
 'capital',
 'care',
 'cement',
 'child school',
 'childrens',
 'church',
 'city',
 'client',
 'clothes',
 'clothing',
 'college',
 'community',
 'continue',
 'cost',
 'could',
 'cow',
 'currently',
 'customer',
 'dairy',
 'dairy cow',
 'daughter',
 'day',
 'decided',
 'demand',
 'different',
 'domingo',
 'dominican',
 'dream',
 'due',
 'earn',
 'educate',
 'education',
 'enable',
 'enough',
 'entrepreneur',
 'every',
 'every day',
 'expand',
 'expand business',
 'expense',
 'experience',
 'explains',
 'family',
 'farm',
 'farmer',
 'farming',
 'father',
 'fee',
 'feed',
 'first',
 'first loan',
 'five',
 'five child',
 'food',
 'four',
 'four child',
 'friend',
 'fruit',

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# Settings for a 20-topic model trained with the batch learning method.
# Both priors are left as None, i.e. scikit-learn's defaults are used.
lda_params = dict(
    n_components=20,
    doc_topic_prior=None,
    topic_word_prior=None,
    max_iter=200,
    learning_method='batch',
    random_state=123,
    n_jobs=2,
    verbose=0,
)
lda_model = LatentDirichletAllocation(**lda_params)
lda_output = lda_model.fit(dtm)

# Log-likelihood: the higher the better.
ll = lda_model.score(dtm)

# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
perp = lda_model.perplexity(dtm)

In [11]:
# beta  : topic-term matrix, taken directly from the fitted model's components_
# theta : document-topic matrix, obtained by transforming the training matrix
beta = pd.DataFrame(lda_model.components_)
theta = pd.DataFrame(lda_model.transform(dtm))

In [12]:
# First rows of the document-topic matrix: one row per document, one column per topic.
theta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.835626,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651
1,0.007398,0.007398,0.007398,0.007398,0.007398,0.859442,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398
2,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.629895,0.221606,0.00825,0.00825
3,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.80431,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299
4,0.821479,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396


In [13]:
# First rows of the topic-term matrix: one row per topic, one column per vocabulary term.
beta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,28.050024,55.885978,48.988058,19.909011,10.18986,2.354766,4.727876,20.474525,0.378533,32.161692,...,76.032001,18.699409,8.562082,12.561041,11.625641,21.284361,27.360169,33.142604,0.05,13.295494
1,12.353798,0.05,0.05,0.05,0.05,0.05,0.307471,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,2.539648,0.05
2,12.586755,24.684213,22.409094,7.685108,4.615725,2.389386,4.056614,6.342891,0.479914,20.06803,...,25.377447,5.031498,5.083152,6.820433,5.804337,6.534267,13.890104,16.563073,0.05,4.510646
3,38.838733,0.8245,0.05,12.743786,2.420963,8.016108,43.052334,6.001601,13.088882,22.83017,...,4.430356,19.89601,14.307302,26.647323,14.355717,4.389093,14.081605,0.05,14.013865,21.392063
4,31.457703,8.363259,8.11657,1.794826,0.997785,1.756221,1.628283,3.12688,0.05,23.366078,...,1.606124,6.808428,2.562854,1.849732,0.209091,3.321678,2.772328,1.799196,1.223385,6.912374


In [14]:
# Build Topic Summary
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when this installation provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Per-topic aggregates over theta (column-wise, so one value per topic).
weight = theta.sum(axis=0)             # summed topic share across all documents
support50 = (theta > 0.5).sum(axis=0)  # number of documents with share above 0.5
support10 = (theta > 0.1).sum(axis=0)  # number of documents with share above 0.1

# For each topic, join its six highest-weighted terms, strongest first.
termss = list()
for topic_id, topic in enumerate(lda_model.components_):
    terms = " ".join([feature_names[i] for i in topic.argsort()[:-6 - 1:-1]])
    termss.append(terms)
topic_summary = pd.DataFrame({'TopicID': range(0, len(termss)), "Support50": support50, "Support10": support10, "Weight": weight, "Terms": termss})

In [15]:
# Widen text columns so each topic's term list is readable in the table below.
pd.options.display.max_colwidth = 100
# NOTE(review): 0 asks pandas to auto-detect the available width in a terminal;
# confirm this has the intended effect inside a notebook.
pd.options.display.max_columns = 0
topic_summary

Unnamed: 0,TopicID,Support50,Support10,Weight,Terms
0,0,692,787,658.712333,group member usd woman group woman school
1,1,71,127,124.272127,fee school fee pay school expand expand business
2,2,381,492,404.407805,cow dairy milk dairy cow group farming
3,3,718,810,668.212187,group life small member dominican hope
4,4,314,388,330.666992,stock primary school primary school able increase
5,5,775,847,709.021666,school requesting requesting loan family buy lack
6,6,136,174,167.686112,expand expand business father family main living
7,7,1,75,70.961151,farming maize land farmer bean farm
8,8,292,382,322.659926,santo domingo santo domingo community entrepreneur clothing
9,9,164,230,203.457774,kenya service community yearold applied purchase


In [16]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
% time pyLDAvis.sklearn.prepare(lda_model, dtm, vectorizer, mds="tsne")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


Wall time: 18.3 s
