# Playground for Topic Modeling slides - Simple Version
- Stephen W. Thomas
- Used for MMA 865; MMAI 891; Exec Ed

In [1]:
import datetime
# Stamp the notebook output with the wall-clock time at which this cell ran.
print(datetime.datetime.now())

In [2]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one, so a single cell can show several results (e.g. .shape, .head(), .tail()).
InteractiveShell.ast_node_interactivity = "all"

# Read in the Data

In [3]:
# Load the cleaned Kiva loan data and print the column summary as read from disk.
kiva_df = pd.read_csv("data/kiva_cleaned.csv")
kiva_df.info()

# Discard every row that has a missing value in any column, then renumber the
# index from 0 so that positions and index labels agree again.
kiva_df = kiva_df.dropna().reset_index(drop=True)

# Same summary after cleaning, followed by the shape and both ends of the frame.
kiva_df.info()
kiva_df.shape
kiva_df.head()
kiva_df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 9 columns):
loan_id        6818 non-null int64
status         6818 non-null object
sector         6818 non-null object
en             6818 non-null object
country        6818 non-null object
gender         6818 non-null object
loan_amount    6818 non-null float64
nonpayment     6818 non-null object
en_clean       6802 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 479.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6802 entries, 0 to 6801
Data columns (total 9 columns):
loan_id        6802 non-null int64
status         6802 non-null object
sector         6802 non-null object
en             6802 non-null object
country        6802 non-null object
gender         6802 non-null object
loan_amount    6802 non-null float64
nonpayment     6802 non-null object
en_clean       6802 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 478.3+ KB


(6802, 9)

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ..."
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an..."
2,2,defaulted,Food,<i>This description was written by Richard Maz...,Kenya,M,500.0,lender,"Tom Mung'ahu, 45, is married and has 6 childre..."
3,3,defaulted,Services,<i>This description was written by Rowland Amu...,Kenya,F,500.0,lender,"Benedina, 42, is married and has 4 girls. In a..."
4,4,defaulted,Construction,<i>This description was written by Rowland Amu...,Kenya,M,500.0,lender,"Vincent Ondego 40, is married and has 8 childr..."


Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean
6797,7983,paid,Clothing,Rosa is a member of the 3 de Mayo Communal Ban...,Ecuador,F,1050.0,lender,Rosa is a member of the 3 de Mayo Communal Ban...
6798,7984,paid,Agriculture,"The communal bank, ""Cumbe,"" is located on the ...",Ecuador,F,2650.0,partner,"The communal bank, ""Cumbe,"" is located on the ..."
6799,7985,paid,Food,Gladys Burgos Macas is a member of the 3 de Ma...,Ecuador,F,775.0,partner,Gladys Burgos Macas is a member of the 3 de Ma...
6800,7986,paid,Retail,The Communal Bank <em>Las Caleras</em> is loca...,Ecuador,F,275.0,partner,The Communal Bank Las Caleras is located in th...
6801,7987,paid,Services,Eliud Ouma owns the Purple Hut salon located o...,Kenya,M,275.0,partner,Eliud Ouma owns the Purple Hut salon located o...


# Text Preprocessing

In [4]:
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
import re
import unidecode

#import warnings

#with warnings.catch_warnings():
#    warnings.filterwarnings("ignore",category=DeprecationWarning)

# Combined English + Spanish NLTK stop-word lists, held in a set for fast
# membership tests inside preprocess().
stop_words = set(stopwords.words('english')).union(stopwords.words('spanish'))

# Single WordNet lemmatizer instance shared by every preprocess() call.
lemmer = WordNetLemmatizer()

def preprocess(x):
    """Normalise one text into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character nor whitespace;
      3. transliterate what remains to ASCII with ``unidecode``;
      4. delete runs of digits;
      5. split on whitespace, drop tokens found in ``stop_words`` and
         lemmatize the rest with ``lemmer``.

    Parameters
    ----------
    x : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Steps 1-2: case-fold, then strip punctuation (characters are deleted,
    # not replaced by a space, so "mung'ahu" becomes "mungahu").
    text = re.sub(r'[^\w\s]', '', x.lower())

    # Step 3: map accented / non-ASCII characters to an ASCII approximation.
    text = unidecode.unidecode(text)

    # Step 4: remove numbers.
    text = re.sub(r'\d+', '', text)

    # Step 5: stop-word filtering happens on the raw token, before lemmatizing.
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(lemmer.lemmatize(token))
    return ' '.join(kept)

# Run preprocess() over each cleaned description and keep the result in a new column.
kiva_df['en_clean_pre'] = kiva_df['en_clean'].map(preprocess)

In [5]:
# Preview the first rows, which now include the en_clean_pre column.
kiva_df.head()

Unnamed: 0,loan_id,status,sector,en,country,gender,loan_amount,nonpayment,en_clean,en_clean_pre
0,0,defaulted,Agriculture,<i>This description is written by Rowland Amul...,Kenya,M,500.0,lender,"Robert, 40, is married and has 6 children. In ...",robert married child addition family take care...
1,1,defaulted,Food,<i>This description is written by Rowland Amul...,Kenya,F,500.0,lender,"Petronilla, 30, was deserted by her husband an...",petronilla deserted husband responsible upbrin...
2,2,defaulted,Food,<i>This description was written by Richard Maz...,Kenya,M,500.0,lender,"Tom Mung'ahu, 45, is married and has 6 childre...",tom mungahu married child child attending elem...
3,3,defaulted,Services,<i>This description was written by Rowland Amu...,Kenya,F,500.0,lender,"Benedina, 42, is married and has 4 girls. In a...",benedina married girl addition family also tak...
4,4,defaulted,Construction,<i>This description was written by Rowland Amu...,Kenya,M,500.0,lender,"Vincent Ondego 40, is married and has 8 childr...",vincent ondego married child beside family vin...


In [6]:
# Text of the first description before preprocessing.
kiva_df['en_clean'].iloc[0]

'Robert, 40, is married and has 6 children. In addition to his family of 8, he takes care of his mother and 5 brothers. Robert started by planting vegetables and selling at the local market. He then diversified with a tea nursery which is more profitable. Given a loan of $500, Robert will be able to improve his activities by buying fertilizers, pesticides, a pump and seedlings. The pump will remove the uncertainties of the weather. The current capital in his venture is Ksh 10500 or $150, which does not allow Robert to maximize his potential. Robert completed high school and has never got employment. He got apprentice training from a tea extension officer. He is a go getter, and his main hobby is teaching music.'

In [7]:
# The same first description after preprocess() has been applied.
kiva_df['en_clean_pre'].iloc[0]

'robert married child addition family take care mother brother robert started planting vegetable selling local market diversified tea nursery profitable given loan robert able improve activity buying fertilizer pesticide pump seedling pump remove uncertainty weather current capital venture ksh allow robert maximize potential robert completed high school never got employment got apprentice training tea extension officer go getter main hobby teaching music'

## Topic Modeling with scikit-learn

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF document-term matrix over unigrams, bigrams and trigrams.
#   max_df=0.5        ignore terms that appear in more than 50% of documents
#   min_df=0.05       ignore terms that appear in fewer than 5% of documents
#   max_features=1000 cap the vocabulary at the 1000 most frequent remaining terms
# ngram_range is documented as a (min_n, max_n) tuple; scikit-learn releases
# that validate constructor parameters reject a list, so pass a tuple.
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer);
# confirm that TF-IDF weights are intended here.
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.05,
                             max_features=1000, ngram_range=(1, 3))
dtm = vectorizer.fit_transform(kiva_df['en_clean_pre'])
print(dtm.shape)

(6802, 270)


In [9]:
# List the learned vocabulary (one entry per column of dtm).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
# list() keeps the displayed output a plain Python list in both cases.
list(vectorizer.get_feature_names_out()
     if hasattr(vectorizer, "get_feature_names_out")
     else vectorizer.get_feature_names())

['able',
 'active',
 'active member',
 'activity',
 'add',
 'additional',
 'age',
 'ago',
 'allow',
 'also',
 'always',
 'amount',
 'another',
 'applied',
 'applied loan',
 'area',
 'attend',
 'back',
 'basic',
 'bean',
 'began',
 'belief',
 'better',
 'born',
 'bought',
 'boy',
 'business selling',
 'buy',
 'buying',
 'capital',
 'care',
 'cement',
 'child school',
 'childrens',
 'church',
 'city',
 'client',
 'clothes',
 'clothing',
 'college',
 'community',
 'continue',
 'cost',
 'could',
 'cow',
 'currently',
 'customer',
 'dairy',
 'dairy cow',
 'daughter',
 'day',
 'decided',
 'demand',
 'different',
 'domingo',
 'dominican',
 'dream',
 'due',
 'earn',
 'educate',
 'education',
 'enable',
 'enough',
 'entrepreneur',
 'every',
 'every day',
 'expand',
 'expand business',
 'expense',
 'experience',
 'explains',
 'family',
 'farm',
 'farmer',
 'farming',
 'father',
 'fee',
 'feed',
 'first',
 'first loan',
 'five',
 'five child',
 'food',
 'four',
 'four child',
 'friend',
 'fruit',

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# Configure a 20-topic LDA model trained with the batch update method.
# Priors left as None use scikit-learn's defaults; random_state seeds the
# model's random number generator.
lda_model = LatentDirichletAllocation(
    n_components=20,
    doc_topic_prior=None,
    topic_word_prior=None,
    learning_method='batch',
    max_iter=200,
    random_state=123,
    n_jobs=2,
    verbose=0,
)
# fit() returns the fitted estimator itself, so lda_output refers to lda_model.
lda_output = lda_model.fit(dtm)

# Approximate log-likelihood of the corpus under the fitted model: higher is better.
ll = lda_model.score(dtm)

# Perplexity = exp(-1. * log-likelihood per word): lower is better.
perp = lda_model.perplexity(dtm)

In [11]:
# theta: document-topic matrix, i.e. transform() applied to the training matrix.
theta = pd.DataFrame(data=lda_model.transform(dtm))
# beta: topic-term matrix, read from the fitted model's components_ attribute.
beta = pd.DataFrame(data=lda_model.components_)

In [12]:
# Topic proportions for the first five documents (rows: documents, columns: topics).
theta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.008651,0.008651,0.008651,0.008651,0.483942,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.360335,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651,0.008651
1,0.007398,0.007398,0.007398,0.007398,0.007398,0.859442,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398,0.007398
2,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.00825,0.658675,0.192827,0.00825,0.00825
3,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.80431,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299,0.010299
4,0.821479,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396,0.009396


In [13]:
# Term weights for the first five topics (rows: topics, columns: vocabulary terms).
beta.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,260,261,262,263,264,265,266,267,268,269
0,28.450467,55.671889,49.078044,20.139093,10.131653,2.238602,4.844017,20.392001,0.379399,32.24018,...,75.758588,19.303593,8.688853,12.839707,11.922191,21.248398,27.594151,33.102967,0.05,13.793507
1,12.547361,0.05,0.05,0.05,0.05,0.05,0.314016,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,2.543769,0.05
2,12.299628,24.208785,22.18549,7.702309,4.612913,2.409955,3.894666,6.478088,0.479245,19.723512,...,24.990815,4.740272,4.946245,6.660111,5.61297,6.710831,13.696085,16.584664,0.05,4.169932
3,38.018599,0.625488,0.05,12.383906,2.379312,8.130464,42.873529,5.346212,12.875778,22.203624,...,5.003734,19.526729,13.453686,26.031663,14.001482,3.80913,13.377419,0.05,13.818494,20.742405
4,31.489722,8.511836,8.250891,1.976989,1.11515,1.790282,2.07456,2.987586,0.05,23.10439,...,1.60442,6.378504,2.433041,1.958979,0.05,3.182817,2.796047,1.714359,1.312406,6.922944


In [15]:
# Build Topic Summary: one row per topic with its size and its top terms.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Weight: sum of each topic's share over all documents (column sums of theta).
weight = theta.sum(axis=0)
# Support50 / Support10: number of documents in which the topic's share
# exceeds 0.5 / 0.1 respectively.
support50 = (theta > 0.5).sum(axis=0)
support10 = (theta > 0.1).sum(axis=0)

# Label every topic with its six highest-weighted terms, strongest first:
# argsort() is ascending, so the reversed slice [:-6 - 1:-1] walks the last
# six positions backwards.
termss = list()
for topic_id, topic in enumerate(lda_model.components_):
    terms = " ".join([feature_names[i] for i in topic.argsort()[:-6 - 1:-1]])
    termss.append(terms)
topic_summary = pd.DataFrame({'TopicID': range(0, len(termss)), "Support50": support50, "Support10": support10, "Weight": weight, "Terms": termss})

In [16]:
# Show up to 100 characters per cell so the "Terms" strings are readable, and
# set display.max_columns to 0 (see the pandas display options for its meaning
# in terminal vs. notebook front ends) before rendering the summary table.
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 0
topic_summary

Unnamed: 0,TopicID,Support50,Support10,Weight,Terms
0,0,700,789,660.594204,group member usd woman group woman school
1,1,68,133,124.158622,fee school fee pay school expand expand business
2,2,383,489,402.629142,cow dairy milk dairy cow group farming
3,3,699,800,655.510271,group life small member hope dominican
4,4,311,383,328.322045,stock primary school able primary school increase
5,5,781,851,712.446289,school requesting requesting loan family buy lack
6,6,137,181,169.067136,expand expand business father family main living
7,7,1,79,70.703796,farming maize land farmer farm bean
8,8,334,415,354.291304,santo domingo santo domingo store community husband
9,9,166,227,202.459533,kenya service community yearold applied expand


In [None]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
% time pyLDAvis.sklearn.prepare(lda_model, dtm, vectorizer, mds="tsne")