In [2]:
import pandas as pd
import requests
import json
import csv
import time
import datetime
import re
import string
import gzip
import os

import numpy as np
import pickle #for saving output files, pickles

def datetime_to_unix_time(d):
    return int(time.mktime(datetime.datetime.strptime(d, "%m/%d/%Y").timetuple()))

In [3]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

#In Python: Define your username and password used above. I've defined the database name (we're 
#using a dataset on births, so I call it birth_db). 
dbname = 'donors_db'
username = 'russell'
pswd = 'bradypodion'

## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)
# Replace localhost with IP address if accessing a remote server

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
print(engine.url)


postgresql://russell:bradypodion@localhost/donors_db
postgresql://russell:bradypodion@localhost/donors_db
True
postgresql://russell:bradypodion@localhost/donors_db


## Process trailers from DonorsChoose (from donors_db in postgreSQL)

In [4]:
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

### query: from historical data
merge_query = """
SELECT * FROM merge_time;
"""

merged_data = pd.read_sql_query(merge_query,con)

In [5]:
data = merged_data[['id','fulfillmentTrailer']]
data.columns = ['id','text']

###########drop rows that have www or donorchoose in their fulfillmentTrailers---the wrong info was grabbed
print('Rows coming in '+str(len(data.index)))
data= data[~data.text.str.contains("www")]
data= data[~data.text.str.contains("donorschoose")]
print('Row coming out after website/junk filtering '+str(len(data.index)))

pd.set_option('max_colwidth', 400)
del merged_data
data.head()

Rows coming in 82091
Row coming out after website/junk filtering 79992


Unnamed: 0,id,text
0,846450,My students need a rug.
1,2116678,My students need storage cabinets in our choir room to house our music library.
2,1892178,My students need 4 clarinets and 1 trumpet to play in the band.
3,1143364,My students need 6 new stands at the high school in order to accommodate our rapidly growing program.
4,779936,My students need a projector to be able to view live performances and other teaching resources from the Internet.


### Clean up text

In [6]:
# not using re.sub(r'[^a-zA-Z\s]', '', t) to avoid losing emojis
text = [re.sub(r'([0-9]+?)', ' ', t).lower() for t in data['text']] # remove all numbers and symbols
text = [re.sub(r'(!|"|#|\$|%|&|\'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|{|\||}|~)+', ' ', t) for t in text]
data['text'] = [re.sub(r'\s+\s', ' ', t).strip() for t in text] # repace double spaces with single spaces

data = data.loc[data['text'].map(len) > 3].reset_index(drop = True) # keep only strings longer than 5 characters

# Discovering and Visualizing Topics in Texts

Most typical cases of text classification in NLP (named entity recognition, question answering, etc) require training datasets where each piece of text is associted with a label. However, in real-life scenarios, text collections rarely come with metadata labels that tell you what the texts are about. When people answer open-ended survey questions, for example, they don't repeat detectable keywords in their answer with the topics they discuss.

**Topic modeling** is an unsupervised classification technique that is able to discover the topics in a collection of texts by looking at their commonalities. In this context, "topics" refers to groups of related words that often occur together in the same text. For example, in a collection of newspaper articles a topic model may identify one topic that is made up of words such as "politician", "law", and "parliament", and another characterized by words such as "player", "match" and "penalty". Topic models only go as far as identifying clusters of related words; a human is still needed to interpret these clusters and give them labels such as "politics" and "football". 

One of the most popular topic models is Latent Dirichlet Allocation (LDA). LDA is a generative model that sees every text as a mixture of topics and each sentence as a mixture of words. For example, the "football" topic will generate the word "penalty" with a high probability, while the "politics" topic will have a much higher probability for "politician" than for "penalty". Other words, such as "the" and "an", will have similar probabilities in all topics. LDA takes its name from the Dirichlet probability distribution. This is the prior distribution it assumes the topics in a text will have.

Modified from https://github.com/nlptown/nlp-notebooks/blob/master/Discovering%20and%20Visualizing%20Topics%20in%20Texts%20with%20LDA.ipynb

## Data

Insight fellows frequently come up with project ideas that revolve around topic modeling of online reviews. Here, we'll use a dataset of project 'trailers' from the website DonorsChoose providing a brief description of the reason/project for which a teacher is requesting funding.

In [37]:
data.head(2)

Unnamed: 0,id,text
0,846450,my students need a rug
1,2116678,my students need storage cabinets in our choir room to house our music library


## Preprocessing

Before we train a topic model, we need to tokenize our texts. Let's do this with the [spaCy](https://spacy.io/) NLP library. We need to load a statisti English and use spaCy to perform our first preprocessing pass:

In [7]:
import spacy

# If you haven't installed the spaCy language model, uncomment the following line and run this cell
#! python -m spacy download en_core_web_sm

# You will need to restart the notebook (go to the menu Kernel -> Restart) and re-run cells up to this point

In [8]:
nlp = spacy.load('en_core_web_sm')

texts = data['text'].tolist()
%time spacy_docs = list(nlp.pipe(texts))

CPU times: user 2min 24s, sys: 99 ms, total: 2min 24s
Wall time: 2min 24s


The text of each review is now a spaCy Doc that we can transform into a list of tokens. Instead of the original tokens, we're going to work with the **lemmas** instead. This will allow our model to generalize and understand that different forms of a word should be treated as one.

Stemming and Lemmatization both generate the root form of the words. Lemmatization uses the rules about a language and the resulting tokens are all actual words. For example, the word "thought" becomes the lemma "think". Stemming is a crude heuristic that chops off the ends of words such that the resulting tokens may not be actual words. Stemming is faster but only works well for simple words like "toys" and "toy".

This is the full list of our initial preprocessing steps: 
 
- remove all words shorter than 2 characters (these are often fairly uninteresting from a topical point of view)
- drop all stopwords
- lowercase remaining lemmas

In [9]:
docs = [[t.lemma_.lower() for t in doc if len(t.orth_) > 2 and not t.is_stop] for doc in spacy_docs]
for i in range(5):
    print(docs[i])
    print('\n')

['student', 'need', 'rug']


['student', 'need', 'storage', 'cabinet', 'choir', 'room', 'house', 'music', 'library']


['student', 'need', 'clarinet', 'trumpet', 'play', 'band']


['student', 'need', 'new', 'stand', 'high', 'school', 'order', 'accommodate', 'rapidly', 'grow', 'program']


['student', 'need', 'projector', 'able', 'view', 'live', 'performance', 'teaching', 'resource', 'internet']




Next, we also want to take frequent bigrams into account. **Bigrams are multiword units**, such as "colored pencil" that actually form one word rather than two. We'll use Gensim to first identify the frequent bigrams in the corpus, then append them to the list of tokens for the documents in which they appear. This means the bigrams will not be in their correct position in the text, but that's fine: topic models are bag-of-word models that ignore word position anyway.

In [10]:
import re
from gensim.models import Phrases

bigram = Phrases(docs, min_count=10)
tokens = []

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:  # bigrams can be recognized by the "_" that joins the invidual words
            docs[idx].append(token)
            tokens.append(token)
            
print(list(set(tokens))[:10])

['time_kid', 'ant_farm', 'binder_sheet', 'channel_energy', 'speed_stack', 'big_joe', 'complement_amazing', 'glitter_glue', 'oil_pastel', 'live_butterfly']


In [11]:

pickle_out = open('/home/russell/Documents/GitHub/DonorBooster/cleantrailersbig.pickle',"wb")
pickle.dump(docs, pickle_out)
pickle_out.close()

pickle_out = open('/home/russell/Documents/GitHub/DonorBooster/cleantokensbig.pickle',"wb")
pickle.dump(tokens, pickle_out)
pickle_out.close()


Next, we move on to the final Gensim-specific preprocessing steps. First, we create a dictionary representation of the documents. This dictionary will map each word to a unique ID and help us create bag-of-word representations of each document. These bag-of-word representations contain the ids of the words in the document, together with their frequency. Additionally, we can remove the least and most frequent words from the vocabulary. This improves the quality of our topic model and speeds up its training. The minimum frequency of a word is expressed as an absolute number, the maximum frequency is the proportion of documents a word is allowed to occur in.

In [12]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)
print('Number of unique words in original documents:', len(dictionary))

dictionary.filter_extremes(no_below=3, no_above=0.25)
print('Number of unique words after removing rare and common words:', len(dictionary))

print("Example representation of document 3:", dictionary.doc2bow(docs[2]))

Number of unique words in original documents: 19775
Number of unique words after removing rare and common words: 10353
Example representation of document 3: [(9, 1), (10, 1), (11, 1), (12, 1)]


Then we create bag-of-word representations for each document in the corpus:

In [13]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

## Training

Now it's time to train our topic model. We do this with the following parameters: 

- `corpus`: the bag-of-word representations of our documents
- `id2token`: the mapping from indices to words
- `num_topics`: the number of topics we want the model to identify
- `chunksize`: the number of documents the model sees for every update
- `passes`: the number of times we show the total corpus to the model during training
- `random_state`: we use a seed to ensure reproducibility.

On a corpus of this size, the training will typically about a minute.

In [14]:
from gensim.models import LdaModel

%time model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, chunksize=500, passes=3, random_state=1)

CPU times: user 51.8 s, sys: 128 ms, total: 51.9 s
Wall time: 52.3 s


## Results

Let's take a look at what the model has learnt. We do this by printing out the ten words that are most characteristic for each of the topics. Most topics show common words like "experience", "item" and "school" but it's hard to identify any other patterns in the data.

In [15]:
for (topic, words) in model.print_topics():
    print(topic+1, ":", words, '\n')

1 : 0.044*"group" + 0.034*"balance" + 0.031*"dry" + 0.030*"rug" + 0.029*"help" + 0.028*"erase" + 0.028*"focus" + 0.027*"dry_erase" + 0.026*"number" + 0.026*"active" 

2 : 0.076*"equipment" + 0.075*"camera" + 0.049*"document" + 0.044*"ball" + 0.037*"video" + 0.035*"band" + 0.033*"space" + 0.033*"large" + 0.033*"pack" + 0.033*"document_camera" 

3 : 0.085*"new" + 0.081*"play" + 0.071*"music" + 0.053*"year" + 0.047*"school" + 0.033*"glue" + 0.031*"instrument" + 0.031*"stick" + 0.029*"concept" + 0.026*"ipod" 

4 : 0.098*"art" + 0.044*"paint" + 0.040*"supply" + 0.035*"like" + 0.034*"paper" + 0.031*"write" + 0.028*"color" + 0.027*"manipulative" + 0.024*"material" + 0.024*"history" 

5 : 0.068*"table" + 0.046*"water" + 0.038*"curriculum" + 0.038*"projector" + 0.037*"understand" + 0.036*"grow" + 0.027*"energy" + 0.027*"help" + 0.027*"light" + 0.025*"exercise" 

6 : 0.058*"allow" + 0.048*"activity" + 0.042*"day" + 0.041*"classroom" + 0.040*"movement" + 0.039*"system" + 0.036*"sit" + 0.036*"carp

Another way of inspecting the topics is by visualizing them. This can be done with the [pyLDAvis](https://github.com/bmabey/pyLDAvis) library. PyLDAvis will show us how popular the topics are in our corpus, how similar the topics are, and which are the most salient words for this topic. Note it's important to set `sort_topics=False` on the call to pyLDAvis. If you don't, it will order the topics differently than Gensim. 

In [16]:
import pyLDAvis.gensim
import warnings

pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 

pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)

Finally, let's inspect the topics the model recognizes in some of the individual documents. Here we see how LDA tends to assign a high probability to a low number of topics for each documents, which makes its results easily interpretable.

In [17]:
for (text, doc) in zip(texts, docs):
    val=([(topic+1, prob) for (topic, prob) in model[dictionary.doc2bow(doc)] if ((prob > 0.5) and topic==8)])
       
    if len(val)!=0:
        print(text)
        print('-'*10)
        print(val)
        

    del val

my students need sets of centers books and games to practice reading comprehension skills
----------
[(9, 0.704843)]
my students need a binding machine a set of rotary cutters and card stock
----------
[(9, 0.5611189)]
my students need reading and math games to keep the learning interesting
----------
[(9, 0.6312703)]
my students need your help to purchase two graphing calculators
----------
[(9, 0.66624355)]
my students need your help to purchase two graphing calculators
----------
[(9, 0.66623926)]
my students need a calendar math activity program
----------
[(9, 0.6099988)]
my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need lakeshore stem science kits for hands on activities to promote problem solving skills
----------
[(9, 0.56249976)]
my students need hot dots cards and activities to give them a fun hands on way to self assess their progress in both reading and math
----------
[(9, 0.5275485)]
my students need tablets and cases fo

my students need story problem boxes to help them practice problem solving skills while i work with small math groups
----------
[(9, 0.5976389)]
my students need stock market game team fees paid this is the only expense for this activity
----------
[(9, 0.50836325)]
my students need the entry fee for the stock market game
----------
[(9, 0.68332916)]
my students need stock market game fees
----------
[(9, 0.6833291)]
my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need subscriptions to time for kids magazine to enhance their learning about our world
----------
[(9, 0.60272163)]
my students need subscriptions to time for kids and beanbags to make a reading center
----------
[(9, 0.6312506)]
my students need hot plates and challenging problem solving workbooks including the art of problem solving problem solving strategies and the art and craft of problem solving
----------
[(9, 0.5478052)]
my students need graphing calculators to push th

my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need two ipads for independent learning during center time
----------
[(9, 0.53511405)]
my students need math and reading stations
----------
[(9, 0.51250076)]
my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need calculators
----------
[(9, 0.52499735)]
my students need graphing calculators
----------
[(9, 0.7624983)]
my students need math instant learning centers can do reading games sony headphones and other supplies for multi leveled learning centers to keep them actively engaged in meaningful literacy and math activities
----------
[(9, 0.50308883)]
my students need dice math whiteboards independent math activities and the calendar math program to engage in small group lessons and independent math centers
----------
[(9, 0.5035834)]
my students need math centers and math manipulatives to get a hands on math learning experience
----------


my students need math games and centers to review their math skills in a fun way
----------
[(9, 0.561043)]
my students need hands on math and science learning center activities and materials to best meet their needs
----------
[(9, 0.6339144)]
my students need ny times upfront magazines
----------
[(9, 0.6833306)]
my students need snap circuits games and dash dot robots
----------
[(9, 0.51250046)]
my students need graphing calculators to help them see mathematics
----------
[(9, 0.8099988)]
my students need copies of prisoner b for further wwii reading and comprehension
----------
[(9, 0.5078131)]
my students need five ti nspire cx calculators
----------
[(9, 0.52499735)]
my students need these math games and activities to increase their math skills these skills include addition subtraction geometry money and problem solving
----------
[(9, 0.5718023)]
my students need cards and dice to help with their math facts and concepts
----------
[(9, 0.59274966)]
my students need games and ac

my students need calculators in my classroom to help with math computation skills
----------
[(9, 0.6535276)]
my students need subscriptions to time for kids to foster learning
----------
[(9, 0.5785843)]
my students need a class set subscription to time for kids magazine
----------
[(9, 0.6722173)]
my project needs individual subscriptions to time for kids
----------
[(9, 0.50833046)]
my students need different board games such as blockers and math chase advanced edition for oportunities to practice math skills and develop both social and problem solving skills
----------
[(9, 0.5739643)]
my students need ten ti graphing calculators to introduce them to the power of technology and its connection to math
----------
[(9, 0.6275147)]
my students need flash cards problem solving kits and various math quizzes to create a center for them to learn and practice skills they are not proficient in
----------
[(9, 0.5898383)]
my students need a united states rug for our reading center
----------


my students need subscriptions to time for kids magazine grade and subscriptions to time for kids grade
----------
[(9, 0.69615495)]
my students need various math hands on materials such as magnetic coins magnetic bills money activity station and a time measurement activity station
----------
[(9, 0.6050049)]
my students need daily center manipulatives to learn math writing and reading skills math and phonics computer games and writing alphabet reading and math activities
----------
[(9, 0.525787)]
my students need five ti nspire graphing calculators
----------
[(9, 0.7624984)]
my students need calculators to help them achieve success
----------
[(9, 0.6833317)]
my students need the materials pasta oregano tomatoes dressing and garlic powder to make spaghetti as a hands on project the fujifilm will be used for a time capsule
----------
[(9, 0.60999995)]
my students need ipads for our math center to help them raise their math skills
----------
[(9, 0.7104681)]
my students need tablets t

my students need an ipod for their center time
----------
[(9, 0.5125025)]
my students need transportation to and from the holocaust memorial center
----------
[(9, 0.52499914)]
my students need more graphing calculators in order to get them acclimated with the calculator s interface and functionality
----------
[(9, 0.76117396)]
my students need games like clue blokus and king of tokyo to support social emotional and problem solving skills
----------
[(9, 0.5500139)]
my students need game buzzers fidgets and batteries
----------
[(9, 0.68333125)]
my students need subscriptions to time for kids classroom magazine
----------
[(9, 0.50019187)]
my students need resources such as activity centers flash cards and clocks to practice telling time and counting money
----------
[(9, 0.5011669)]
my students need subscriptions to time for kids first grade edition
----------
[(9, 0.5083331)]
my students need playing cards six sided dice and ten sided dice for playing math games
----------
[(9, 0.5

my students need math centers and math games to help study and practice mathematical concepts
----------
[(9, 0.65229464)]
my students need copies of time for kids magazine
----------
[(9, 0.6099967)]
my students need fun and exciting math centers games and noodlers to practice all of the math skills learned in third grade
----------
[(9, 0.5471677)]
my students need skill centered board games including my money game grocery cart a shopping and math game and ethics on the job game
----------
[(9, 0.54233927)]
my students need osmo systems which will help them build problem solving skills
----------
[(9, 0.5321966)]
my students need card stock and x drawing paper
----------
[(9, 0.6099936)]
my students need a rug to sit on during our math calendar time
----------
[(9, 0.508333)]
my students need a class subscription to time for kids magazine copies
----------
[(9, 0.5322982)]
my students need fun interactive math centers and games to practice the every day mathematical skills they need


my students need board and card games like uno connect four battleship and more
----------
[(9, 0.50833625)]
my students need copies of time for kids magazine
----------
[(9, 0.6099967)]
my students need math materials that will address their different learning styles such as math manipulatives and math journals
----------
[(9, 0.5195792)]
my students need tabletop communication centers
----------
[(9, 0.5249991)]
my students need graphing calculators for integrated algebra
----------
[(9, 0.50835264)]
my students need graphing calculators as we are currently sharing for a class of just a few more can help a lot
----------
[(9, 0.53168255)]
my students need batteries
----------
[(9, 0.52499336)]
my students need geometric boards that will help hone their math skills
----------
[(9, 0.503598)]
my students need subscriptions to time for kids k
----------
[(9, 0.609997)]
my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need subscriptions to 

my students need hands on science materials to use in learning centers
----------
[(9, 0.52872694)]
my students need glockenspiels to help increase hands on learning time and cut instrument wait time in half
----------
[(9, 0.525224)]
my students need books computer games and hands on activities like phonics stamps and learning centers to help enhance their reading skills
----------
[(9, 0.5409791)]
my students need lego s learntolearn sets for motivated engaged problem solving skills
----------
[(9, 0.50628155)]
my students need literacy and math centers to increase their skills in reading and math and to also give them a desire to have fun learning
----------
[(9, 0.55575)]
my students need subscriptions to time for kids magazine
----------
[(9, 0.8642832)]
my students need a the kekow toolbox and various locks to solve a breakout box which will improve their problem solving and collaboration skills
----------
[(9, 0.6425615)]
my students need fraction math games computer quiz games 

my students need one class subscription to time for kids magazine
----------
[(9, 0.5816815)]
my students need graphing calculators to use in class in order to enhance their technical math skills and succeed on the math portion of the sat
----------
[(9, 0.63718647)]
my students need math games and manipulatives for extra practice
----------
[(9, 0.5083414)]
my students need knee pads and volleyballs for practices and games
----------
[(9, 0.5125067)]
my project needs subscriptions to time for kids magazine
----------
[(9, 0.75624764)]
my students need a subscription to time for kids magazine
----------
[(9, 0.8642832)]
my students need subscriptions to the time for kids magazine
----------
[(9, 0.86428326)]
my students need scholastic magazines in spanish to improve their reading and comprehension skills in spanish class
----------
[(9, 0.5566264)]
my students need graphing calculators for their chemistry and physics classes
----------
[(9, 0.5083466)]
my students need graphing calcul

my students need colorful reusable center activities and materials for everyday learning
----------
[(9, 0.52878433)]
my students need more science reading materials and hands on materials
----------
[(9, 0.6293433)]
my students need ipads for math centers about finance
----------
[(9, 0.5124999)]
my students need calculators to check their math problems
----------
[(9, 0.6100086)]
my students need another year subscription for ixl to practice our math skills
----------
[(9, 0.67500067)]
my students need math problem solving and practice journals to become better problem solvers
----------
[(9, 0.7562597)]
my students need math manipulatives to help them strengthen their math skills through hands on learning
----------
[(9, 0.58855766)]
my students need a year subscription to ixl to practice their math and language arts skills
----------
[(9, 0.5062543)]
my students need math centers that provide hands on learning
----------
[(9, 0.6549534)]
my students need a carpet for reading and ce

my students need games and hands on activities to help them learn basic math skills
----------
[(9, 0.7078653)]
my students need a variety of hands on math activities like tracing numbers math centers counters and measurement centers to support our math program
----------
[(9, 0.5247798)]
my students need math manipulatives games and visuals to reinforce basic math skills
----------
[(9, 0.561117)]
my students need hot dot pens and independent reading math and science practice quiz cards
----------
[(9, 0.5386589)]
my students need six graphing calculators
----------
[(9, 0.7624983)]
my finance students need calculators that work
----------
[(9, 0.6833291)]
my students need calculators for higher level problem solving
----------
[(9, 0.578572)]
my students need subscriptions to the time for kids magazine
----------
[(9, 0.8642832)]
i would like to have subscriptions to time for kids magazine for my students so that they can become aware of world news
----------
[(9, 0.57863754)]
my stu

In [18]:
for (text, doc) in zip(texts[:20], docs[:20]):
    print(text)
    print('-'*10)
    print([(topic+1, prob) for (topic, prob) in model[dictionary.doc2bow(doc)] if prob > 0.7])
    print('\n')

my students need a rug
----------
[]


my students need storage cabinets in our choir room to house our music library
----------
[]


my students need clarinets and trumpet to play in the band
----------
[]


my students need new stands at the high school in order to accommodate our rapidly growing program
----------
[]


my students need a projector to be able to view live performances and other teaching resources from the internet
----------
[]


my students need black binders to hold our music for daily and concert use
----------
[]


my students need a wireless laptop my students need a clean classroom environment my students need bar code scanners for fast check out in library
----------
[]


my students need a set of bucket drums and stability balls to become rhythmic wonders in the general music classroom
----------
[]


my students need updated rhythm instruments to replace the current supply some of which are broken and all of which are over years old they need bells egg shake

In [19]:
topic_nums = []
for (text, doc) in zip(texts, docs):
    probs = np.array(model[dictionary.doc2bow(doc)])
    topic_nums.append(probs[np.argsort(probs[:,-1])][-1,0])
    
data['topic'] = topic_nums

product_vs_topic = pd.crosstab(data['id'], data['topic'])
product_vs_topic = product_vs_topic.T / product_vs_topic.sum(axis = 1) * 100
product_vs_topic = product_vs_topic.T

In [20]:
pd.set_option('display.max_columns', None)
#pd.set_option("max_rows", None) #undo by resetting --- 
#pd.reset_option("display.max_rows")
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 50)

In [21]:
data.head()

Unnamed: 0,id,text,topic
0,846450,my students need a rug,0.0
1,2116678,my students need storage cabinets in our choir room to house our music library,2.0
2,1892178,my students need clarinets and trumpet to play in the band,2.0
3,1143364,my students need new stands at the high school in order to accommodate our rapidly growing program,3.0
4,779936,my students need a projector to be able to view live performances and other teaching resources from the internet,7.0


In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

In [None]:
product_vs_topic.round(2).style.apply(background_gradient,
               cmap='YlGnBu',
               m=product_vs_topic.min().min(),
               M=product_vs_topic.max().max(),
               low=0.5,
               high=0.8)

In [None]:
data.loc[data['id'].isin(['4957430','4957502','4957562'])]

## Conclusions

Many collections of unstructured texts don't come with any labels. Topic models such as Latent Dirichlet Allocation are a useful technique to discover the most prominent topics in such documents. Gensim makes training these topics model easy, and pyLDAvis presents the results in a visually attractive way. Together they form a powerful toolkit to better understand what's inside large sets of documents and to explore subsets of related texts. However, these methods can perform poorly in short texts with vague or unspecified subjects. Although traditional topic models are lacking in more semantic information (they don't use word embeddings, for instance), they can be really quick way of getting insights into large collections of documents.