In [1]:
import pandas as pd
import requests
import json
import csv
import time
import datetime
import re
import string
import gzip
import os

import numpy as np
import pickle #for saving output files, pickles

def datetime_to_unix_time(d):
    return int(time.mktime(datetime.datetime.strptime(d, "%m/%d/%Y").timetuple()))

## Import trailers from my pickle file!

In [2]:
with open('/home/russell/Documents/DataScience/DonorsChoose/Data/trailers.pickle', 'rb') as handle:
    trailers = pickle.load(handle)

In [3]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from sqlalchemy.sql import table, column, select, update, insert
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

import pandas as pd

#In Python: Define your username and password used above. I've defined the database name (we're 
#using a dataset on births, so I call it birth_db). 
dbname = 'donors_db'
username = 'russell'
pswd = 'bradypodion'

## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print('postgresql://%s:%s@localhost/%s'%(username,pswd,dbname))
print(engine.url)
# Replace localhost with IP address if accessing a remote server

## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))
print(engine.url)


postgresql://russell:bradypodion@localhost/donors_db
postgresql://russell:bradypodion@localhost/donors_db
True
postgresql://russell:bradypodion@localhost/donors_db


## Process trailers from DonorsChoose (from donors_db in postgreSQL)

In [4]:
# connect:
con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

### query: from historical data
merge_query = """
SELECT * FROM merge_time;
"""

merged_data = pd.read_sql_query(merge_query,con)

In [35]:
data = merged_data[['id','fulfillmentTrailer']]
data.columns = ['id','text']

###########drop rows that have www or donorchoose in their fulfillmentTrailers---the wrong info was grabbed
print('Rows coming in '+str(len(data.index)))
data= data[~data.text.str.contains("www")]
data= data[~data.text.str.contains("donorschoose")]
print('Row coming out after website/junk filtering '+str(len(data.index)))

pd.set_option('max_colwidth', 400)
data.head()

Rows coming in 82091
Row coming out after website/junk filtering 79992


Unnamed: 0,id,text
0,846450,My students need a rug.
1,2116678,My students need storage cabinets in our choir room to house our music library.
2,1892178,My students need 4 clarinets and 1 trumpet to play in the band.
3,1143364,My students need 6 new stands at the high school in order to accommodate our rapidly growing program.
4,779936,My students need a projector to be able to view live performances and other teaching resources from the Internet.


### Clean up text

In [36]:
# not using re.sub(r'[^a-zA-Z\s]', '', t) to avoid losing emojis
text = [re.sub(r'([0-9]+?)', ' ', t).lower() for t in data['text']] # remove all numbers and symbols
text = [re.sub(r'(!|"|#|\$|%|&|\'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|{|\||}|~)+', ' ', t) for t in text]
data['text'] = [re.sub(r'\s+\s', ' ', t).strip() for t in text] # repace double spaces with single spaces

data = data.loc[data['text'].map(len) > 3].reset_index(drop = True) # keep only strings longer than 5 characters

# Discovering and Visualizing Topics in Texts

Most typical cases of text classification in NLP (named entity recognition, question answering, etc) require training datasets where each piece of text is associted with a label. However, in real-life scenarios, text collections rarely come with metadata labels that tell you what the texts are about. When people answer open-ended survey questions, for example, they don't repeat detectable keywords in their answer with the topics they discuss.

**Topic modeling** is an unsupervised classification technique that is able to discover the topics in a collection of texts by looking at their commonalities. In this context, "topics" refers to groups of related words that often occur together in the same text. For example, in a collection of newspaper articles a topic model may identify one topic that is made up of words such as "politician", "law", and "parliament", and another characterized by words such as "player", "match" and "penalty". Topic models only go as far as identifying clusters of related words; a human is still needed to interpret these clusters and give them labels such as "politics" and "football". 

One of the most popular topic models is Latent Dirichlet Allocation (LDA). LDA is a generative model that sees every text as a mixture of topics and each sentence as a mixture of words. For example, the "football" topic will generate the word "penalty" with a high probability, while the "politics" topic will have a much higher probability for "politician" than for "penalty". Other words, such as "the" and "an", will have similar probabilities in all topics. LDA takes its name from the Dirichlet probability distribution. This is the prior distribution it assumes the topics in a text will have.

Modified from https://github.com/nlptown/nlp-notebooks/blob/master/Discovering%20and%20Visualizing%20Topics%20in%20Texts%20with%20LDA.ipynb

## Data

Insight fellows frequently come up with project ideas that revolve around topic modeling of online reviews. Here, we'll use a dataset of project 'trailers' from the website DonorsChoose providing a brief description of the reason/project for which a teacher is requesting funding.

In [37]:
data.head(2)

Unnamed: 0,id,text
0,846450,my students need a rug
1,2116678,my students need storage cabinets in our choir room to house our music library


## Preprocessing

Before we train a topic model, we need to tokenize our texts. Let's do this with the [spaCy](https://spacy.io/) NLP library. We need to load a statisti English and use spaCy to perform our first preprocessing pass:

In [38]:
import spacy

# If you haven't installed the spaCy language model, uncomment the following line and run this cell
#! python -m spacy download en_core_web_sm

# You will need to restart the notebook (go to the menu Kernel -> Restart) and re-run cells up to this point

In [None]:
nlp = spacy.load('en_core_web_sm')

texts = data['text'].tolist()
%time spacy_docs = list(nlp.pipe(texts))

The text of each review is now a spaCy Doc that we can transform into a list of tokens. Instead of the original tokens, we're going to work with the **lemmas** instead. This will allow our model to generalize and understand that different forms of a word should be treated as one.

Stemming and Lemmatization both generate the root form of the words. Lemmatization uses the rules about a language and the resulting tokens are all actual words. For example, the word "thought" becomes the lemma "think". Stemming is a crude heuristic that chops off the ends of words such that the resulting tokens may not be actual words. Stemming is faster but only works well for simple words like "toys" and "toy".

This is the full list of our initial preprocessing steps: 
 
- remove all words shorter than 2 characters (these are often fairly uninteresting from a topical point of view)
- drop all stopwords
- lowercase remaining lemmas

In [10]:
docs = [[t.lemma_.lower() for t in doc if len(t.orth_) > 2 and not t.is_stop] for doc in spacy_docs]
for i in range(5):
    print(docs[i])
    print('\n')

['student', 'need', 'rug']


['student', 'need', 'storage', 'cabinet', 'choir', 'room', 'house', 'music', 'library']


['student', 'need', 'clarinet', 'trumpet', 'play', 'band']


['student', 'need', 'new', 'stand', 'high', 'school', 'order', 'accommodate', 'rapidly', 'grow', 'program']


['student', 'need', 'projector', 'able', 'view', 'live', 'performance', 'teaching', 'resource', 'internet']




Next, we also want to take frequent bigrams into account. **Bigrams are multiword units**, such as "colored pencil" that actually form one word rather than two. We'll use Gensim to first identify the frequent bigrams in the corpus, then append them to the list of tokens for the documents in which they appear. This means the bigrams will not be in their correct position in the text, but that's fine: topic models are bag-of-word models that ignore word position anyway.

In [11]:
import re
from gensim.models import Phrases

bigram = Phrases(docs, min_count=10)
tokens = []

for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:  # bigrams can be recognized by the "_" that joins the invidual words
            docs[idx].append(token)
            tokens.append(token)
            
print(list(set(tokens))[:10])

['differentiate_instruction', 'fifth_grade', 'snap_circuit', 'english_dictionary', 'think_critically', 'soprano_xylophone', 'increase_motivation', 'property_earth', 'lego_robotic', 'paint_brush']


In [12]:

pickle_out = open('/home/russell/Documents/GitHub/DonorBooster/cleantrailersbig.pickle',"wb")
pickle.dump(docs, pickle_out)
pickle_out.close()

pickle_out = open('/home/russell/Documents/GitHub/DonorBooster/cleantokensbig.pickle',"wb")
pickle.dump(tokens, pickle_out)
pickle_out.close()


Next, we move on to the final Gensim-specific preprocessing steps. First, we create a dictionary representation of the documents. This dictionary will map each word to a unique ID and help us create bag-of-word representations of each document. These bag-of-word representations contain the ids of the words in the document, together with their frequency. Additionally, we can remove the least and most frequent words from the vocabulary. This improves the quality of our topic model and speeds up its training. The minimum frequency of a word is expressed as an absolute number, the maximum frequency is the proportion of documents a word is allowed to occur in.

In [13]:
from gensim.corpora import Dictionary

dictionary = Dictionary(docs)
print('Number of unique words in original documents:', len(dictionary))

dictionary.filter_extremes(no_below=3, no_above=0.25)
print('Number of unique words after removing rare and common words:', len(dictionary))

print("Example representation of document 3:", dictionary.doc2bow(docs[2]))

Number of unique words in original documents: 20297
Number of unique words after removing rare and common words: 10570
Example representation of document 3: [(9, 1), (10, 1), (11, 1), (12, 1)]


Then we create bag-of-word representations for each document in the corpus:

In [14]:
corpus = [dictionary.doc2bow(doc) for doc in docs]

## Training

Now it's time to train our topic model. We do this with the following parameters: 

- `corpus`: the bag-of-word representations of our documents
- `id2token`: the mapping from indices to words
- `num_topics`: the number of topics we want the model to identify
- `chunksize`: the number of documents the model sees for every update
- `passes`: the number of times we show the total corpus to the model during training
- `random_state`: we use a seed to ensure reproducibility.

On a corpus of this size, the training will typically about a minute.

In [16]:
from gensim.models import LdaModel

%time model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, chunksize=500, passes=3, random_state=1)

CPU times: user 42.5 s, sys: 35.5 ms, total: 42.6 s
Wall time: 42.6 s


## Results

Let's take a look at what the model has learnt. We do this by printing out the ten words that are most characteristic for each of the topics. Most topics show common words like "experience", "item" and "school" but it's hard to identify any other patterns in the data.

In [17]:
for (topic, words) in model.print_topics():
    print(topic+1, ":", words, '\n')

1 : 0.070*"work" + 0.070*"chair" + 0.034*"ball" + 0.032*"help" + 0.028*"day" + 0.028*"stool" + 0.026*"wobble" + 0.025*"bag" + 0.025*"learn" + 0.025*"place" 

2 : 0.074*"tablet" + 0.041*"use" + 0.036*"love" + 0.036*"wireless" + 0.036*"organize" + 0.035*"chart" + 0.034*"help" + 0.034*"cover" + 0.028*"community" + 0.027*"novel" 

3 : 0.063*"computer" + 0.059*"project" + 0.051*"classroom" + 0.048*"create" + 0.048*"book" + 0.045*"level" + 0.042*"library" + 0.035*"video" + 0.031*"research" + 0.030*"chromebook" 

4 : 0.090*"supply" + 0.079*"art" + 0.067*"paper" + 0.062*"pencil" + 0.052*"board" + 0.041*"marker" + 0.035*"paint" + 0.029*"basic" + 0.023*"material" + 0.021*"fiction" 

5 : 0.201*"book" + 0.112*"set" + 0.086*"read" + 0.049*"copy" + 0.037*"center" + 0.034*"class" + 0.023*"reading" + 0.023*"include" + 0.019*"story" + 0.019*"unit" 

6 : 0.105*"play" + 0.094*"music" + 0.078*"tool" + 0.044*"item" + 0.039*"set" + 0.037*"block" + 0.029*"drum" + 0.023*"outdoor" + 0.018*"accessory" + 0.018*"

Another way of inspecting the topics is by visualizing them. This can be done with the [pyLDAvis](https://github.com/bmabey/pyLDAvis) library. PyLDAvis will show us how popular the topics are in our corpus, how similar the topics are, and which are the most salient words for this topic. Note it's important to set `sort_topics=False` on the call to pyLDAvis. If you don't, it will order the topics differently than Gensim. 

In [18]:
import pyLDAvis.gensim
import warnings

pyLDAvis.enable_notebook()
warnings.filterwarnings("ignore", category=DeprecationWarning) 

pyLDAvis.gensim.prepare(model, corpus, dictionary, sort_topics=False)

Finally, let's inspect the topics the model recognizes in some of the individual documents. Here we see how LDA tends to assign a high probability to a low number of topics for each documents, which makes its results easily interpretable.

In [33]:
for (text, doc) in zip(texts, docs):
    val=([(topic+1, prob) for (topic, prob) in model[dictionary.doc2bow(doc)] if ((prob > 0.5) and topic==17)])
       
    if len(val)!=0:
        print(text)
        print('-'*10)
        print(val)
        

    del val

my students need a durable and dependable cassette and cd player
----------
[(18, 0.61659014)]
the cost of materials to learn about time including a learn all day clock a set of books about time and a set of clock puzzles is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.75297713)]
the cost of a kidney shape activity table with chairs is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.88535327)]
the cost of a kidney shape activity table with chairs is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------

the cost of copies of scrabble and scrabble dictionaries from barnes noble is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509691)]
the cost of a califone stereo boombox purchased from abc school supply is including shipping and a target new href https www donorschoose org html fulfillment htm onclick g openwindow https www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90353596)]
the cost of the midland two way radios is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9512188)]
the cost of the inkjet cardtridge sets is including shipping and a target new href https www donorschoose org html fulfillmen

the cost of sending students and four chaperones to alcatraz is including transportation and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.97432417)]
the cost of copies of house of the scorpion is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.92775583)]
the cost of subscriptions to time for kids magazine from time for kids is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.7925664)]
the cost of an overhead projector from quill corporation is including shipping and a target new href http www donorschoose org ht

the cost of the lcd projector is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9059635)]
the cost of the digital camcorder and a set of tapes is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90816194)]
the cost of a fluent plus science classroom set is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8701043)]
the cost of a trip to the chabot space and science center is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose or

the cost of a class set of yoga mats is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90814734)]
the cost of these soccer goals is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.92805594)]
the cost of these soccer balls is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9059698)]
the cost of thirty jump ropes from nasco is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindo

the cost of a logitech x speaker system from the quill corporation is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9055234)]
the cost of various art supplies including glue paint and colored pencils from quill corporation highsmith inc and nasco is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.78503597)]
the cost of an ocean drum tambourines and wrist ankle bells is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8645216)]
the cost of a sony dcrhc digital video camera with two minute ta

my students need four coxorbs to help our rowing team achieve their potential as athletes and as a team
----------
[(18, 0.51249856)]
the cost of this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of a large teacher s easel is including shipping and a target new href http www donorschoose

my students need storybooks to target different developmental speech sounds
----------
[(18, 0.51250136)]
my students need durable backpacks
----------
[(18, 0.5249887)]
the cost of the power grid board game is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9278085)]
the cost of anti gang books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.95096636)]
ten boxes of pens two boxes of pencils ten uni ball pens sixty notebooks twenty pocket calculators and twenty correction pens from quill corporation will cost including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfil

the cost of a library book sorting cart is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90585434)]
the cost of the books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509663)]
the cost of these math books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9277734)]
the cost of materials to create these work portfolios is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillw

the cost of a discover america rug is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9512541)]
the cost of copies of cursive writing is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.927347)]
the cost of yoga classes is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9243661)]
the cost of pairs of binoculars from carolina biological supply co is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm ful

the cost of a hockey floor set and a caddy bag is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.92977047)]
the cost of the gymnastic mats is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9512543)]
the cost of packs of cameras from office depot and books from barnes noble including courage bernard waber the little engine that could and new york s bravest is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8691764)]
the cost of the books is including shipping and a target new href http www 

the cost of a model skeleton from sargent welch is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9512559)]
the cost of microscope kits and a microscope that can be used with a projector is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8841538)]
the cost of poster board laminating sheets colored pencils and construction paper is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.7916856)]
the cost of illustrated books on human biology is including shipping and a target new href https www don

the cost of copies each of holes james and the giant peach charlie and the chocolate factory and the giver is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509691)]
the cost of the geocoaching gps units is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.950975)]
the cost of the magazine subscriptions is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.92805225)]
the cost of rubbermaid recycling container wastebaskets is including shipping and a target new href http www donorschoose org htm

the cost of these art textbooks is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.92796147)]
the cost of the supplies is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9762498)]
the cost of these art supplies is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90893567)]
the cost of a printer and printer ink cartridges from quill corporation and office depot is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose 

the cost of the books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509663)]
the cost of thirty copies of the lion the witch and the wardrobe is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.927813)]
the cost of different travel books for countries such as peru argentina venezuela and spain is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9521695)]
the cost of the draper luma manual wall screen matte white screen finish h x w is including shipping and a target new href http www don

the cost of the baby carriers and car seats is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90626884)]
the cost of this playground equipment is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9059864)]
the cost of team building activities including a nebula track and a team labyrinth set is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8945229)]
the cost of the dr jean cds is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow htt

the cost of the pairs of snowshoes including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9487069)]
the cost of various playground and organizational equipment from kaplan early learning company cannon sports and nasco is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.81592864)]
the cost of this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of grammar and punctuation manipulatives and supply organizers is including shipping and a target new href http www donorschoose org html f

the cost of a viewing of the lion king and an educational class backstage for people is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.901286)]
the cost of sending students and chaperones to see aesop s fables is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.97432417)]
the cost of making copies of the dvd of the students show is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9239473)]
the cost of sending students and chaperones to see a christmas carol is including a target new href http www donorschoose org html fulfillment 

the cost of the child size magnetic white board magnetic numbers and shapes dry erase markers and small child size erasers is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.7311031)]
the cost of career costumes and dress up trunks from lakeshore learning is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8440361)]
the cost of the dr seuss books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of a sony digital camcorder and a kodak easyshare camera and printer dock from

the cost of the copies of wicked is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509691)]
the cost of two buses for a trip to princeton is including a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9486961)]
the cost of multiple copies of nine young adult titles is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9057353)]
the cost of mini tape recorders is including shipping and a target new href https www donorschoose org html fulfillment htm onclick g openwindow https www donorschoose org html fulfillment htm

the cost of a vacuum and four brooms from the quill corporation six dustpans a sponge mop a mop refill and a bucket from nasco is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8902563)]
the cost of panasonic digital camcorders is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9280658)]
the cost of sandisk cruzers for this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.97121197)]
the cost of this projector is including shipping and a target new href http www donorschoose org h

the cost of a day at yankee stadium for students and teachers is including a target new href https www donorschoose org html fulfillment htm onclick g openwindow https www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9240774)]
the cost of twenty five copies of dreams in the golden country and six packs of post it notes is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.84528506)]
the cost of three computer programs english spanish interpreter standard from wordmagicsoft com and transwhiz chinese standard version and chinese partner standard version from translation net is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
---

the cost of sport balls cones and outside games is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8627559)]
the cost of life skills math games is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8848553)]
the cost of a canon powershot a digital camera from audio graphic systems is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8646969)]
the cost of the sony handycam and carrying case is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwin

the cost of copies of harry potter and the sorcerer s stone a teacher s guide to the book and an audiobook of it all from akj educational services is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.86358076)]
various art supplies from dick blick art materials including one laguna pacifica pottery wheel two sets of sculpture wire and lb of crea stone will cost including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8060671)]
the cost of basic starter chess sets a demo chess board and a quartz chess clock from wholesale chess is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html

the cost of counting sets and placemats from really good stuff filmic archives is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9296776)]
the cost of a califone dvd vcr combo from audio graphic systems is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.90587157)]
the cost of three field trips and water bottles is including shipping for the water bottles and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.79272985)]
the cost of chess sets sewing machines and writing journals is including shipping and a target new

the cost of inspirational art supplies that inspire a good or service but are not ready made is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8483694)]
the cost of the history books is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9277795)]
the cost of subscriptions to scholastic news magazine for grade age appropriate current events stories in colorful weekly issues from scholastic is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.6791096)]
the cost of subscriptions to kindergarten stu

my students need team hoodies and team t shirts
----------
[(18, 0.51250935)]
the cost of the minidv handycam is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of notebooks binders paper pencils and highlighters is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.86428094)]
the cost of these jump ropes is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9059582)]
the cost of this soccer net is including shipping and a target new href http www donorschoose org html fulfillme

the cost of pedometers from nasco binding material from highsmith and cardstock from quill is including shipping and a target new href https www donorschoose org html fulfillment htm onclick g openwindow https www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.8846871)]
the cost of mats from cannon sports is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9259525)]
the cost of pedometers purchased from cannon sports inc is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9128272)]
the cost of american symbols photo book sets from lakeshore learning is including shipping and a target new href http www donors

the cost of these books about artists from barnes and noble is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509663)]
the cost of the glaze is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9512722)]
the cost of the word puzzles and artistic coloring pages is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9057537)]
the cost of this flight and aerodynamics of a dragonfly project is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindo

my students need smocks and aprons
----------
[(18, 0.52499264)]
the cost of copies of the alchemist is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9509691)]
the cost of a sony camcorder is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.95126474)]
the cost of two tables for this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.93959135)]
the cost of literacy based games read a long books and classic children s literature is including shipping and a target new href http www do

the cost of the digital camera and accessories is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.884736)]
the cost of ti calculators is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.951253)]
the cost of this proposal is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfillwindow return false fulfillment a
----------
[(18, 0.9756409)]
the cost of calculators from school specialty inc is including shipping and a target new href http www donorschoose org html fulfillment htm onclick g openwindow http www donorschoose org html fulfillment htm fulfil

In [None]:
    print(text)
    print('-'*10)
    
    print('\n')
    del val

In [25]:
text=text[0]
doc=docs[0]

In [19]:
for (text, doc) in zip(texts[:20], docs[:20]):
    print(text)
    print('-'*10)
    print([(topic+1, prob) for (topic, prob) in model[dictionary.doc2bow(doc)] if prob > 0.3])
    print('\n')

my students need a rug
----------
[(19, 0.52499825)]


my students need storage cabinets in our choir room to house our music library
----------
[(11, 0.38124067)]


my students need clarinets and trumpet to play in the band
----------
[(11, 0.40993887)]


my students need new stands at the high school in order to accommodate our rapidly growing program
----------
[]


my students need a projector to be able to view live performances and other teaching resources from the internet
----------
[(11, 0.33889565)]


my students need black binders to hold our music for daily and concert use
----------
[(6, 0.300085)]


my students need a wireless laptop my students need a clean classroom environment my students need bar code scanners for fast check out in library
----------
[]


my students need a set of bucket drums and stability balls to become rhythmic wonders in the general music classroom
----------
[(6, 0.60123456)]


my students need updated rhythm instruments to replace the current s

In [None]:
docs

Looping through all texts, let's save the most likely topic number.

In [None]:
docs[0]

In [None]:
texts[0]

In [None]:
topic_nums = []
for (text, doc) in zip(texts, docs):
    probs = np.array(model[dictionary.doc2bow(doc)])
    topic_nums.append(probs[np.argsort(probs[:,-1])][-1,0])
    
data['topic'] = topic_nums

product_vs_topic = pd.crosstab(data['id'], data['topic'])
product_vs_topic = product_vs_topic.T / product_vs_topic.sum(axis = 1) * 100
product_vs_topic = product_vs_topic.T

In [None]:
pd.set_option('display.max_columns', None)
#pd.set_option("max_rows", None) #undo by resetting --- 
#pd.reset_option("display.max_rows")
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 50)

In [None]:
data.head()

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colors
def background_gradient(s, m, M, cmap='PuBu', low=0, high=0):
    rng = M - m
    norm = colors.Normalize(m - (rng * low),
                            M + (rng * high))
    normed = norm(s.values)
    c = [colors.rgb2hex(x) for x in plt.cm.get_cmap(cmap)(normed)]
    return ['background-color: %s' % color for color in c]

product_vs_topic.round(2).style.apply(background_gradient,
               cmap='YlGnBu',
               m=product_vs_topic.min().min(),
               M=product_vs_topic.max().max(),
               low=0.5,
               high=0.8)

In [None]:
data.loc[data['id'].isin(['4957430','4957502','4957562'])]

## Conclusions

Many collections of unstructured texts don't come with any labels. Topic models such as Latent Dirichlet Allocation are a useful technique to discover the most prominent topics in such documents. Gensim makes training these topics model easy, and pyLDAvis presents the results in a visually attractive way. Together they form a powerful toolkit to better understand what's inside large sets of documents and to explore subsets of related texts. However, these methods can perform poorly in short texts with vague or unspecified subjects. Although traditional topic models are lacking in more semantic information (they don't use word embeddings, for instance), they can be really quick way of getting insights into large collections of documents.