# Step 1: Read & Process Transcript Files
We first read the transcript text files, grouping together the files in the "Free" group as one set, and the "D-C" group as another set.

To avoid processing errors, we first replace typographic Unicode characters in the text (such as distinct opening and closing quotes, and ellipses)
with their plain ASCII equivalents, and convert all text to lowercase. These transcripts also come with the transcriber's observations, such as [laughter], or [silent]. We remove all such observations so that we are only looking at team members' speech.

Finally, we remove all stop words (this happens in Step 2.1, when the document-term matrix is computed).

In [1]:
import re
# Locations of input files. These files have been anonymized, and the files themselves will not be available
# in this folder, to preserve the privacy of the teams.
# Both paths end with "/" because file names are appended to them by plain string concatenation.
textPathFree = "../../../private/brainstorming_viz_data/free_group/"
textPathDC = "../../../private/brainstorming_viz_data/d-c_group/"

# Teamwise transcripts (of divergent processes) of the "Free Brainstorming" group (teams A-I)
fileNamesFreeDivergent = ['teamA_part1_divergent.txt', 'teamB_part1_divergent.txt', 'teamC_part1_divergent.txt',
                          'teamD_part1_divergent.txt', 'teamE_part1_divergent.txt', 'teamF_part1_divergent.txt',
                          'teamG_part1_divergent.txt', 'teamH_part1_divergent.txt', 'teamI_part1_divergent.txt']

# Teamwise transcripts (of divergent processes) of the "Divergent-Convergent (D-C) Brainstorming" group (teams J-R)
# NOTE(review): judging by their names, a few of these files (teams M, O and Q) combine a divergent
# part with a convergent part in a single transcript - confirm that they belong in this list.
fileNamesDCDivergent = ['teamJ_part1_divergent.txt', 'teamJ_part3_divergent.txt', 'teamK_part1_divergent.txt',
                        'teamK_part3_divergent.txt', 'teamL_part1_divergent.txt', 'teamL_part3_divergent.txt', 
                        'teamM_part1_divergent.txt', 'teamM_part2_3_convergent_divergent.txt', 'teamN_part1_divergent.txt',
                        'teamN_part3_divergent.txt', 'teamO_part1_divergent.txt', 'teamO_part2_3_conv_div.txt',
                        'teamP_part1_divergent.txt', 'teamP_part3_divergent.txt', 'teamQ_part1_divergent.txt',
                        'teamQ_part3_4_div_conv.txt', 'teamR_part1_divergent.txt', 'teamR_part3_divergent.txt'] 

# Teamwise transcripts (of convergent processes) of the "Free Brainstorming" group (teams A-I)
fileNamesFreeConvergent = ['teamA_part2_convergent.txt', 'teamB_part2_convergent.txt', 'teamC_part2_convergent.txt',
                           'teamD_part2_convergent.txt', 'teamE_part2_convergent.txt', 'teamF_part2_convergent.txt',
                           'teamG_part2_convergent.txt', 'teamH_part2_convergent.txt', 'teamI_part2_convergent.txt']

# Teamwise transcripts (of convergent processes) of the "Divergent-Convergent (D-C) Brainstorming" group (teams J-R)
fileNamesDCConvergent = ['teamJ_part2_convergent.txt', 'teamJ_part4_convergent.txt', 'teamK_part2_convergent.txt',
                         'teamK_part4_convergent.txt', 'teamL_part2_convergent.txt', 'teamL_part4_convergent.txt',
                         'teamM_part4_convergent.txt', 'teamN_part2_convergent.txt', 'teamN_part4_convergent.txt',
                         'teamO_part4_convergent.txt', 'teamP_part2_convergent.txt', 'teamP_part4_convergent.txt',
                         'teamQ_part2_convergent.txt', 'teamR_part2_convergent.txt', 'teamR_part4_convergent.txt']  

def openFiles(fileNamesList, filePath):
    """Open every named file for reading and return the open file objects.

    Each entry of `fileNamesList` is appended to `filePath` by plain string
    concatenation, so `filePath` must already end with a path separator.
    The files are opened in text mode with UTF-8 decoding and are returned,
    still open, in the same order as the names. Closing them is left to the
    caller.
    """
    return [open(filePath + name, 'r', encoding='utf8') for name in fileNamesList]

# Open every transcript of each group: the divergent files first, followed by
# the convergent files. The resulting file objects stay open until they are read.
filesListFree = openFiles(fileNamesFreeDivergent + fileNamesFreeConvergent, textPathFree)
filesListDC = openFiles(fileNamesDCDivergent + fileNamesDCConvergent, textPathDC)

# Some of these files contain typographic (non-ASCII) Unicode characters that need to be replaced with ASCII equivalents.
def removeSpecials(filesList):
    """Read every file in `filesList` and return the combined, cleaned text.

    Typographic Unicode characters are swapped for plain ASCII equivalents:
    curly single quotes become ', curly double quotes become ", and the
    ellipsis character becomes three dots. The contents of the files are
    joined back to back (no separator is inserted between files) and the
    whole result is converted to lowercase.

    The file objects are read to the end but are not closed here.
    """
    # One translation table handles all substitutions in a single pass.
    asciiEquivalents = str.maketrans({
        "\u2018": "'",
        "\u2019": "'",
        "\u201c": '"',
        "\u201d": '"',
        "\u2026": "...",
    })
    cleanedParts = [inFile.read().translate(asciiEquivalents) for inFile in filesList]
    return ''.join(cleanedParts).lower()

# Read and clean the full text of each group (Free = group 1, D-C = group 2).
group1_raw = removeSpecials(filesListFree)
group2_raw = removeSpecials(filesListDC)

# Every handle has now been read to the end, so release it instead of
# leaving the files open for the lifetime of the kernel.
for _transcriptFile in filesListFree + filesListDC:
    _transcriptFile.close()

# These files have the transcriber's observations like [laughter], [silent], [cross-talk] etc.
# These may throw off the text analysis, so we remove them. These are typically within square brackets
# or parentheses, so we can remove everything between these delimiters.

def removeObservations(test_str):
    """Return `test_str` with all text inside [...] and (...) removed.

    Bracketed spans are usually observations added by the transcriber and
    should not be considered in the content analysis. The delimiters are
    dropped together with their contents, and nesting is handled by keeping
    one depth counter per bracket type.

    A closing bracket with no matching opener is kept as ordinary text. An
    opening bracket that is never closed suppresses everything after it.

    Adapted from
    http://stackoverflow.com/questions/14596884/remove-text-between-and-in-python
    """
    kept = []          # characters that lie outside every bracket pair
    squareDepth = 0    # number of currently open '['
    roundDepth = 0     # number of currently open '('
    for ch in test_str:
        if ch == '[':
            squareDepth += 1
        elif ch == '(':
            roundDepth += 1
        elif ch == ']' and squareDepth > 0:
            squareDepth -= 1
        elif ch == ')' and roundDepth > 0:
            roundDepth -= 1
        elif squareDepth == 0 and roundDepth == 0:
            kept.append(ch)
    # Join once at the end instead of growing a string one character at a time.
    return ''.join(kept)

# Strip the transcriber's bracketed observations from both groups, then
# break each group's text into lines (one line = one document later on).
group1, group2 = [removeObservations(rawText) for rawText in (group1_raw, group2_raw)]
group1_lines, group2_lines = group1.split('\n'), group2.split('\n')

print("Number of lines in the Free Group:", len(group1_lines))
print("Number of lines in the D-C Group:", len(group2_lines))
print("Each line will be considered a separate document for computing the Document-Term Matrix")

Number of lines in the Free Group: 7641
Number of lines in the D-C Group: 3872
Each line will be considered a separate document for computing the Document-Term Matrix


# Step 2: Computing the Latent Dirichlet Allocation
Detailed explanation to follow...


In [2]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords

  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  spec = inspect.getargspec(func)
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  regargs, varargs, varkwargs, defaults = inspect.getargspec(func)


## Step 2.1 Computing the Document-Term Matrix

The document-term matrix looks like this:

|      | T1     | T2     | T3     | T4     | T5     |... 
|:----:|:------:|:------:|:------:|:------:|:------:|:---:
| D1   | N_d1t1 | N_d1t2 | N_d1t3 | N_d1t4 | N_d1t5 |...
| D2   | N_d2t1 | N_d2t2 | N_d2t3 | N_d2t4 | N_d2t5 |...
| D3   | N_d3t1 | N_d3t2 | N_d3t3 | N_d3t4 | N_d3t5 |...
| D4   | N_d4t1 | N_d4t2 | N_d4t3 | N_d4t4 | N_d4t5 |...
| D5   | N_d5t1 | N_d5t2 | N_d5t3 | N_d5t4 | N_d5t5 |...
| D6   | N_d6t1 | N_d6t2 | N_d6t3 | N_d6t4 | N_d6t5 |...
| ..   | ...... | ...... | ...... | ...... | ...... |...

Where N_ditj is called the **term frequency**, and is equal to the number of occurrences of term Tj in document Di.

For our purpose, each term is a word longer than two letters, and each document is a line in the transcript. We need to focus on words that matter, so this means removing any stop words, as well as words from the transcript that we deem irrelevant. These include filler words such as "yeah", "like", etc. and non-words such as "hmm". Since this is a transcript, we find a number of time-stamped entries embedded in the text, so these need to go as well. A complete list of words to remove from the list of terms is below (excluding stop words).

Each document is represented as an M-dimensional vector where M is the number of terms computed for the corpus. Here, the vector corresponds to the row of the document-term matrix.

In [17]:
# Transcript-specific terms to drop in addition to NLTK's English stop words:
# filler words, non-words, punctuation tokens and numeric/time-stamp fragments.
# NOTE(review): the vectorizers below use a token pattern that only matches runs of
# three or more letters, so the punctuation, numeric and two-letter entries look
# redundant - confirm before pruning them.
words_to_remove = ['yea', 'like', 'up', 'down', "!", "$", "'", "''", "'cause", 
                   "'d", "'em", "'ll", "'m", "'re", "'s", "'ve", ",", "yeah",
                   "-", ".", "1", "10", "12", "12.", "15", "20", "3-d", 
                   "30", "300", "352.", "36", "375", "3d", "452", "475", 
                   "475.", "90", ":", ";", "?", "``", "one", "would", "right",
                   "okay", 'could', 'know', 'make', 'something', 'yes', 'good',
                   "think", "oh", "maybe", 'kinda', 'actually', 'thing', 'things',
                   'gonna', 'wanna', 'really', 'hmm',
                   # The below set was after topic modeling
                   # (presumably 'didn', 'wouldn' and 'doesn' are what remains of
                   # contractions such as "didn't" after tokenization - verify)
                   'cuz', 'huh', 'didn', 'wouldn', 'doesn', 'whatever', 'guys'
                  ]

# Full stop-word list handed to every vectorizer: NLTK's English list plus the custom terms above.
complete_stop_words = stopwords.words('english') + words_to_remove

# Both groups are vectorized with exactly the same settings, so the keyword
# arguments are written once and shared by the two CountVectorizer instances.
_count_vectorizer_settings = {
    'strip_accents': 'unicode',
    'analyzer': 'word',
    'stop_words': complete_stop_words,
    'lowercase': True,
    'token_pattern': r'\b[a-zA-Z]{3,}\b',  # terms are runs of three or more letters
    'max_df': 0.5,                          # drop terms found in more than half of the documents
    'min_df': 10,                           # drop terms found in fewer than 10 documents
}
tf_vectorizer_grp1 = CountVectorizer(**_count_vectorizer_settings)
tf_vectorizer_grp2 = CountVectorizer(**_count_vectorizer_settings)

# Learn each group's vocabulary and build its document-term matrix
# (rows = transcript lines, columns = terms).
dtm_tf_grp1 = tf_vectorizer_grp1.fit_transform(group1_lines)
print("Document-Term Matrix size for Free Group:", dtm_tf_grp1.shape)
dtm_tf_grp2 = tf_vectorizer_grp2.fit_transform(group2_lines)
print("Document-Term Matrix size for D-C Group:", dtm_tf_grp2.shape)

Document-Term Matrix size for Free Group: (7641, 475)
Document-Term Matrix size for D-C Group: (3872, 285)


## 2.2 Compute the Term Frequency - Inverse Document Frequency
In order to use a scheme for weighting the document-term matrix, we use the popular **Inverse Document Frequency (IDF)** weighting measure.
The IDF is calculated as:

$IDF_j = \log\left(\frac{N}{n_j}\right)$

Where $N$ is the total number of documents, and $n_j$ is the number of documents containing term $j$.
This system gives higher weightage to terms that occur in relatively fewer documents (and thus are more characteristic of the *topic* of that document)

The weighted matrix is called Term Frequency-Inverse Document Frequency, or **Tf-idf**, where each cell (term frequency) is multiplied by the weight (inverse document frequency)

In [18]:
# Build each TF-IDF vectorizer from the parameters of the matching count
# vectorizer, so both share the same tokenization, stop words and
# document-frequency cut-offs. Only the parameters are copied; the TF-IDF
# vectorizer is fitted from scratch on the same lines.
tfidf_vectorizer_grp1 = TfidfVectorizer(**tf_vectorizer_grp1.get_params())
dtm_tfidf_grp1 = tfidf_vectorizer_grp1.fit_transform(group1_lines)

tfidf_vectorizer_grp2 = TfidfVectorizer(**tf_vectorizer_grp2.get_params())
dtm_tfidf_grp2 = tfidf_vectorizer_grp2.fit_transform(group2_lines)

## 2.3 Compute Latent Dirichlet Allocation (LDA)
The following explanation is from [Edwin Chen's website](http://blog.echen.me/2011/08/22/introduction-to-latent-dirichlet-allocation/)

LDA represents documents as mixtures of topics that spit out words with certain probabilities. It assumes that documents are produced in the following fashion: when writing each document, you:

* Decide on the number of words N the document will have (say, according to a Poisson distribution).
* Choose a topic mixture for the document (according to a Dirichlet distribution over a fixed set of K topics). For example, assuming that we have the two food and cute animal topics above, you might choose the document to consist of 1/3 food and 2/3 cute animals.
* Generate each word w_i in the document by:
    * First picking a topic (according to the multinomial distribution that you sampled above; for example, you might pick the food topic with 1/3 probability and the cute animals topic with 2/3 probability).
    * Using the topic to generate the word itself (according to the topic’s multinomial distribution). For example, if we selected the food topic, we might generate the word “broccoli” with 30% probability, “bananas” with 15% probability, and so on.
* Assuming this generative model for a collection of documents, LDA then tries to backtrack from the documents to find a set of topics that are likely to have generated the collection.

### Example

Let’s make an example. According to the above process, when generating some particular document D, you might

* Pick 5 to be the number of words in D.
* Decide that D will be 1/2 about food and 1/2 about cute animals. (Number of topics = 2)
* Pick the first word to come from the food topic, which then gives you the word “broccoli”.
* Pick the second word to come from the cute animals topic, which gives you “panda”.
* Pick the third word to come from the cute animals topic, giving you “adorable”.
* Pick the fourth word to come from the food topic, giving you “cherries”.
* Pick the fifth word to come from the food topic, giving you “eating”.
So the document generated under the LDA model will be “broccoli panda adorable cherries eating” (note that LDA is a bag-of-words model).


In [19]:
# Fit one 4-topic LDA model per group and per weighting scheme. random_state is
# fixed so that repeated runs give the same topics.
# NOTE(review): newer scikit-learn releases name this parameter `n_components`
# instead of `n_topics` - confirm against the installed version before upgrading.

# Models fitted on the raw term-frequency document-term matrices
lda_tf_grp1 = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tf_grp1.fit(dtm_tf_grp1)
lda_tf_grp2 = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tf_grp2.fit(dtm_tf_grp2)
# Models fitted on the TF-IDF weighted document-term matrices
lda_tfidf_grp1 = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp1.fit(dtm_tfidf_grp1)
lda_tfidf_grp2 = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp2.fit(dtm_tfidf_grp2)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=4, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

## 2.4 Visualize Free and D-C Brainstorming Topics Using Term Frequencies (unweighted)
Using PyLDAvis, visualize topics and salient terms. To focus on terms that are more salient to each topic, set $\lambda$ to 0.4.

In [20]:
# pyLDAvis.sklearn.prepare(lda_tf_grp1, dtm_tf_grp1, tf_vectorizer_grp1)

In [21]:
# pyLDAvis.sklearn.prepare(lda_tf_grp2, dtm_tf_grp2, tf_vectorizer_grp2)

## 2.5 Visualize Free Brainstorming Topics Using TF-IDF (weighted)
Using PyLDAvis, visualize topics and salient terms. To focus on terms that are more salient to each topic, set $\lambda$ to 0.4.

In [22]:
# Free group, all transcripts, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp1, dtm_tfidf_grp1, tfidf_vectorizer_grp1)

## 2.6 Visualize D-C Brainstorming Topics Using TF-IDF (weighted)

In [23]:
# D-C group, all transcripts, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp2, dtm_tfidf_grp2, tfidf_vectorizer_grp2)

In [24]:
# Repeat the Step 1 preprocessing, this time keeping the divergent and the
# convergent transcripts of each group separate.
filesListFreeDiv = openFiles(fileNamesFreeDivergent, textPathFree)
filesListFreeConv = openFiles(fileNamesFreeConvergent, textPathFree)
filesListDCDiv = openFiles(fileNamesDCDivergent, textPathDC)
filesListDCConv = openFiles(fileNamesDCConvergent, textPathDC)

# Read each set of files and replace typographic characters / lowercase the text.
group1_raw_div = removeSpecials(filesListFreeDiv)
group1_raw_conv = removeSpecials(filesListFreeConv)
group2_raw_div = removeSpecials(filesListDCDiv)
group2_raw_conv = removeSpecials(filesListDCConv)

# Every handle has now been read to the end, so release it instead of
# leaving the files open for the lifetime of the kernel.
for _openedFiles in (filesListFreeDiv, filesListFreeConv, filesListDCDiv, filesListDCConv):
    for _transcriptFile in _openedFiles:
        _transcriptFile.close()

# Drop the transcriber's bracketed observations.
group1Div = removeObservations(group1_raw_div)
group1Conv = removeObservations(group1_raw_conv)
group2Div = removeObservations(group2_raw_div)
group2Conv = removeObservations(group2_raw_conv)

# One transcript line = one document.
group1_div_lines = group1Div.split('\n')
group1_conv_lines = group1Conv.split('\n')
group2_div_lines = group2Div.split('\n')
group2_conv_lines = group2Conv.split('\n')

print("Number of lines in the Free Group (divergent process):", len(group1_div_lines))
print("Number of lines in the Free Group (convergent process):", len(group1_conv_lines))
print("Number of lines in the D-C Group (divergent process):", len(group2_div_lines))
print("Number of lines in the D-C Group (convergent process):", len(group2_conv_lines))
print("Each line will be considered a separate document for computing the Document-Term Matrix")

Number of lines in the Free Group (divergent process): 5023
Number of lines in the Free Group (convergent process): 2619
Number of lines in the D-C Group (divergent process): 3872
Number of lines in the D-C Group (convergent process): 2769
Each line will be considered a separate document for computing the Document-Term Matrix


In [25]:
# One count vectorizer per (group, process) combination, all configured
# identically: terms are runs of three or more letters, stop words are removed,
# and terms occurring in more than half of the documents or in fewer than
# 10 documents are discarded.
tf_vectorizer_grp1_div = CountVectorizer(strip_accents = 'unicode',
                                         analyzer = 'word',
                                         stop_words = complete_stop_words,
                                         lowercase = True,
                                         token_pattern = r'\b[a-zA-Z]{3,}\b',
                                         max_df = 0.5, 
                                         min_df = 10)

tf_vectorizer_grp1_conv = CountVectorizer(strip_accents = 'unicode',
                                          analyzer = 'word',
                                          stop_words = complete_stop_words,
                                          lowercase = True,
                                          token_pattern = r'\b[a-zA-Z]{3,}\b',
                                          max_df = 0.5, 
                                          min_df = 10)

tf_vectorizer_grp2_div = CountVectorizer(strip_accents = 'unicode',
                                         analyzer = 'word',
                                         stop_words = complete_stop_words,
                                         lowercase = True,
                                         token_pattern = r'\b[a-zA-Z]{3,}\b',
                                         max_df = 0.5, 
                                         min_df = 10)

tf_vectorizer_grp2_conv = CountVectorizer(strip_accents = 'unicode',
                                          analyzer = 'word',
                                          stop_words = complete_stop_words,
                                          lowercase = True,
                                          token_pattern = r'\b[a-zA-Z]{3,}\b',
                                          max_df = 0.5, 
                                          min_df = 10)

# Fit each vectorizer on its own subset. Re-fitting the whole-group vectorizers
# (tf_vectorizer_grp1 / tf_vectorizer_grp2) here would overwrite their learned
# vocabularies and leave the four vectorizers created above unfitted.
dtm_tf_grp1_div = tf_vectorizer_grp1_div.fit_transform(group1_div_lines)
print("Document-Term Matrix size for Free Group (divergent process):", dtm_tf_grp1_div.shape)
dtm_tf_grp1_conv = tf_vectorizer_grp1_conv.fit_transform(group1_conv_lines)
print("Document-Term Matrix size for Free Group (convergent process):", dtm_tf_grp1_conv.shape)

dtm_tf_grp2_div = tf_vectorizer_grp2_div.fit_transform(group2_div_lines)
print("Document-Term Matrix size for D-C Group (divergent process):", dtm_tf_grp2_div.shape)
dtm_tf_grp2_conv = tf_vectorizer_grp2_conv.fit_transform(group2_conv_lines)
print("Document-Term Matrix size for D-C Group (convergent process):", dtm_tf_grp2_conv.shape)

Document-Term Matrix size for Free Group (divergent process): (5023, 353)
Document-Term Matrix size for Free Group (convergent process): (2619, 159)
Document-Term Matrix size for Free Group (divergent process): (3872, 285)
Document-Term Matrix size for Free Group (convergent process): (2769, 184)


In [26]:
# TF-IDF vectorizers for each (group, process) subset. Each one copies only the
# parameters of the matching count vectorizer and is then fitted on its own lines.
tfidf_vectorizer_grp1_div = TfidfVectorizer(**tf_vectorizer_grp1_div.get_params())
dtm_tfidf_grp1_div = tfidf_vectorizer_grp1_div.fit_transform(group1_div_lines)
tfidf_vectorizer_grp1_conv = TfidfVectorizer(**tf_vectorizer_grp1_conv.get_params())
dtm_tfidf_grp1_conv = tfidf_vectorizer_grp1_conv.fit_transform(group1_conv_lines)

tfidf_vectorizer_grp2_div = TfidfVectorizer(**tf_vectorizer_grp2_div.get_params())
dtm_tfidf_grp2_div = tfidf_vectorizer_grp2_div.fit_transform(group2_div_lines)
tfidf_vectorizer_grp2_conv = TfidfVectorizer(**tf_vectorizer_grp2_conv.get_params())
dtm_tfidf_grp2_conv = tfidf_vectorizer_grp2_conv.fit_transform(group2_conv_lines)

# One 4-topic LDA model per subset, fitted on the TF-IDF weighted matrices.
# random_state is fixed so that repeated runs give the same topics.
# NOTE(review): newer scikit-learn releases name this parameter `n_components`
# instead of `n_topics` - confirm against the installed version before upgrading.
lda_tfidf_grp1_div = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp1_div.fit(dtm_tfidf_grp1_div)
lda_tfidf_grp1_conv = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp1_conv.fit(dtm_tfidf_grp1_conv)

lda_tfidf_grp2_div = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp2_div.fit(dtm_tfidf_grp2_div)
lda_tfidf_grp2_conv = LatentDirichletAllocation(n_topics=4, random_state=0)
lda_tfidf_grp2_conv.fit(dtm_tfidf_grp2_conv)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=4, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [27]:
# Free group, divergent process, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp1_div, dtm_tfidf_grp1_div, tfidf_vectorizer_grp1_div)

In [28]:
# Free group, convergent process, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp1_conv, dtm_tfidf_grp1_conv, tfidf_vectorizer_grp1_conv)

In [29]:
# D-C group, divergent process, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp2_div, dtm_tfidf_grp2_div, tfidf_vectorizer_grp2_div)

In [30]:
# D-C group, convergent process, TF-IDF model. Kept as the bare last expression
# of the cell so that the notebook renders the returned visualization.
pyLDAvis.sklearn.prepare(lda_tfidf_grp2_conv, dtm_tfidf_grp2_conv, tfidf_vectorizer_grp2_conv)