## Topic Modeling and Visualization

In [1]:
#General Imports
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import re
from string import punctuation
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score


import nltk
import pandas as pd
#from spacy.en import English

from utils.permutation import print_pvalues
from utils.text_representation import _levels, _multinomial
import nmf_visuals 
feature_vectors, nmf_labels, nmf_inspect, nmf_subset

import spacy
nlp = spacy.load('en_core_web_sm')
#nlp = English(tagger=True, entity=False)

%matplotlib inline
from sklearn.decomposition import NMF

ModuleNotFoundError: No module named 'utils'

In [None]:
#Support Functions
# Text Representation


def _levels(demographics, d_levels=None, print_levels=False):
    """The demographic levels to iterate over

    Parameters
    ----------
    demographics : pd.Series
        Demographic labels
    d_levels : list, default None
        The specific demographic levels desired. Left unmodified.
    print_levels : bool, default False
        Whether to print the demographic levels

    Returns
    -------
    levels : iterable
        The sorted levels: `d_levels` when it is given and non-empty,
        otherwise the unique values in `demographics`

    Raises
    ------
    AssertionError
        If `d_levels` contains a value that is not in `demographics`
    """
    levels = demographics.unique()
    if d_levels:
        assert set(d_levels).issubset(levels)
        # sorted() builds a new list, so the caller's `d_levels` is not
        # reordered as a side effect
        levels = sorted(d_levels)
    else:
        # np.asarray keeps this working if `unique()` hands back a pandas
        # array (which has no in-place `.sort()`) rather than an ndarray
        levels = np.sort(np.asarray(levels))
    if print_levels:
        print('Levels (in order):', levels, end='\n\n')
    return levels

def _multinomial(corpus, kwargs):
    """Tokens counts by document using the spaCy tokenizer

    Parameters
    ----------
    corpus : array-like
        A collection of documents
    kwargs : dict or None
        Extra keyword arguments forwarded to `CountVectorizer`;
        None (or an empty dict) means no extra arguments

    Returns
    -------
    X : scipy.sparse.csr.csr_matrix
        The multinomial representation shape (n_samples, n_features)
    v : list
        Vocabulary
    """
    # Imported locally because the file's top-level imports do not bring
    # CountVectorizer into scope
    from sklearn.feature_extraction.text import CountVectorizer

    # NOTE(review): `spacy_tokenize` is not defined in the visible part of
    # this file; it has to be provided elsewhere -- confirm
    cv = CountVectorizer(tokenizer=spacy_tokenize, **(kwargs or {}))
    X = cv.fit_transform(corpus)
    # `get_feature_names()` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases
    if hasattr(cv, 'get_feature_names_out'):
        v = cv.get_feature_names_out().tolist()
    else:
        v = cv.get_feature_names()
    return X, v

def nmf_labels(tfidfmatrix, k):
    """Assign each sample (user) to its strongest NMF component

    Parameters
    ----------
    tfidfmatrix : scipy.sparse.csr.csr_matrix
        The tf-idf users/features matrix

    k : int
        The number of NMF components (groupings) to fit

    Returns
    -------
    labels : np.ndarray
        One group index per row of `tfidfmatrix`: the position of the
        largest value in that row of the fitted, transformed data
    """
    model = NMF(n_components=k, random_state=42)
    weights = model.fit_transform(tfidfmatrix)
    # the winning component per row is the group assignment
    return weights.argmax(axis=1)


def nmf_inspect(tfidfmatrix, feature_names, k_vals=(3, 5, 7, 9), n_words=10):
    """For looping over various values of `k` and printing the
    top `n_words` of every grouping

    Parameters
    ----------
    tfidfmatrix : scipy.sparse.csr.csr_matrix
        The output from calling `TfidfVectorizer` on the users/features data

    feature_names : list
        The vocabulary, indexable by feature (column) position

    k_vals : iterable of int
        Values for `k`, the number of groupings

    n_words : int
        The top n words to print for each grouping

    Returns
    -------
    None
    """
    for k in k_vals:
        nmf = NMF(n_components=k, random_state=42).fit(tfidfmatrix)
        print(k, end='\n')
        # The original called `_print_words`, which is not defined in this
        # file; the printing is done inline, in the same format that
        # `split_by_demog` uses
        for topic_idx, topic in enumerate(nmf.components_):
            # indices of the `n_words` largest weights, largest first
            top = topic.argsort()[:-n_words - 1:-1]
            print("Group %d:" % topic_idx)
            print(" | ".join([feature_names[i] for i in top]))
            print()
        print()
        
def subset_df(df, col, vals):
    """Keep only the rows of `df` whose `col` value is one of `vals`

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (not modified)
    col : str
        Valid column name
    vals : list
        Values to keep

    Returns
    -------
    subset : pd.DataFrame
        The rows in `df` with values in `vals` for `col`
    """
    frame = df.copy()
    keep = frame[col].isin(vals)
    return frame.loc[keep]

def group_pct(df, demographic):
    """Share of each `group` within every `demographic` level

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a `group` column (the NMF groupings) and the
        `demographic` column; it is not modified
    demographic : str
        Valid column name

    Returns
    -------
    by_dg : pd.DataFrame
        One row per (`demographic`, `group`) pair with columns
        `count_x` (pair count), `count_y` (level total) and
        `pct` (`count_x / count_y`)
    """
    frame = df.copy()
    # users per (level, group) pair
    pair_counts = (frame.groupby([demographic, 'group'])['group']
                   .count()
                   .rename('count')
                   .reset_index())
    # users per level
    level_totals = pair_counts.groupby(demographic, as_index=False)['count'].sum()
    # both frames carry a `count` column, so the merge suffixes them _x / _y
    merged = pd.merge(pair_counts, level_totals, on=demographic)
    merged['pct'] = merged['count_x'] / merged['count_y']
    return merged

def feature_vectors(corpus, kwargs=None):
    """Build the count and TF-IDF representations of a corpus

    Parameters
    ----------
    corpus : list or pd.Series
        A collection of documents
    kwargs : dict, default None
        Keyword arguments passed through to `_multinomial`
        See sklearn.feature_extraction.text.CountVectorizer
        for accepted keyword arguments

    Returns
    -------
    count : scipy.sparse.csr.csr_matrix
        The multinomial representation shape (n_samples, n_features)
    tfidf : scipy.sparse.csr.csr_matrix
        The tf-idf representation, derived from `count` by `_tfidf`
    vocab : list
        Vocabulary
    """
    assert isinstance(corpus, (list, pd.Series))
    term_counts, vocabulary = _multinomial(corpus, kwargs)
    weighted = _tfidf(term_counts)
    return term_counts, weighted, vocabulary


def tagger(doc):
    """Lazily tag every token of a document

    Parameters
    ----------
    doc : str
        A document with tokens to tag

    Yields
    ------
    tuple
        (token, tag) as strings, sentence by sentence, where the tag is
        the token's `pos_` attribute
    """
    parsed = nlp(doc)
    for sentence in parsed.sents:
        yield from ((str(tok), str(tok.pos_)) for tok in sentence)

def tag_corpus(corpus):
    """Tag the tokens of every document in a corpus

    Parameters
    ----------
    corpus : list or pd.Series
        A collection of documents

    Returns
    -------
    tagged : list
        (token, tag) tuples for all documents, in corpus order
    """
    assert isinstance(corpus, (list, pd.Series))
    return [pair for doc in corpus for pair in tagger(doc)]

def pos_tokens(tagged, pos):
    """Select the tokens that carry a given part-of-speech tag

    Parameters
    ----------
    tagged : list
        (token, tag) tuples
    pos : str
        A valid part-of-speech tag

    Returns
    -------
    list
        The tokens whose tag equals `pos`, in their original order

    Notes
    -----
    The available tags are:
        ADJ, ADP, ADV, AUX, CONJ, DET, INTJ, NOUN, NUM, PART,
        PRON, PROPN, PUNCT, SCONJ, SYM, VERB, X, EOL, SPACE
    Source: https://spacy.io/docs#token-postags
    """
    matching = [token for token, tag in tagged if tag == pos]
    return matching

def _pos_freq(doc):
    """Count how often each part-of-speech tag occurs in one document

    Parameters
    -----------
    doc : str
        A document with tokens to tag

    Returns
    -------
    pos : dict
        A `defaultdict(float)` mapping tag -> number of tokens with that tag
    """
    tag_counts = defaultdict(float)
    for pair in tagger(doc):
        tag = pair[1]
        tag_counts[tag] = tag_counts[tag] + 1
    return tag_counts

def pos_df(corpus):
    """Part-of-speech frequency table for a corpus

    Parameters
    ----------
    corpus : list or pd.Series
        A collection of documents

    Returns
    -------
    df : pd.DataFrame
        One row per document and one column per tag seen anywhere in
        the corpus; tags absent from a document are filled with 0.0
    """
    assert isinstance(corpus, (list, pd.Series))
    # one single-row frame per document, stacked with a fresh 0..n-1 index
    per_doc = [pd.DataFrame(_pos_freq(doc), index=[0]) for doc in corpus]
    stacked = pd.concat(per_doc, ignore_index=True)
    return stacked.fillna(0.0)

def pos_normalize(df):
    """Scale every row of a part-of-speech table by its row total

    Parameters
    ----------
    df : pd.DataFrame
        `pos_df()` DataFrame

    Returns
    -------
    pd.DataFrame
        Same shape as `df`, each value divided by the sum of its row
    """
    assert isinstance(df, pd.DataFrame)
    row_totals = df.sum(axis=1)
    # axis=0 aligns the totals with the row index, i.e. divides row-wise
    return df.div(row_totals, axis=0)

def _arrs_pos(df_orig, df_pos, demographic, pos,
              d_levels=None, print_levels=False):
    """Individual part-of-speech
    arrays for a particular demographic

    Parameters
    ----------
    df_orig : pd.DataFrame
        The DataFrame from which `df_pos` was created
    df_pos : pd.DataFrame
        The part-of-speech DataFrame (not modified)
    demographic : str
        A valid demographic-data column in `df_orig`
    pos : str
        A column in `df_pos` corresponding
        to a part of speech
    d_levels : list, default None
        The specific demographic levels desired
    print_levels : bool, default False
        Whether to print the demographic levels

    Returns
    -------
    arrs : tuple of np.arrays
        The corresponding `pos` values for each `demographic` level,
        in the order returned by `_levels()`
    """
    df_pos = df_pos.copy() # so we don't modify it
    # positional assignment: row i of `df_pos` is taken to describe row i
    # of `df_orig`
    df_pos[demographic] = df_orig[demographic].values
    levels = _levels(df_orig[demographic], d_levels, print_levels)
    arrs = []
    for d in levels:
        arr = df_pos[df_pos[demographic] == d][pos].values
        n = arr.shape[0]
        if n < 0.1 * df_pos.shape[0]:
            # format() instead of `+` so non-string levels (e.g. numeric
            # codes) do not raise a TypeError while building the message
            print("Warning: '{}' category has less than 10% of "
                  "observations ({})".format(d, n))
        arrs.append(arr)
    return tuple(arrs)

def pos_by_split(df_orig, df_pos, demographic, pos=None,
                 d_levels=None, print_levels=False):
    """Wrapper for handling multiple parts-of-speech with `_arrs_pos()`

    Parameters
    ----------
    df_orig : pd.DataFrame
        The DataFrame from which `df_pos` was created
    df_pos : pd.DataFrame
        The part-of-speech DataFrame
    demographic : str
        A valid demographic-data column in `df_orig`
    pos : list or str, default None
        Parts-of-speech to compare. None means every column of
        `df_pos`; a single string is treated as a one-element list
    d_levels : list, default None
        The specific demographic levels desired
    print_levels : bool, default False
        Whether to print the demographic levels

    Returns
    -------
    None

    Notes
    -----
    The number of unique values in `demographic` must be two
    """
    assert (isinstance(df_orig, pd.DataFrame) and
            isinstance(df_pos, pd.DataFrame))
    assert df_orig.shape[0] == df_pos.shape[0]
    assert demographic in df_orig.columns
    if pos is None:
        # the documented default used to crash in `set(pos)`
        pos = list(df_pos.columns)
    elif isinstance(pos, str):
        # a bare string would otherwise be checked character by character
        pos = [pos]
    assert set(pos).issubset(df_pos.columns)
    for p in pos:
        # exactly two arrays are expected here (see Notes)
        a, b = _arrs_pos(df_orig, df_pos, demographic, p, d_levels, print_levels)
        print(p)
        print_pvalues(a, b)
        print()

def load_words(path):
    """Read a word list (e.g. profane or slang words), one entry per line

    Parameters
    ----------
    path : str
        Relative or absolute filepath

    Returns
    -------
    list
        The distinct lines of the file with trailing whitespace removed,
        in no particular order
    """
    assert isinstance(path, str)
    with open(path, 'r') as handle:
        unique_words = {line.rstrip() for line in handle}
    return list(unique_words)

def _contains_n(words, corpus):
    """Total occurrences of the given words in each document

    Parameters
    ----------
    words : list
        Words to check for; passed to `_multinomial` as the vocabulary
    corpus : array-like
        A collection of documents

    Returns
    -------
    np.ndarray
        Number of matching tokens by document
    """
    vectorizer_args = {'vocabulary' : words}
    counts, _vocab = _multinomial(corpus, vectorizer_args)
    dense = counts.toarray()
    # summing across the word columns gives one total per document
    return dense.sum(axis=1)

def contains(words, corpus):
    """Flag the documents that contain at least one of the given words

    Parameters
    ----------
    words : list
        Words to check for
    corpus : list or pd.Series
        A collection of documents

    Returns
    -------
    n_words : np.ndarray
        1 where the document contains any of `words`, otherwise 0
    """
    assert isinstance(words, list)
    assert isinstance(corpus, (list, pd.Series))
    totals = _contains_n(words, corpus)
    # collapse the counts to a 0/1 indicator, keeping the count dtype
    return (totals > 0).astype(totals.dtype)

def _token_counts(a, b, pos):
    """Token frequencies for one part of speech across two demographic
    splits. `a` and `b` are lists of (token, part-of-speech) tuples
    (output from `tag_corpus()`).

    Parameters
    ----------
    a : list
        token, pos tuples
    b : list
        token, pos tuples
    pos : str
        A valid part-of-speech tag

    Returns
    -------
    df : pd.DataFrame
        One column per token; row 0 holds the counts for `a` and
        row 1 those for `b`, with 0 where a token is missing from a split
    """
    frames = [pd.DataFrame(nltk.FreqDist(pos_tokens(split, pos)), index=[0])
              for split in (a, b)]
    combined = pd.concat(frames, ignore_index=True)
    return combined.fillna(0)

def print_terms(df, n):
    """Print the index labels with the `n` largest and `n` smallest scores

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by term; the first column is used as the score
    n : int
        Number of terms to print at each end

    Returns
    -------
    None
    """
    score_col = df.columns[0]
    highest = df.sort_values(score_col, ascending=False).index[:n]
    lowest = df.sort_values(score_col).index[:n]
    print(" | ".join(highest))
    print()
    print(" | ".join(lowest))

def top_terms(a, b, pos, fn, n):
    """Score the `pos` tokens of two demographic splits with `fn` and
    print the `n` highest- and `n` lowest-scoring terms

    Parameters
    ----------
    a : list
        token, pos tuples
    b : list
        token, pos tuples
    pos : str
        A valid part-of-speech tag
    fn : callable
        Either `diff_prop` or `log_odds_ratio`; called with the count
        matrix and the list of tokens
    n : int
        Number of terms to print for each demographic split

    Returns
    -------
    None
    """
    counts = _token_counts(a, b, pos)
    tokens = counts.columns.tolist()
    scored = fn(counts.values, tokens)
    print_terms(scored, n)


## Part 1: Generating NMF Model

This part of the code is largely derived from the work of Juan Shishido and the University of Michigan, both of which are referenced in this repository's README.

#### First, we generate the topics and assign some meaning to them

In [None]:
# Load the OkCupid data; later cells read its `essay0` text column and
# several demographic columns from this frame
df = pd.read_csv('compressed_okcupid.csv')

In [None]:
#The major part of the algorithm- can take some time
# Vectorizer options forwarded to CountVectorizer: English stop words,
# 1- to 3-grams, and a minimum document frequency of 0.005
specs = {'stop_words' : 'english', 'ngram_range' : (1, 3), 'min_df' : 0.005}
# Count matrix, tf-idf matrix and vocabulary for the `essay0` column
counts, tfidf, vocab = feature_vectors(df.essay0, specs)

In [None]:
# Number of NMF topics; the `labels` list defined further down has one
# entry per topic, so the two must stay in sync
K = 25
# Print the 50 top-weighted terms of each of the K topics for inspection
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)

In [None]:
# Topic names, one per NMF group index (position in the list = group number).
# Based on the categories as assessed by Juan Shishido, then modified by me
labels = [
    'Reach Out!', 'Relocated', 'About Me', 'Hesitation', 'Casual',
    'The City', 'Novelty', 'Cool', 'Likes', 'Passions',
    'Easy Going', 'Region', 'Seeking', 'Thoughts', 'Fun',
    'New Here', 'Travel', 'Self-summary', 'Nots', 'Growing Up',
    'Carpe Diem', 'Good Company', 'Hobbies', 'Cultural Interests',
    'Ambitious',
]

# group number -> topic name
label_dict = dict(enumerate(labels))
print(label_dict)

#### Next, we find a way of calculating and visualizing these topic distributions across our 4 chosen demographic variables

In [None]:
def split_by_demog(model, feature_names, n_top_words):
    """Print the `n_top_words` highest-weighted features of every
    component (grouping) of a fitted model

    Parameters
    ----------
    model : sklearn.decomposition.nmf.NMF
        The fitted NMF object; only its `components_` are read

    feature_names : list
        Feature names, indexable by feature (column) position

    n_top_words : int
        The top n words to print for a particular grouping

    Returns
    -------
    None
    """
    for group_idx, weights in enumerate(model.components_):
        # argsort is ascending: reverse it, then keep the leading entries
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in ranked]
        print("Group %d:" % group_idx)
        print(" | ".join(top_words))
        print()
    print()

In [None]:
def get_label(group_num):
    """Look up the topic name for an NMF group number

    Parameters
    ----------
    group_num : int
        A key of the module-level `label_dict`

    Returns
    -------
    str
        The label stored under `group_num`

    Raises
    ------
    KeyError
        If `group_num` is not a key of `label_dict`
    """
    topic_name = label_dict[group_num]
    return topic_name

def format_df(df, demog, tfidf):
    """Build the per-topic table that the plots below are drawn from

    Parameters
    ----------
    df : pd.DataFrame
        The user data; a `group` column is written into it in place
    demog : str
        Demographic column of `df` to split on
    tfidf : scipy.sparse.csr.csr_matrix
        The tf-idf matrix passed to `nmf_labels`

    Returns
    -------
    ordered_df : pd.DataFrame
        One row per NMF group, sorted ascending by `max`, with columns
        `group`, one column per level of `demog` (the `pct` from
        `group_pct`), `max` (largest value in the row) and `label`
    """
    # NOTE(review): this mutates the caller's `df`, and the NMF model is
    # refit on every call even though `tfidf` and the module-level `K`
    # are the same each time -- confirm whether that is intended
    df['group'] = nmf_labels(tfidf, k=K)
    # Every unique value of `demog` is passed as the allowed set
    subset = subset_df(df, demog, df[demog].unique())
    grouped = group_pct(subset, demog)
    # Keep only the demographic level, the group and its percentage
    percent_only = grouped.drop(['count_x', 'count_y'], axis=1)
    # Groups become rows and demographic levels become columns; the
    # columns are two-level at this point (hence the droplevel below)
    pivoted = percent_only.pivot(index='group', columns=demog)
    # Row-wise maximum across the demographic levels
    pivoted['max_value'] = pivoted.max(axis=1)
    ordered_df = pivoted.sort_values(by='max_value', ascending=True)
    #Getting rid of the multi-line index
    ordered_df.columns = ordered_df.columns.droplevel(0)
    ordered_df = ordered_df.reset_index().rename_axis(None, axis=1)
    #Renaming the max
    # (presumably `max_value` is left with an empty-string name once the
    # top column level is dropped -- that is the column renamed here)
    ordered_df = ordered_df.rename(columns={'':'max'})
    #Linking to label
    ordered_df['label'] = ordered_df['group'].apply(get_label)
    return ordered_df

In [None]:
# One formatted table per demographic variable. The original spread a
# single tuple assignment over four lines without parentheses, which is
# not valid Python; separate assignments keep the same evaluation order.
height_df = format_df(df, 'height_group', tfidf)
race_df = format_df(df, 'race_ethnicity', tfidf)
edu_df = format_df(df, 'edu', tfidf)
fit_df = format_df(df, 'fit', tfidf)

In [None]:
#Plot for Education Levels
ordered_df = edu_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# The horizontal lines are drawn with the hlines function: one gray line
# per topic, from 0 to that row's `max` value
# seaborn is loaded only for its effect on the look of the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['High School or less'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['More than High School'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Education Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# Legend labels match the plotted columns: blue is the
# 'High School or less' series (previously labelled 'Less than High School')
maroon_patch = mpatches.Patch(color='red', label='More than High School')
blue_patch = mpatches.Patch(color='blue', label='High School or less')
plt.legend(handles=[maroon_patch, blue_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('opinions.png', bbox_inches='tight')


In [None]:
#Plot for Fitness Levels
ordered_df = fit_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# The horizontal lines are drawn with the hlines function: one gray line
# per topic, from 0 to that row's `max` value
# seaborn is loaded only for its effect on the look of the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['fit'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_fit'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Fitness Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# The legend follows the plotted colors: 'fit' is drawn in blue and
# 'not_fit' in red (the two labels used to be swapped)
maroon_patch = mpatches.Patch(color='red', label='Not Fit')
blue_patch = mpatches.Patch(color='blue', label='Fit')
plt.legend(handles=[maroon_patch, blue_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('fit.png', bbox_inches='tight')

In [None]:
#The Plot for Height
ordered_df = height_df
import matplotlib.patches as mpatches

# One y position per topic row, starting at 1
my_range=range(1,len(ordered_df.index)+1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# The horizontal lines are drawn with the hlines function: one gray line
# per topic, from 0 to that row's `max` value
# seaborn is loaded only for its effect on the look of the plot
import seaborn as sns
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['short'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_short'], my_range, "o", markersize=20, color='red')
plt.rc('ytick',labelsize=28)
plt.rc('xtick',labelsize=28)
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Height Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# The legend follows the plotted colors: 'short' is drawn in blue and
# 'not_short' in red (the two labels used to be swapped)
maroon_patch = mpatches.Patch(color='red', label='Not Short')
blue_patch = mpatches.Patch(color='blue', label='Short')
plt.legend(handles=[maroon_patch, blue_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('height.png', bbox_inches='tight')

In [None]:
# The Plot for Races
ordered_df = race_df
# One y position per topic row, starting at 1
my_range = range(1, len(ordered_df.index) + 1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# seaborn is loaded only for its effect on the look of the plot
import seaborn as sns

# (column in `race_df`, marker color), in plotting order
race_colors = [
    ('White', 'blue'),
    ('Black', 'red'),
    ('Asian', 'green'),
    ('Latinx', 'cyan'),
    ('multiple', 'magenta'),
]

# One gray horizontal line per topic, from 0 to that row's `max` value,
# then one dot per racial group on each line
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
for race, shade in race_colors:
    plt.plot(ordered_df[race], my_range, "o", markersize=20, color=shade)
for tick_axis in ('ytick', 'xtick'):
    plt.rc(tick_axis, labelsize=28)

# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Racial Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)

# One legend patch per group, colored like its markers
blue_patch, maroon_patch, green_patch, cyan_patch, magenta_patch = [
    mpatches.Patch(color=shade, label=race) for race, shade in race_colors
]
legend_handles = [maroon_patch, blue_patch, green_patch, cyan_patch, magenta_patch]
plt.legend(handles=legend_handles, loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('race.png', bbox_inches='tight')