## Topic Modeling and Visualization

In [1]:
#General Imports
import pickle
import re
from collections import defaultdict
from string import punctuation

import matplotlib as mpl
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score

import nltk
import pandas as pd
#from spacy.en import English

#from utils.permutation import print_pvalues
#from utils.text_representation import _levels, _multinomial
import nmf_visuals 
from nmf_utils import feature_vectors, nmf_labels, nmf_inspect, nmf_subset
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import spacy
nlp = spacy.load('en_core_web_sm')
#nlp = English(tagger=True, entity=False)

%matplotlib inline
from sklearn.decomposition import NMF

ModuleNotFoundError: No module named 'utils'

In [None]:
# Word-count vectorizer over 1- to 3-grams with English stop words removed,
# limited to 2000 features via max_features.
# NOTE(review): `profiles` is only loaded in the next cell, so this cell fails on a
# fresh Restart & Run All; it also hard-codes 'essay0' instead of using
# `profile_section_to_use`, which is defined further down.
tf_vectorizer = CountVectorizer(max_features=2000, stop_words='english', ngram_range=(1,3))

print("Vectorizing text by word counts...")
tf_text = tf_vectorizer.fit_transform(profiles['essay0'])

# get_shape() is unpacked below as (rows, columns) of the transformed matrix.
tmp = tf_text.get_shape()
print("Our transformed text has", tmp[0], "rows and", tmp[1], "columns.")

In [None]:
# Fixed seed; passed as random_state to the NMF model in a later cell.
seed = 2019
# Load the profiles and drop rows whose 'essay0' is missing. The index is not reset
# here, so row labels keep whatever gaps dropna() leaves behind.
profiles = pd.read_csv('final_okcupid.csv').dropna(subset=['essay0'])
# Preview the first two rows.
profiles.head(2)

In [None]:
# Setting up variable in case we switch to different essay in future
profile_section_to_use = 'essay0'
# The selected essay column of `profiles`.
# NOTE(review): no visible cell reads `documents` afterwards — confirm it is still needed.
documents = profiles[profile_section_to_use]

In [None]:
# This section may be jumped if simply loading existing saved model
# NOTE(review): `ntopics` and `tfidf_text` are not defined in any cell visible above,
# so this cell depends on hidden kernel state — define them before this point.
# NOTE(review): recent scikit-learn releases replaced the `alpha` keyword of NMF with
# `alpha_W` / `alpha_H` — confirm against the installed version.
model = NMF(n_components=ntopics, 
            alpha=.1, 
            l1_ratio=.5, 
            init='nndsvd', random_state=seed)

print('Performing NMF on vectors...')
# `nmf` holds the return value of model.fit(); its components_ attribute is kept
# as `nmf_topics`.
nmf = model.fit(tfidf_text)
nmf_topics = nmf.components_

print('Done!')

In [None]:
nmf_filename = 'nmf_model.sav'
# Persist the fitted model. The context manager closes the file handle even if
# pickling raises; the bare open() call used here before never closed it.
with open(nmf_filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# Transform the TF-IDF matrix with the fitted model; the result is what the
# later cells turn into `topic_<n>` columns.
nmf_profile_topics = nmf.transform(tfidf_text)

In [None]:
#Functions to help visualize topic distributions
def common_topics_bars(topics):
    """Draw and save a bar chart of the mean weight of every topic.

    `topics` is wrapped in a DataFrame and averaged column by column, so each
    column is treated as one topic. Bars are ordered from the highest mean to
    the lowest, and the current matplotlib figure is written to 'barplot.png'.
    Nothing is returned.

    Note: an identical definition appears again in a later cell of this
    notebook.
    """
    mean_weights = (
        pd.DataFrame(topics)
        .mean()
        .rename_axis('Topic')
        .sort_values(ascending=False)
    )
    mean_weights.plot.bar(title='Topic popularity')
    plt.savefig('barplot.png')

def rank_groups(data, trait, topic):
    """Rank the levels of `trait` by their mean value of `topic`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both the `trait` and the `topic` columns
    trait : str
        Column whose distinct values define the groups
    topic : str
        Column that is averaged within each group

    Returns
    -------
    pd.DataFrame
        One row per group (indexed by the group value) and a single column
        named `topic`, sorted from highest to lowest mean and rounded to
        three decimals
    """
    levels = data[trait].value_counts().index.values
    level_means = {
        level: data.loc[data[trait] == level, topic].mean()
        for level in levels
    }

    ranking = pd.DataFrame.from_dict(level_means, orient='index')
    ranking.columns = [topic]
    return ranking.sort_values(by=topic, ascending=False).round(3)

def top_topics(data, trait, value, n_top_topics=3, distinctive=False):
    """Name the strongest topics among the rows where `trait` equals `value`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with the `trait` column and the topic columns; every column
        whose name starts with 'topic_' counts as a topic
    trait : str
        Column used to select the group
    value : object
        Level of `trait` that selects the group
    n_top_topics : int, default 3
        How many topic names to return
    distinctive : bool, default False
        When True, each group mean is divided by that topic's mean over the
        whole of `data`, so topics are ranked relative to the overall
        baseline; when False the raw group means are ranked

    Returns
    -------
    list
        Topic column names, strongest first
    """
    topic_cols = [col for col in data if col.startswith('topic_')]

    # The baseline is computed on the full frame, before it is narrowed to the group.
    if distinctive:
        baseline = {col: data[col].mean() for col in topic_cols}
    else:
        baseline = {col: 1 for col in topic_cols}

    members = data[data[trait] == value]
    scores = {col: members[col].mean() / baseline[col] for col in topic_cols}

    ranked = pd.DataFrame.from_dict(scores, orient='index')
    ranked = ranked.sort_values(by=0, ascending=False).head(n_top_topics)
    return list(ranked.index.values)

In [None]:
#Use this section to generate the top topics for each categorical variable
# `profiles` went through dropna() without an index reset, so its row labels can
# have gaps, whereas a bare DataFrame built from the NMF output is labelled 0..n-1.
# Merging those two on the index would pair topics with the wrong profiles and
# drop rows. Reusing profiles.index keeps every topic row attached to the profile
# it was computed from, and raises if the row counts differ.
# NOTE(review): assumes nmf_profile_topics has one row per row of `profiles`, in
# the same order — confirm where tfidf_text is built.
nmf_topic_info = pd.DataFrame(nmf_profile_topics, index=profiles.index).add_prefix('topic_')
nmf_together = profiles.merge(nmf_topic_info, left_index=True, right_index=True)
#We output this as a csv for generating future analysis
nmf_together.to_csv('nmf_topics_profiles.csv')

In [None]:
#Functions to Generate Plots
def common_topics_bars(topics):
    """Draw and save a bar chart of the mean weight of every topic.

    `topics` is wrapped in a DataFrame and averaged column by column, so each
    column is treated as one topic. Bars are ordered from the highest mean to
    the lowest, and the current matplotlib figure is written to 'barplot.png'.
    Nothing is returned.

    Note: this repeats a definition from an earlier cell of this notebook;
    being later, this is the one in effect after a top-to-bottom run.
    """
    mean_weights = (
        pd.DataFrame(topics)
        .mean()
        .rename_axis('Topic')
        .sort_values(ascending=False)
    )
    mean_weights.plot.bar(title='Topic popularity')
    plt.savefig('barplot.png')

def rank_groups(data, trait, topic):
    """Rank the levels of `trait` by their mean value of `topic`.

    Note: this repeats a definition from an earlier cell of this notebook.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing both the `trait` and the `topic` columns
    trait : str
        Column whose distinct values define the groups
    topic : str
        Column that is averaged within each group

    Returns
    -------
    pd.DataFrame
        One row per group (indexed by the group value) and a single column
        named `topic`, sorted from highest to lowest mean and rounded to
        three decimals
    """
    levels = data[trait].value_counts().index.values
    level_means = {
        level: data.loc[data[trait] == level, topic].mean()
        for level in levels
    }

    ranking = pd.DataFrame.from_dict(level_means, orient='index')
    ranking.columns = [topic]
    return ranking.sort_values(by=topic, ascending=False).round(3)

def top_topics(data, trait, value, n_top_topics=3, distinctive=False):
    """Name the strongest topics among the rows where `trait` equals `value`.

    Note: this repeats a definition from an earlier cell of this notebook.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with the `trait` column and the topic columns; every column
        whose name starts with 'topic_' counts as a topic
    trait : str
        Column used to select the group
    value : object
        Level of `trait` that selects the group
    n_top_topics : int, default 3
        How many topic names to return
    distinctive : bool, default False
        When True, each group mean is divided by that topic's mean over the
        whole of `data`, so topics are ranked relative to the overall
        baseline; when False the raw group means are ranked

    Returns
    -------
    list
        Topic column names, strongest first
    """
    topic_cols = [col for col in data if col.startswith('topic_')]

    # The baseline is computed on the full frame, before it is narrowed to the group.
    if distinctive:
        baseline = {col: data[col].mean() for col in topic_cols}
    else:
        baseline = {col: 1 for col in topic_cols}

    members = data[data[trait] == value]
    scores = {col: members[col].mean() / baseline[col] for col in topic_cols}

    ranked = pd.DataFrame.from_dict(scores, orient='index')
    ranked = ranked.sort_values(by=0, ascending=False).head(n_top_topics)
    return list(ranked.index.values)

In [None]:
# The document-topic matrix produced by nmf.transform() is stored as
# `nmf_profile_topics`; no cell in this notebook defines `profile_topics`.
common_topics_bars(nmf_profile_topics)

In [None]:
# Test For Differences in Popularity of Topics grouped by Categorical Variables

def _levels(demographics, d_levels=None, print_levels=False):
    """The demographic levels to iterate over
    
    Parameters
    ----------
    demographics : pd.Series
        Demographic labels
    d_levels : list, default None
        The specific demographic levels desired
    print_levels : bool, default False
        Whether to print the demographic levels
    
    Returns
    -------
    levels : iterable
        The unique (sorted) levels in `demographics`
    """
    levels = demographics.unique()
    if d_levels:
        assert set(d_levels).issubset(levels)
        levels = d_levels
    levels.sort()
    if print_levels:
        print('Levels (in order):', levels, end='\n\n')
    return levels

def _multinomial(corpus, kwargs):
    """Tokens counts by document using the spaCy tokenizer

    NOTE(review): `spacy_tokenize` is not defined in any visible cell of this
    notebook — it has to be defined or imported before this function is called.

    Parameters
    ----------
    corpus : array-like
        A collection of documents
    kwargs : dict or None
        Extra keyword arguments forwarded to CountVectorizer; None or an
        empty dict means no extra arguments

    Returns
    -------
    X : scipy.sparse.csr.csr_matrix
        The multinomial representation shape (n_samples, n_features)
    v : list
        Vocabulary
    """
    cv = CountVectorizer(tokenizer=spacy_tokenize, **(kwargs or {}))
    X = cv.fit_transform(corpus)
    # Newer scikit-learn only offers get_feature_names_out() (which returns an
    # array); older releases only offer get_feature_names(). Support both and
    # always hand back a plain list.
    if hasattr(cv, 'get_feature_names_out'):
        v = cv.get_feature_names_out().tolist()
    else:
        v = cv.get_feature_names()
    return X, v


        
def subset_df(df, col, vals):
    """Return a subset of `df` based on particular `vals` for `col`

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame (left unmodified)
    col : str
        Valid column name
    vals : list
        Values to subset on

    Returns
    -------
    subset : pd.DataFrame
        The rows in `df` with values in `vals` for `col`, keeping their
        original index labels
    """
    # Boolean-mask indexing already yields a new frame, so copying the whole
    # input first only cost time and memory.
    return df[df[col].isin(vals)]

def group_pct(df, demographic):
    """Calculate the percentage of users in each `demographic` level

    Parameters
    ----------
    df : pd.DataFrame
        Where applicable, this should be a subset of the original DataFrame and
        should include a `group` column corresponding to the NMF groupings.
        The frame is not modified.
    demographic : str
        Valid column name

    Returns
    -------
    by_dg : pd.DataFrame
        One row per (`demographic`, `group`) pair with the columns
        `demographic`, `group`, `count_x` (rows in that pair), `count_y`
        (rows in that `demographic` level) and `pct` (`count_x / count_y`)
    """
    # Rows per (demographic level, group) pair.
    by_dg = (
        df.groupby([demographic, 'group'])
        .size()
        .reset_index(name='count_x')
    )
    # Level totals broadcast back onto every pair; this replaces a separate
    # aggregate-and-merge step while keeping the count_x / count_y column names.
    by_dg['count_y'] = by_dg.groupby(demographic)['count_x'].transform('sum')
    by_dg['pct'] = by_dg['count_x'] / by_dg['count_y']
    return by_dg

## Part 1: Generating NMF Model

This part of the code is largely derived from the work of Juan Shishido and the University of Michigan, both of which are referenced in the README for this repository.

#### First, we generate the topics and assign some meaning to them

In [None]:
# Data set for Part 1. Later cells read its `essay0` column (feature_vectors) and
# the `height_group`, `race_ethnicity`, `edu` and `fit` columns (format_df).
df = pd.read_csv('compressed_okcupid.csv')

In [None]:
#The major part of the algorithm- can take some time
# NOTE(review): `specs` is presumably forwarded to the scikit-learn vectorizers inside
# nmf_utils.feature_vectors — confirm there how each key is used.
specs = {'stop_words' : 'english', 'ngram_range' : (1, 3), 'min_df' : 0.005}
# Returns three objects, unpacked here as counts, tfidf and vocab; only `tfidf` and
# `vocab` are used by the cells below.
counts, tfidf, vocab = feature_vectors(df.essay0, specs)

In [None]:
# Number of topics. format_df passes it on as nmf_labels(tfidf, k=K), and the
# `labels` list defined in a later cell has exactly 25 entries to match.
K = 25
# NOTE(review): presumably prints the top `n_words` terms of each topic for k=K —
# confirm in nmf_utils.nmf_inspect.
nmf_inspect(tfidf, vocab, k_vals=[K], n_words=50)

In [None]:
# Topic names, one per NMF topic: based on the categories assessed by Juan Shishido,
# then modified by the notebook author. The position in the list is the topic number.
labels = [
    'Reach Out!', 'Relocated', 'About Me', 'Hesitation', 'Casual',
    'The City', 'Novelty', 'Cool', 'Likes', 'Passions',
    'Easy Going', 'Region', 'Seeking', 'Thoughts', 'Fun',
    'New Here', 'Travel', 'Self-summary', 'Nots', 'Growing Up',
    'Carpe Diem', 'Good Company', 'Hobbies', 'Cultural Interests', 'Ambitious',
]

# Map each topic number (0-based list position) to its name.
label_dict = dict(enumerate(labels))
print(label_dict)

#### Next, we find a way of calculating and visualizing these topic distributions across our 4 chosen demographic variables

In [None]:
def get_label(group_num):
    """Look up the name assigned to a topic group.

    Parameters
    ----------
    group_num : int
        Key into the module-level `label_dict`

    Returns
    -------
    label : str
        The entry stored in `label_dict` under `group_num`

    Raises
    ------
    KeyError
        If `group_num` is not a key of `label_dict`
    """
    label = label_dict[group_num]
    return label

def format_df(df, demog, tfidf, groups=None):
    """Build the per-topic table of group proportions used by the plots.

    Each row of `df` is tagged with a topic group, the share of every `demog`
    level falling into each topic is computed, and the result is laid out
    with one row per topic and one column per `demog` level.

    Parameters
    ----------
    df : pd.DataFrame
        Original data frame for analysis. It is not modified: the `group`
        column is added to a copy rather than to the caller's frame.
    demog : str
        Name of the categorical variable (a column of `df`)
    tfidf : TF-IDF object
        Passed to `nmf_labels(tfidf, k=K)` to obtain the topic group of each
        row; unused when `groups` is supplied
    groups : array-like, default None
        Precomputed topic groups, one per row of `df`. Supplying them lets
        several demographics share one labelling instead of calling
        `nmf_labels` again for each.

    Returns
    -------
    ordered_df : pd.DataFrame
        Columns: `group`, one column per level of `demog` holding that
        level's proportion, `max` (the largest proportion in the row) and
        `label` (from `get_label`). Rows are sorted by `max`, ascending.
    """
    if groups is None:
        groups = nmf_labels(tfidf, k=K)
    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    labelled = df.assign(group=groups)

    # Keep the rows whose `demog` value is one of the observed levels.
    subset = subset_df(labelled, demog, labelled[demog].unique())
    grouped = group_pct(subset, demog)

    # One row per topic group, one column per demographic level. Pivoting only
    # `pct` yields flat column labels directly.
    pivoted = grouped.pivot(index='group', columns=demog, values='pct')
    pivoted['max'] = pivoted.max(axis=1)

    ordered_df = (
        pivoted.sort_values(by='max', ascending=True)
        .reset_index()
        .rename_axis(None, axis=1)
    )
    # Linking to label
    ordered_df['label'] = ordered_df['group'].apply(get_label)
    return ordered_df

In [None]:
# One formatted table per demographic variable. Written as separate statements:
# the earlier tuple assignment was split across lines with trailing commas and
# indented continuations, which is not valid Python.
height_df = format_df(df, 'height_group', tfidf)
race_df = format_df(df, 'race_ethnicity', tfidf)
edu_df = format_df(df, 'edu', tfidf)
fit_df = format_df(df, 'fit', tfidf)

In [None]:
#Abstracting these four into a function
def plot_topic_proportions(ordered_df, series, demographic_name, filename):
    """Draw one topic-proportion chart and save it to `filename`.

    Every topic is one horizontal row: a gray line runs from 0 to the row's
    `max` value and one coloured dot is drawn per demographic level. The
    legend is built from the same `series` entries as the dots, so legend
    colours cannot drift from the plotted colours.

    Parameters
    ----------
    ordered_df : pd.DataFrame
        Table as returned by `format_df`; must contain `max`, `label` and
        every column named in `series`
    series : list of (str, str, str)
        `(column, color, legend_label)` for each demographic level to draw
    demographic_name : str
        Text inserted at the end of the figure title
    filename : str
        Path of the image file to write

    Returns
    -------
    fig, ax
        The matplotlib figure and axes that were drawn on

    Example
    -------
    plot_topic_proportions(fit_df,
                           [('fit', 'blue', 'Fit'), ('not_fit', 'red', 'Not Fit')],
                           'Fitness', 'fit.png')
    """
    positions = list(range(1, len(ordered_df.index) + 1))

    # Tick font sizes are set before the figure is created so that they apply
    # to this figure's axes.
    plt.rc('ytick', labelsize=28)
    plt.rc('xtick', labelsize=28)
    fig, ax = plt.subplots(figsize=(18, 15))
    ax.title.set_position([.5, 1.05])

    ax.hlines(y=positions, xmin=0, xmax=ordered_df['max'], color='Gray')
    handles = []
    for column, color, legend_label in series:
        ax.plot(ordered_df[column], positions, "o", markersize=20, color=color)
        handles.append(mpatches.Patch(color=color, label=legend_label))

    # Add titles and axis names
    ax.set_yticks(positions)
    ax.set_yticklabels(list(ordered_df['label']))
    ax.set_title("Topics in OkCupid Male Self-Introductions Across Levels of {}".format(demographic_name),
                 loc='center', fontsize=40)
    ax.set_xlabel('Proportion of Users Using This Topic', fontsize=32)
    ax.set_ylabel('Topics Inferred from Essay', fontsize=32)
    ax.legend(handles=handles, loc='center right', fontsize='xx-large', borderpad=2)
    fig.savefig(filename, bbox_inches='tight')
    return fig, ax

In [None]:
#Plot for Education Levels
ordered_df = edu_df

# One y position per topic row.
my_range = range(1, len(ordered_df.index) + 1)
# Tick font sizes are set before the figure is created so that they apply to this
# figure's axes.
plt.rc('ytick', labelsize=28)
plt.rc('xtick', labelsize=28)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Each topic is a horizontal gray line from 0 to its largest proportion, with one
# dot per education level. (seaborn is already imported in the first cell.)
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['High School or less'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['More than High School'], my_range, "o", markersize=20, color='red')
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Education Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# Legend labels repeat the plotted column names; the blue series is the
# 'High School or less' column, which the legend used to call 'Less than High School'.
maroon_patch = mpatches.Patch(color='red', label='More than High School')
blue_patch = mpatches.Patch(color='blue', label='High School or less')
plt.legend(handles=[maroon_patch, blue_patch], loc='center right', fontsize='xx-large', borderpad=2)
# NOTE(review): the other plots are saved as fit.png / height.png / race.png;
# 'opinions.png' looks like a leftover name — confirm before renaming.
plt.savefig('opinions.png', bbox_inches='tight')


In [None]:
#Plot for Fitness Levels
ordered_df = fit_df

# One y position per topic row.
my_range = range(1, len(ordered_df.index) + 1)
# Tick font sizes are set before the figure is created so that they apply to this
# figure's axes.
plt.rc('ytick', labelsize=28)
plt.rc('xtick', labelsize=28)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Each topic is a horizontal gray line from 0 to its largest proportion, with one
# dot per fitness level. (seaborn is already imported in the first cell.)
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['fit'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_fit'], my_range, "o", markersize=20, color='red')
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Fitness Levels", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# The legend must mirror the plot calls above: 'fit' is drawn in blue and 'not_fit'
# in red. The legend previously had the two colours swapped.
blue_patch = mpatches.Patch(color='blue', label='Fit')
maroon_patch = mpatches.Patch(color='red', label='Not Fit')
plt.legend(handles=[blue_patch, maroon_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('fit.png', bbox_inches='tight')

In [None]:
#The Plot for Height
ordered_df = height_df

# One y position per topic row.
my_range = range(1, len(ordered_df.index) + 1)
# Tick font sizes are set before the figure is created so that they apply to this
# figure's axes.
plt.rc('ytick', labelsize=28)
plt.rc('xtick', labelsize=28)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Each topic is a horizontal gray line from 0 to its largest proportion, with one
# dot per height group. (matplotlib.patches and seaborn are already imported in the
# first cell.)
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
plt.plot(ordered_df['short'], my_range, "o", markersize=20, color='blue')
plt.plot(ordered_df['not_short'], my_range, "o", markersize=20, color='red')
# Add titles and axis names
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Height Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay',fontsize=32)
# The legend must mirror the plot calls above: 'short' is drawn in blue and
# 'not_short' in red. The legend previously had the two colours swapped.
blue_patch = mpatches.Patch(color='blue', label='Short')
maroon_patch = mpatches.Patch(color='red', label='Not Short')
plt.legend(handles=[blue_patch, maroon_patch], loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('height.png', bbox_inches='tight')

In [None]:
# The Plot for Races
ordered_df = race_df
my_range = range(1, len(ordered_df.index) + 1)
fig, ax = plt.subplots(figsize=(18, 15))
ttl = ax.title
ttl.set_position([.5, 1.05])

# Column of ordered_df -> marker colour, in drawing order. The column name is also
# used as the legend label.
race_colors = {
    'White': 'blue',
    'Black': 'red',
    'Asian': 'green',
    'Latinx': 'cyan',
    'multiple': 'magenta',
}

# seaborn is imported again here, as in the other plotting cells.
import seaborn as sns
# Each topic is a horizontal gray line from 0 to its largest proportion, with one
# dot per racial group.
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['max'], color='Gray')
for race_column, race_color in race_colors.items():
    plt.plot(ordered_df[race_column], my_range, "o", markersize=20, color=race_color)
plt.rc('ytick', labelsize=28)
plt.rc('xtick', labelsize=28)

# Titles, axis names and topic labels on the y axis.
plt.yticks(my_range, ordered_df['label'])
plt.title("Topics in OkCupid Male Self-Introductions Across Racial Groups", loc='center', fontsize=40)
plt.xlabel('Proportion of Users Using This Topic', fontsize=32)
plt.ylabel('Topics Inferred from Essay', fontsize=32)

# One legend patch per group, built in the same order as race_colors.
blue_patch, maroon_patch, green_patch, cyan_patch, magenta_patch = (
    mpatches.Patch(color=race_color, label=race_column)
    for race_column, race_color in race_colors.items()
)
legend_handles = [maroon_patch, blue_patch, green_patch, cyan_patch, magenta_patch]
plt.legend(handles=legend_handles, loc='center right', fontsize='xx-large', borderpad=2)
plt.savefig('race.png', bbox_inches='tight')