In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

# term.txt is read with header=None, so its first line is treated as data
termdf = pd.read_csv(
    'term.txt',
    sep='\t',
    header=None,
    names=['language', 'speaker', 'chip', 'term_abbrev'],
)
# dict.txt: the first line of the file is skipped and the columns are named explicitly
dictdf = pd.read_csv(
    'dict.txt',
    sep='\t',
    skiprows=[0],
    names=['language', 'term', 'translation', 'term_abbrev'],
)
# chip.txt: tab-separated, columns named explicitly
chipdf = pd.read_csv(
    'chip.txt',
    sep='\t',
    names=['chip', 'letter', 'number', 'letternumber'],
)

# mappings between different indices

# chip number (1..330) -> (letter, number), taken from the first chipdf row
# that carries that chip number
num_to_chip = {}
for chip_id in range(1, 331):
    first_match = chipdf.loc[chipdf['chip'] == chip_id].iloc[0]
    num_to_chip[chip_id] = (first_match['letter'], first_match['number'])

# inverse lookup: (letter, number) -> chip number
chip_to_num = {coord: chip_id for chip_id, coord in num_to_chip.items()}

# (row index, column index) of the 10 x 41 table -> chip number;
# rows 'A' and 'J' only contribute column 0
table_to_chipnum = {}
for row_idx, row_letter in enumerate('ABCDEFGHIJ'):
    columns = range(1) if row_letter in ('A', 'J') else range(41)
    for col_idx in columns:
        table_to_chipnum[(row_idx, col_idx)] = chip_to_num[(row_letter, col_idx)]

# language ids analysed in this notebook and the number of topics used for each
languages = [16, 20, 51, 56, 60, 64, 74, 87]
bct_counts = {16: 6, 20: 7, 51: 5, 56: 4, 60: 4, 64: 5, 74: 4, 87: 5}
num_chips = 330

In [None]:
# fixed seed, passed as random_state to every LDA model in this notebook
seed = 8
# unseeded generator, only needed when drawing an alternative seed
rng = np.random.default_rng()
# to test other seeds uncomment next line
# seed = rng.integers(0, 10)

# heatmap plotting function following the guide in the matplotlib documentation
def plot_heatmap(data, ax):
    """Draw ``data`` on ``ax`` as a heatmap with a white cell grid.

    Parameters
    ----------
    data : 2-D array
        Values to display; one major tick is placed per row and per column.
    ax : matplotlib Axes
        Axes to draw on.

    Returns
    -------
    The image object returned by ``ax.imshow``.

    Notes
    -----
    The tick labels are fixed to '0'..'40' for columns and 'A'..'J' for rows,
    i.e. the function is written for a 10 x 41 table.
    """
    n_rows = data.shape[0]
    n_cols = data.shape[1]

    image = ax.imshow(data, cmap='hot_r')

    # one major tick per cell centre
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))

    ax.set_xticklabels([str(col) for col in range(41)])
    ax.set_yticklabels(list('ABCDEFGHIJ'))

    # column labels are shown along the top edge instead of the bottom
    ax.tick_params(top=True, bottom=False, labeltop=True, labelbottom=False)

    for spine in ax.spines.values():
        spine.set_visible(False)

    # minor ticks sit on the cell boundaries; the white grid drawn on them
    # visually separates the cells
    ax.set_xticks(np.arange(n_cols + 1) - .5, minor=True)
    ax.set_yticks(np.arange(n_rows + 1) - .5, minor=True)
    ax.grid(which="minor", color="w", linestyle='-', linewidth=3)
    ax.tick_params(which="minor", bottom=False, left=False)

    return image

# build the document word matrix for each language in languages
def build_word_counts(languages):
    """Build one term-by-chip count matrix per language.

    Reads the module-level ``dictdf``, ``termdf`` and ``num_chips``.

    Parameters
    ----------
    languages : iterable
        Language ids to process.

    Returns
    -------
    dict
        ``{language: ndarray}``. Each array has shape
        ``(largest term index of that language in dictdf, num_chips)``.
        Entry ``[t - 1, c - 1]`` is the number of ``termdf`` rows of that
        language with chip ``c`` whose abbreviation has ``t`` as its smallest
        term index in ``dictdf``.

    Notes
    -----
    Rows of ``termdf`` whose abbreviation does not occur in the language's
    dictionary, or whose chip number is outside ``1..num_chips``, are ignored.
    Missing (NaN) abbreviations are ignored as well.
    """
    word_counts = {}
    for language in languages:
        lang_dict = dictdf[dictdf['language'] == language]
        lang_terms = termdf[termdf['language'] == language]
        num_terms = lang_dict['term'].max()

        # use the smallest term index for the abbreviation
        first_term = (
            lang_dict.groupby('term_abbrev', sort=False)['term'].min().to_dict()
        )

        word_count = np.zeros((num_terms, num_chips), dtype=int)

        # count every (abbreviation, chip) pair in a single pass instead of
        # re-scanning the whole term table once per pair
        pair_counts = lang_terms.groupby(['term_abbrev', 'chip'], sort=False).size()
        for (abbrev, chip), count in pair_counts.items():
            if abbrev not in first_term or not 1 <= chip <= num_chips:
                continue
            word_count[first_term[abbrev] - 1, int(chip) - 1] = count

        word_counts[language] = word_count

    return word_counts

# build a topics over chips model   
def topics_over_chips(word_count, num_categories):
    """Fit an LDA model on ``word_count`` and return its ``components_``.

    ``word_count`` is passed to the model as given. The model uses
    ``num_categories`` components and the module-level ``seed`` as
    ``random_state``.
    """
    lda_params = dict(
        n_components=num_categories,
        random_state=seed,
        max_doc_update_iter=500,
        max_iter=100,
    )
    model = LatentDirichletAllocation(**lda_params)
    model.fit(word_count)
    return model.components_

# build a topics over terms model
def topics_over_terms(word_count, num_categories):
    """Fit an LDA model on the transpose of ``word_count`` and return its ``components_``.

    The model uses ``num_categories`` components and the module-level
    ``seed`` as ``random_state``.
    """
    lda_params = dict(
        n_components=num_categories,
        random_state=seed,
        max_doc_update_iter=500,
        max_iter=100,
    )
    model = LatentDirichletAllocation(**lda_params)
    model.fit(word_count.T)
    return model.components_

# plot histograms for topics in the topics over terms model
def topics_over_terms_histograms(language, topics):
    """Plot one stacked histogram per topic and save the figure as a PNG.

    Each row of ``topics`` is normalised to sum to 1. Its values are split
    into three groups (>= 0.6, between 0.3 and 0.6, < 0.3) and drawn as a
    stacked histogram on the topic's own axes.

    Parameters
    ----------
    language : int
        Language id; only used in the output file name.
    topics : 2-D float ndarray, one row per topic
        NOTE: normalised IN PLACE. ``test_num_topics`` passes the same array
        to ``find_representative_terms`` afterwards and relies on it having
        been normalised here, so this side effect is kept deliberately.

    Side effects
    ------------
    Saves 'lang{language}_{n_topics}topichistSEED{seed}.png' in the working
    directory, where ``seed`` is the module-level seed.
    """
    # normalization for histograms (in place -- see docstring)
    topics /= topics.sum(axis=1)[:, np.newaxis]
    num_categories = topics.shape[0]

    # up to five topics share a single column; beyond that the grid uses
    # five rows and as many columns as needed
    if num_categories > 5:
        num_rows = 5
        num_cols = -(-num_categories // num_rows)  # ceiling division
    else:
        num_rows = num_categories
        num_cols = 1

    # squeeze=False keeps `axs` two-dimensional, so a single topic (1 x 1 grid)
    # is indexed the same way as any other layout
    fig, axs = plt.subplots(num_rows, num_cols, sharey=True, figsize=(8, 12),
                            squeeze=False)

    # we say something is representative if it has probability > 0.6
    # and potentially representative if it has probability > 0.3
    rep_threshold = 0.6
    pot_threshold = 0.3

    for tid, topic in enumerate(topics):
        representative = topic >= rep_threshold
        pot_representative = (topic >= pot_threshold) & (topic < rep_threshold)
        not_representative = topic < pot_threshold
        bars = [topic[representative], topic[pot_representative], topic[not_representative]]

        # topics fill the grid column by column
        ax = axs[tid % num_rows, tid // num_rows]
        ax.hist(bars, color=['g', 'b', 'r'], histtype='barstacked',
                edgecolor='black',
                label=['Representative', 'Potentially', 'Nonrepresentative'])

        # only the first column gets a y-axis label
        if tid < num_rows:
            ax.set_ylabel('Count')

        # very small probabilities are easier to read in scientific notation
        if np.max(topic) < 0.1:
            ax.xaxis.set_major_formatter(mticker.FormatStrFormatter('%.1e'))

        if tid == 0:
            ax.legend()

    # drop the unused axes of a partially filled grid
    for ind in range(num_categories, num_rows * num_cols):
        axs[ind % num_rows, ind // num_rows].remove()

    fig.savefig('lang{}_{}topichistSEED{}.png'.format(language, num_categories, seed))

# create a topics-over-terms LDA model for each number in `num_cats_list`
# and plot topics-over-terms histograms for each
def test_num_topics(language, num_cats_list):
    """Fit one LDA model per candidate topic count and report on each.

    For every ``num`` in ``num_cats_list`` an LDA model with ``num``
    components is fitted on the transposed word-count matrix of ``language``
    (taken from the module-level ``word_counts``). Its topics are plotted
    with ``topics_over_terms_histograms`` and the representative terms are
    printed.

    Note: ``topics_over_terms_histograms`` normalises ``topics`` in place, so
    ``find_representative_terms`` receives the normalised values.
    """
    transposed_counts = word_counts[language].T
    for num in num_cats_list:
        model = LatentDirichletAllocation(
            n_components=num,
            random_state=seed,
            max_doc_update_iter=500,
            max_iter=100,
        )
        topics = model.fit(transposed_counts).components_

        topics_over_terms_histograms(language, topics)
        rep_terms = find_representative_terms(language, topics)
        print('For num = ', num, 'representative terms:')
        print(rep_terms)

# find the representative terms of the topics using
# the topics over terms model
def find_representative_terms(language, topics):
    """Return, per topic, the terms whose value in that topic is at least 0.6.

    Parameters
    ----------
    language : int
        Language id used to select rows of the module-level ``dictdf``.
    topics : iterable of 1-D arrays
        One array per topic; position ``k`` corresponds to term index
        ``k + 1``. The 0.6 threshold is applied to the values as given, so
        callers are expected to pass row-normalised topics.

    Returns
    -------
    dict
        ``{topic index: {term index: abbreviation}}``. The abbreviation is
        read from the first ``dictdf`` row of the language with that term
        index. Term indices are plain Python ints.

    Raises
    ------
    IndexError
        If a representative term index has no row in the language's dictionary.
    """
    # we say something is representative if it has probability > 0.6
    rep_threshold = 0.6
    # the dictionary slice does not depend on the topic, so filter it once
    lang_dict = dictdf[dictdf['language'] == language]

    rep_terms = {}
    for tid, topic in enumerate(topics):
        abbrevs = {}
        for rep in np.nonzero(topic >= rep_threshold)[0]:
            term = int(rep) + 1  # term indices are 1-based
            abbrevs[term] = lang_dict.loc[lang_dict['term'] == term, 'term_abbrev'].iloc[0]
        rep_terms[tid] = abbrevs
    return rep_terms
    
# plot heatmaps for the topic distributions using
# the topics over chips model
def topics_over_chips_heatmaps(language, topics):
    """Draw one 10 x 41 heatmap per topic, stacked vertically, and save the figure.

    Parameters
    ----------
    language : int
        Language id; only used in the output file name.
    topics : 2-D ndarray, one row per topic
        ``topic[chip - 1]`` is placed at the table cell that the module-level
        ``table_to_chipnum`` maps to ``chip``.

    Side effects
    ------------
    Saves 'lang{language}_topicmapSEED{seed}.png' in the working directory,
    where ``seed`` is the module-level seed.
    """
    # normalization for colormaps
    topics_max = np.max(topics)
    num_categories = topics.shape[0]

    # squeeze=False keeps `axes` two-dimensional, so a single topic works too
    fig, axes = plt.subplots(num_categories, 1, squeeze=False)
    for tid, topic in enumerate(topics):
        data = np.zeros((10, 41))
        # Cell (0, 40) holds no chip in table_to_chipnum as built in this
        # file (row 'A' only has column 0). Writing the overall maximum there
        # gives every panel the same largest value, hence the same colour scale.
        data[0, 40] = topics_max
        for i in range(data.shape[0]):
            for j in range(data.shape[1]):
                if (i, j) in table_to_chipnum:
                    data[i, j] = topic[table_to_chipnum[(i, j)] - 1]

        panel = axes[tid, 0]
        plot_heatmap(data, panel)
        panel.set_title('Topic {}'.format(tid))

    fig.set_size_inches((20, 5*num_categories))
    fig.savefig('lang{}_topicmapSEED{}.png'.format(language, seed))

# finds the most strongly associated topic for each color term in bcts
def topics_over_chips_best_topics(language, num_categories, bcts):
    """Return, for every term index in ``bcts``, its highest-weighted topic.

    An LDA model with ``num_categories`` components is fitted on the
    module-level ``word_counts[language]`` and the same matrix is then
    transformed. Row ``bct - 1`` of the transformed matrix is the topic
    distribution of term ``bct``.

    Returns
    -------
    dict
        ``{bct: (topic index, weight of that topic)}``.
    """
    counts = word_counts[language]
    model = LatentDirichletAllocation(
        n_components=num_categories,
        random_state=seed,
        max_doc_update_iter=500,
        max_iter=100,
    )
    model.fit(counts)
    doc_topic = model.transform(counts)

    best_topics = {}
    for bct in bcts:
        row = doc_topic[bct - 1, :]  # term indices are 1-based
        winner = np.argmax(row)
        best_topics[bct] = (winner, row[winner])

    return best_topics
   
# get the data used for heatmaps on the topics over chips model
def topics_over_chips_data(language, topics):
    """Return the per-topic 10 x 41 heatmap tables, scaled to a maximum of 1.

    Parameters
    ----------
    language : int
        Unused; kept so the signature matches ``topics_over_chips_heatmaps``.
    topics : 2-D ndarray, one row per topic
        ``topic[chip - 1]`` is placed at the table cell that the module-level
        ``table_to_chipnum`` maps to ``chip``. The array is NOT modified.

    Returns
    -------
    dict
        ``{topic index: ndarray of shape (10, 41)}``.

    Notes
    -----
    * The input is scaled on a copy, so re-running a cell that calls this
      function gives the same result every time.
    * Cell (0, 40) carries the sentinel that ``topics_over_chips_heatmaps``
      uses to share one colour scale. It is written on the same scale as the
      rest of the table (1.0, the maximum after scaling), not as the raw
      maximum.
    * No matplotlib figure is created; this function only computes data.
    """
    # normalization for colormaps (out of place: the caller's array is untouched)
    topics_max = np.max(topics)
    scaled_topics = topics / topics_max

    to_return = {}
    for tid, topic in enumerate(scaled_topics):
        data = np.zeros((10, 41))
        # sentinel on the scaled range, see Notes
        data[0, 40] = 1.0
        for i in range(data.shape[0]):
            for j in range(data.shape[1]):
                if (i, j) in table_to_chipnum:
                    data[i, j] = topic[table_to_chipnum[(i, j)] - 1]

        to_return[tid] = data

    return to_return

In [None]:
# build the document word matrices: one count matrix per language id
word_counts = build_word_counts(languages=languages)

In [None]:
# train LDA models for each language, with the number of topics
# equal to the BCT count given in the literature
chip_topics, term_topics = {}, {}
for language in languages:
    lang_counts = word_counts[language]
    n_topics = bct_counts[language]
    chip_topics[language] = topics_over_chips(lang_counts, n_topics)
    term_topics[language] = topics_over_terms(lang_counts, n_topics)

In [None]:
# for languages 20 and 16, try to determine the number of color categories
# by training models with a range of topic counts (4..10 and 2..8)
print('Language 20')
test_num_topics(language=20, num_cats_list=list(range(4, 11)))
print('\nLanguage 16')
test_num_topics(language=16, num_cats_list=list(range(2, 9)))

In [None]:
# for each language, plot the topic histograms (topics over terms model)
# and the topic heatmaps (topics over chips model)
for language in languages:
    histogram_topics = term_topics[language]
    topics_over_terms_histograms(language, histogram_topics)
    heatmap_topics = chip_topics[language]
    topics_over_chips_heatmaps(language, heatmap_topics)

In [None]:
# compute the cosine similarity of two matrices
# by flattening them into arrays
def cosine_sim(X, Y):
    """Cosine similarity of two arrays, each flattened to a vector first.

    Returns ``dot(x, y) / (||x|| * ||y||)``. No guard is applied for
    zero-norm input.
    """
    flat_x, flat_y = X.flatten(), Y.flatten()
    norm_product = np.linalg.norm(flat_x) * np.linalg.norm(flat_y)
    return np.dot(flat_x, flat_y) / norm_product

# how similar are the color categories the model finds with the color categories
# from the literature?
lang16bcts = range(1, 7)
lang20bcts = [1, 2, 4, 5, 6, 12, 13]


def _load_bct_tables(path_template, bcts):
    """Read one header-less CSV per term index in ``bcts``; return ``{index: ndarray}``."""
    tables = {}
    for i in bcts:
        df = pd.read_csv(path_template.format(i), header = None)
        tables[i] = df.to_numpy()
    return tables


def _match_topics_to_bcts(heatmaps, bcts, bct_data):
    """Print distance metrics for every (topic, BCT) pair and return the best matches.

    A topic is matched to the BCT with the largest cosine similarity, provided
    that similarity is greater than 0; otherwise the topic has no entry.
    """
    matches = {}
    for tid in heatmaps:
        print('Topic {}'.format(tid))
        best_sim = 0
        for i in bcts:
            difference = heatmaps[tid] - bct_data[i]
            frobenius = np.linalg.norm(difference)
            l1 = np.linalg.norm(difference, ord = 1)
            cos_sim = cosine_sim(heatmaps[tid], bct_data[i])
            if cos_sim > best_sim:
                matches[tid] = i
                best_sim = cos_sim
            print('BCT: {}, frob: {}, L1: {}, cos : {}'.format(i, frobenius, l1, cos_sim))
        print()
    return matches


lang16_bct_data = _load_bct_tables('Data for Ryan/L16_W{}.csv', lang16bcts)
lang20_bct_data = _load_bct_tables('Data for Ryan/L20_W{}.csv', lang20bcts)

lang16heatmaps = topics_over_chips_data(16, chip_topics[16])
lang20heatmaps = topics_over_chips_data(20, chip_topics[20])

print('Lang 16 Comparison\n')
lang16_matches = _match_topics_to_bcts(lang16heatmaps, lang16bcts, lang16_bct_data)

print('Lang 20 Comparison\n')
lang20_matches = _match_topics_to_bcts(lang20heatmaps, lang20bcts, lang20_bct_data)

print('Lang 16 Matches:', lang16_matches)
print('Lang 20 Matches:', lang20_matches)

In [None]:
# compute the best topics using the topics over chips model
# NOTE(review): language 16 is fitted with 7 topics here, while bct_counts[16]
# is 6 and chip_topics[16] was trained with 6 -- confirm that 7 is intended,
# since topic ids from a 7-topic model cannot be compared with lang16_matches.
print('Lang 20 best_topics:\n')
lang20_best_topics = topics_over_chips_best_topics(language=20, num_categories=7, bcts=lang20bcts)
print(lang20_best_topics)
print('\nLang 16 best_topics:\n')
lang16_best_topics = topics_over_chips_best_topics(language=16, num_categories=7, bcts=lang16bcts)
print(lang16_best_topics)