# Homework 08

```yaml
Course:    DS 5001 
Module:    08 Homework
Topic:     LDA with SciKit Learn
Author:    Ryan Lipps
Date:      23 March 2023
```

## Setup

### Packages

In [1]:
import pandas as pd
import numpy as np
import configparser
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

### Config

In [2]:
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
local_lib = config['DEFAULT']['local_lib']

### Read data

In [3]:
CORPUS = pd.read_csv(f'{data_home}/novels/novels-CORPUS.csv')
LIB = pd.read_csv(f'{data_home}/novels/novels-LIB.csv').set_index('book_id')

In [4]:
CORPUS.head()

Unnamed: 0,book_id,chap_id,para_num,sent_num,token_num,pos,term_str
0,secretadversary,1,0,1,0,DT,the
1,secretadversary,1,0,1,1,NNP,young
2,secretadversary,1,0,1,2,NNP,adventurers
3,secretadversary,1,0,1,3,NNP,ltd
4,secretadversary,1,1,0,0,JJ,tommy


In [5]:
CORPUS = CORPUS.set_index(['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']).dropna()
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy


In [6]:
LIB.head()

Unnamed: 0_level_0,genre_id,author_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
secretadversary,d,christie
styles,d,christie
moonstone,d,collins
adventures,d,doyle
baskervilles,d,doyle


## Topic Model Class

In [7]:
class TopicModel():
    '''
    DOCSTRING GOES HERE
    '''
    def __init__(self, corpus:pd.DataFrame, lib:pd.DataFrame, bag=list, pos_filter=str):
        '''
        DOCSTRING GOES HERE
        '''
        self.corpus = corpus
        self.lib = lib
        self.bag = bag
        self.pos_filter = pos_filter
        self.docs = self.__create_docs(pos_filter)


    def __create_docs(self, pos_filter:list):
        '''
        Function to generate docs by a specified bag from a corpus

        Corpus here is loosely defined, as this function will work provided the `level` parameter is in the multi-index of `corpus`

        PARAMETERS:

        `corpus` - pandas DataFrame of body of work. It must be multi-indexed by an OHCO

        `level` - string of OHCO to group by for bags

        OUTPUTS:

        pandas DataFrame of bag documents grouped by `level`

        EXAMPLE:

        `DOC = get_doc(CORPUS, 'chap_id')`
        '''
        # Get multi-index from `corpus` df
        idx = list(self.corpus.index.names)

        # Check to see that `level` exists in `corpus` OHCO
        # Raise error if not
        #if (self.bag not in idx):
            #raise KeyError (f'{self.bag} not found in corpus OHCO')

        # Filter corpus for pos
        # Split-apply-combine to generate DOCs grouped by `level`
        return self.corpus[self.corpus.pos.str.match(fr'^{pos_filter}?$')]\
            .groupby(self.bag).term_str\
            .apply(lambda x: ' '.join(x))\
            .to_frame()\
            .rename(columns={'term_str':'doc_str'})


    def count_vectorize(self, max_features=4000, ngram_range = (1,1), stop_words='english'):
        '''
        DOCSTRING GOES HERE
        '''
        self.count_engine = CountVectorizer(max_features=max_features, ngram_range=ngram_range, stop_words=stop_words)
        self.count_model = self.count_engine.fit_transform(self.docs.doc_str)
        self.terms = self.count_engine.get_feature_names_out()

        return None
    

    def generate_lda_model(self, random_state:int, n_topics:int=20, max_iter:int=5, learning_offset:float=50.):
        '''
        DOCSTRING GOES HERE
        '''
        self.lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=learning_offset, random_state=random_state)
        self.lda_model = self.lda_engine.fit_transform(self.count_model)
        self.topic_names = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
        return None

    def generate_vocab(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.vocab = pd.DataFrame(index=self.terms)
        self.vocab.index.name = 'term_str'
        return None
    

    def generate_dtm(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.dtm = pd.DataFrame(self.count_model.toarray(), index=self.docs.index, columns=self.terms)
        self.vocab['doc_count'] = self.dtm.astype('bool').astype('int').sum()
        self.docs['term_count'] = self.dtm.sum(1)
        return None
    
    
    def generate_theta(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.theta = pd.DataFrame(self.lda_model, index=self.docs.index)
        self.theta.columns.name = 'topic_id'
        self.theta.columns = self.topic_names
        return None
    

    def generate_phi(self):
        '''
        DOCSTRING GOES HERE
        '''
        self.phi = pd.DataFrame(self.lda_engine.components_, columns=self.terms, index=self.topic_names)
        self.phi.index.name = 'topic_id'
        self.phi.columns.name = 'term_str'
        return None
    

    def generate_topics(self, n_top_terms:int=7):
        '''
        DOCSTRING GOES HERE
        '''
        self.topics = self.phi.stack().groupby('topic_id')\
            .apply(lambda x: ' '.join(x.sort_values(ascending=False)\
                                      .head(n_top_terms)\
                                        .reset_index().term_str))\
            .to_frame('top_terms')
        self.topics['doc_weight_sum'] = self.theta.sum()
        self.topics['term_freq'] = self.phi.sum(1) / self.phi.sum(1).sum()
        return None
    
    def append_metadata_to_topics(self, metadata:str, join_on:str):
        '''
        DOCSTRING GOES HERE
        '''
        _md_frame = sorted(self.lib[metadata].value_counts().index.to_list())
        self.topics[_md_frame] = self.theta.join(self.lib, on=join_on).groupby(metadata)[self.topic_names].mean().T
        self.topics[metadata] = self.topics[_md_frame].idxmax(1)
        return None

In [8]:
paragraph_model = TopicModel(CORPUS, LIB, ['book_id', 'chap_id', 'para_num'], 'NNS')
paragraph_model.count_vectorize()
paragraph_model.generate_vocab()
paragraph_model.generate_dtm()
paragraph_model.generate_lda_model(random_state=0)
paragraph_model.generate_phi()
paragraph_model.generate_theta()
paragraph_model.generate_topics()
paragraph_model.append_metadata_to_topics('genre_id', 'book_id')
paragraph_model.append_metadata_to_topics('author_id', 'book_id')

In [9]:
chapter_model = TopicModel(CORPUS, LIB, ['book_id', 'chap_id'], 'NNS')
chapter_model.count_vectorize()
chapter_model.generate_vocab()
chapter_model.generate_dtm()
chapter_model.generate_lda_model(random_state=0)
chapter_model.generate_phi()
chapter_model.generate_theta()
chapter_model.generate_topics()
chapter_model.append_metadata_to_topics('genre_id', 'book_id')
chapter_model.append_metadata_to_topics('author_id', 'book_id')

In [10]:
paragraph_model.phi.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.05,1.392494,1.097508,0.05,5.556778,0.05,5.531952,0.05,16.093926,0.960684,...,0.05,0.05,0.05,0.05,0.05,0.05,3.598732,0.05,0.05,2.321462
T01,1.389969,0.05,0.05,0.05,1.05,0.05,0.05,0.05,2.151322,4.05,...,2.846534,5.585872,3.05,0.05,1.836272,2.599044,0.731493,0.05,0.05,0.05
T02,0.05,0.05,0.05,0.05,5.045453,0.05,0.05,0.05,1.460901,0.05,...,0.05,0.05,18.417949,0.05,0.05,31.982578,0.086155,1.05,0.064635,1.047011
T03,0.051125,0.05,0.05,0.05,0.05,0.05,0.05,1.184665,5.130487,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.244053,0.05,26.009869,0.091742
T04,0.05,0.1113,0.05,3.570093,0.05,9.544803,0.05,0.05,1.19821,0.756529,...,0.05,0.05,0.05,0.05,0.505932,0.05,3.277104,2.120083,0.051028,0.05


In [11]:
chapter_model.phi.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.05,0.05,0.05,0.05,0.050011,0.05,1.425109,0.05,0.703066,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.702322,0.05,0.05,0.05
T01,0.05,0.05,0.732067,1.381887,2.463208,1.428486,2.968166,5.892598,17.19947,1.056784,...,0.05,8.754761,21.900551,19.614846,5.881342,49.740533,8.642145,0.05,21.015508,4.510612
T02,0.05,7.035474,0.749836,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
T03,0.05,0.064885,0.05,0.05,0.05,0.05,0.05,0.05,0.086202,0.05,...,0.05,0.05,1.199449,1.59606,4.477756,3.35327,0.055813,0.05,4.119849,0.05
T04,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,4.339374,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05


## Question 1:
Use the PHI table from each model to compute the entropy H of the distribution over topics. Which bag generates a lower entropy distribution? Hint: To get H work with the L1 normalized vector of word weight sums by topic in the PHI table.

### Answer 1:
**The paragraph model has a lower entropy distribution**

In [12]:
L1_para = paragraph_model.phi.apply(lambda x: x/x.sum(), axis=1)

In [13]:
L1_p_para = L1_para.apply(lambda x: x*np.log2(1/x), axis=1)
L1_p_para.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,6.4e-05,0.001308,0.001057,6.4e-05,0.004435,6.4e-05,0.004418,6.4e-05,0.011104,0.000938,...,6.4e-05,6.4e-05,6.4e-05,6.4e-05,6.4e-05,6.4e-05,0.003032,6.4e-05,6.4e-05,0.002059
T01,0.001632,8e-05,8e-05,8e-05,0.001271,8e-05,8e-05,8e-05,0.002403,0.004188,...,0.003075,0.005542,0.003267,8e-05,0.002089,0.002838,0.00092,8e-05,8e-05,8e-05
T02,0.00011,0.00011,0.00011,0.00011,0.006815,0.00011,0.00011,0.00011,0.002306,0.00011,...,0.00011,0.00011,0.020494,0.00011,0.00011,0.032345,0.000181,0.001721,0.000139,0.001717
T03,8.7e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,0.0015,0.005446,8.5e-05,...,8.5e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,8.5e-05,0.000363,8.5e-05,0.021718,0.000149
T04,0.000111,0.00023,0.000111,0.005086,0.000111,0.011858,0.000111,0.000111,0.001949,0.001295,...,0.000111,0.000111,0.000111,0.000111,0.000904,0.000111,0.00472,0.003225,0.000113,0.000111


In [14]:
L1_para['entropy'] = L1_p_para.sum(axis=1)
L1_para.entropy.describe()

count    20.000000
mean      9.272175
std       0.246850
min       8.856818
25%       9.065714
50%       9.250806
75%       9.574828
max       9.617709
Name: entropy, dtype: float64

In [15]:
L1_chap = chapter_model.phi.apply(lambda x: x/x.sum(), axis=1)

In [16]:
L1_p_chap = L1_chap.apply(lambda x: x*np.log2(1/x), axis=1)
L1_p_chap.head()

term_str,abbess,abbey,abhorrence,abilities,ability,abode,abroad,abruptly,absence,absent,...,yew,yonder,youd,youll,young,youre,youth,youths,youve,zeal
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
T00,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.004843,0.000243,0.002607,0.000243,...,0.000243,0.000243,0.000243,0.000243,0.000243,0.000243,0.002605,0.000243,0.000243,0.000243
T01,2.4e-05,2.4e-05,0.000284,0.000505,0.000849,0.00052,0.001004,0.001849,0.004744,0.000396,...,2.4e-05,0.002625,0.005853,0.005319,0.001846,0.011844,0.002595,2.4e-05,0.005647,0.001458
T02,0.000509,0.037179,0.005626,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,...,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509,0.000509
T03,6.6e-05,8.4e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,6.6e-05,0.000109,6.6e-05,...,6.6e-05,6.6e-05,0.001181,0.001524,0.003788,0.002939,7.3e-05,6.6e-05,0.003521,6.6e-05
T04,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.033688,0.000706,...,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706,0.000706


In [17]:
L1_chap['entropy'] = L1_p_chap.sum(axis=1)
L1_chap.entropy.describe()

count    20.000000
mean      9.774324
std       0.556430
min       9.069158
25%       9.464394
50%       9.661949
75%      10.048807
max      11.532409
Name: entropy, dtype: float64

## Question 2:
Sort the topics in each model's PHI table by topic entropy in descending order. Are the first topics in the two models about the same? In other words, do they yield similar interpretations?

### Answer 2:
**Yes, they yield similar interpretations. They appear to be about father/son and creator/created relationships**

In [18]:
L1_para['entropy'].sort_values(ascending=False).head(1)

topic_id
T17    9.617709
Name: entropy, dtype: float64

In [19]:
paragraph_model.topics.loc['T11'].top_terms

'yes father son box matter place way'

In [20]:
L1_chap['entropy'].sort_values(ascending=False).head(1)

topic_id
T12    11.532409
Name: entropy, dtype: float64

In [21]:
chapter_model.topics.loc['T04'].top_terms

'man heart creator rage creature brother companion'

## Question 3:
What topic from each model is most strongly associated with each genre? Note that your answer have four parts.

### Answer 3:
**Paragraph Model**

* **Detective is most strongly associated with topic 8**
* **Gothic is most strongly associated with topic 5**

**Chapter Model**

* **Detective is most strongly associated with topic 1**
* **Gothic is most strongly associated with topic 5**


In [22]:
paragraph_model.topics[['top_terms', 'd']].sort_values('d', ascending=False).head(1)

Unnamed: 0_level_0,top_terms,d
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T08,morning house time place way thing mr,0.072432


In [23]:
paragraph_model.topics[['top_terms', 'g']].sort_values('g', ascending=False).head(1)

Unnamed: 0_level_0,top_terms,g
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T05,heart moment eyes life countenance tears world,0.112456


In [24]:
chapter_model.topics[['top_terms', 'd']].sort_values('d', ascending=False).head(1)

Unnamed: 0_level_0,top_terms,d
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T01,man time room yes way door sir,0.467562


In [25]:
chapter_model.topics[['top_terms', 'g']].sort_values('g', ascending=False).head(1)

Unnamed: 0_level_0,top_terms,g
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T05,heart eyes time hand moment night voice,0.211929


## Question 4:
Using the THETA table from the Chapters model, get the mean topic weights for each book. Which book is most strongly associated with the gothic genre g, based on the weight of that genre's most representative topic (as discovered in the previous question)?

### Answer 4:
**Monk**

In [31]:
chapter_model.theta.groupby('book_id')\
    .mean()\
    .sort_values('T05', ascending=False)

Unnamed: 0_level_0,T00,T01,T02,T03,T04,T05,T06,T07,T08,T09,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
monk,3.4e-05,0.001713,3.4e-05,0.00186,3.4e-05,0.977318,3.4e-05,0.00756,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,3.4e-05,0.001467,3.4e-05,3.4e-05,0.002916,0.006724,3.4e-05
oldenglishbaron,5.5e-05,0.003529,5.5e-05,0.013446,5.5e-05,0.400547,0.001634,0.196985,5.5e-05,5.5e-05,5.5e-05,0.242466,5.5e-05,0.099948,0.009078,0.012164,0.01233,5.5e-05,0.007379,5.5e-05
usher,4.3e-05,4.3e-05,4.3e-05,4.3e-05,4.3e-05,0.292059,0.027631,4.3e-05,0.180845,0.11597,0.025197,4.3e-05,4.3e-05,4.3e-05,0.156455,4.3e-05,0.159632,0.006116,4.3e-05,0.035622
frankenstein,0.003804,0.000158,0.000158,0.00666,0.062474,0.28431,0.006261,0.025802,0.000158,0.002357,0.016381,0.005853,0.000158,0.001606,0.152014,0.036595,0.000158,0.195686,0.180214,0.019194
udolpho,0.005035,0.000762,0.000828,0.00251,0.000579,0.157503,0.000796,0.016003,8.7e-05,0.000306,8.7e-05,0.005993,8.7e-05,8.7e-05,0.103448,0.001391,0.031048,0.336,0.003899,0.333553
northangerabbey,0.00085,0.000163,0.032283,0.000163,0.032263,0.133702,0.000163,0.351083,0.007458,0.000163,0.013114,0.000163,0.000163,0.000163,0.000163,0.002167,0.082044,0.003637,0.310777,0.02932
reddeath,0.000121,0.000121,0.000121,0.000121,0.000121,0.068769,0.000121,0.000121,0.000121,0.057754,0.000121,0.000121,0.0745,0.000121,0.491069,0.000121,0.306092,0.000121,0.000121,0.000121
castleofotranto,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05,0.023148,4.1e-05,4.1e-05,0.000587,4.1e-05,4.1e-05,0.811165,4.1e-05,0.164443,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05,4.1e-05
dracula,6.2e-05,0.010079,0.040029,0.541501,6.2e-05,0.015929,6.2e-05,0.034058,6.2e-05,0.001156,0.028031,6.2e-05,6.2e-05,6.2e-05,0.01582,0.170249,0.017614,0.018019,0.107016,6.2e-05
styles,8.7e-05,0.9356,8.7e-05,0.020793,8.7e-05,0.014734,8.7e-05,8.7e-05,8.7e-05,8.7e-05,0.00481,8.7e-05,8.7e-05,8.7e-05,8.7e-05,8.7e-05,0.004922,8.7e-05,0.010485,0.007526


## Question 5:
How would you characterize the subject matter of the two genres based on their topic models? Consider the words associated with the dominant topics from each model, but also the models overall.

### Answer 5:
****

In [40]:
paragraph_model.topics[['top_terms', 'genre_id']].sort_values('genre_id', ascending=False)

Unnamed: 0_level_0,top_terms,genre_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T19,subject person tuppence chateau lord night course,g
T18,day country sun home sea mountains place,g
T17,heart man mind love time evening friend,g
T16,door room chamber hand apartment look moment,g
T05,heart moment eyes life countenance tears world,g
T07,day letter time dear friends friend way,g
T14,sir voice light sound steps door distance,g
T11,yes father son box matter place way,d
T15,face eyes hand hands night man lips,d
T13,work right word way time thou tm,d


In [42]:
paragraph_model.topics[['top_terms', 'genre_id']].sort_values('genre_id', ascending=False)

Unnamed: 0_level_0,top_terms,genre_id
topic_id,Unnamed: 1_level_1,Unnamed: 2_level_1
T19,subject person tuppence chateau lord night course,g
T18,day country sun home sea mountains place,g
T17,heart man mind love time evening friend,g
T16,door room chamber hand apartment look moment,g
T05,heart moment eyes life countenance tears world,g
T07,day letter time dear friends friend way,g
T14,sir voice light sound steps door distance,g
T11,yes father son box matter place way,d
T15,face eyes hand hands night man lips,d
T13,work right word way time thou tm,d
