In [1]:
import scattertext as st
import scattertextvl as stvl
import pandas as pd
import spacy


In [2]:
nlp = spacy.load('en_core_web_sm')

convention_df = st.SampleCorpora.ConventionData2012.get_data().assign(
    Party=lambda df: df.party.apply(lambda x: {'democrat': 'Dem', 'republican': 'GOP'}[x]),
    Parse=lambda df: df.text.progress_apply(nlp)
)

  0%|          | 0/189 [00:00<?, ?it/s]

In [3]:

biber_corpus = st.OffsetCorpusFactory(
    convention_df,
    category_col='Party',
    parsed_col='Parse',
    feat_and_offset_getter=stvl.BiberOffsetGetter()
).build(show_progress=True)

biber_stat_df = st.HedgesG(
    biber_corpus
).use_metadata().set_categories(
    category_name='Dem'
).get_score_df(
).assign(
    Frequency=lambda df: df.count1 + df.count2,
    X=lambda df: df.Frequency,
    Y=lambda df: df.hedges_g,
    Xpos=lambda df: st.Scalers.dense_rank(df.X),
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Y),
    ColorScore=lambda df: df.Ypos,
)

Elen Le Foll and Muhammad Shakir. MFTE Python 1.0. 2023. https://github.com/mshakirDr/MFTE


  0%|          | 0/189 [00:00<?, ?it/s]

In [39]:
plot_df = pd.merge(
    biber_stat_df,
    stvl.get_biber_feature_df(),
    left_index=True,
    right_index=True
).reset_index().rename(columns={'index': 'term'}).set_index('term')

In [40]:
plot_df

Unnamed: 0_level_0,cohens_d,cohens_d_se,cohens_d_z,cohens_d_p,hedges_g,hedges_g_se,hedges_g_z,hedges_g_p,m1,m2,...,X,Y,Xpos,Ypos,ColorScore,Category,Feature,Examples,Operationalization,NormalizationUnit
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VPRT,0.046555,0.145502,0.319957,0.374500,0.046370,0.151662,0.305743,0.379900,0.036398,0.035930,...,9468,0.046370,0.979730,0.551099,0.551099,Verb features,Present tense,It’s ours. Who doesn’t love it? I know.,Subsumes the VBP (present tense other than thi...,Finite verbs
COMM,-0.046474,0.145502,-0.319406,0.625291,-0.046290,0.151662,-0.305216,0.619899,0.007153,0.007326,...,1945,-0.046290,0.777027,0.448989,0.448989,Verb semantics,Communication verbs,Describe it to your partner and say why. Write...,"Following Biber (2006: 247, based on the LGSWE...",Finite verbs
POLITE,-0.092687,0.145561,-0.636758,0.737859,-0.092319,0.151773,-0.608269,0.728495,0.001680,0.001940,...,377,-0.092319,0.432432,0.398266,0.398266,Stance-taking devices,Politeness markers,"Can you open the window, please? Would you min...","Assigned to all occurrences of thanks, thank y...",Words
PP2,-0.147069,0.145679,-1.009537,0.843641,-0.146484,0.151998,-0.963723,0.832408,0.006259,0.007055,...,1875,-0.146484,0.763514,0.338576,0.338576,Pronouns,Reference to addressee(s),"If your model was good enough, you’d be able t...","Following Biber (1988), all occurrences of you...",Finite verbs
EMPH,0.057484,0.145513,0.395042,0.346406,0.057255,0.151681,0.377470,0.352912,0.003212,0.002780,...,701,0.057255,0.621622,0.563095,0.563095,Stance-taking devices,Emphatics,I do wish I hadn't drunk quite so much. Oh rea...,"Originally based on Biber (1988), assigned to ...",Words
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ThNLIK,0.025878,0.145489,0.177866,0.429414,0.025775,0.151636,0.169978,0.432514,0.000015,0.000011,...,3,0.025775,0.013514,0.528403,0.528403,Syntax,that subordinate clauses (relative clauses) pr...,It was a feeling that I never had until that n...,That relative clause (THRC) tag preceded by an...,Nouns
QUTAG,-0.239293,0.146002,-1.638967,0.949390,-0.238342,0.152612,-1.561752,0.940827,0.000016,0.000064,...,8,-0.238342,0.040541,0.237350,0.237350,Discourse organisation,Question tags,"Do they? Were you? It’s just it’s repetitive, ...",Assigned to question marks preceded by (1) inn...,Finite verbs
ToJCRTN,-0.326109,0.146446,-2.226813,0.987020,-0.324813,0.153454,-2.116681,0.982857,0.000000,0.000029,...,3,-0.324813,0.013514,0.142060,0.142060,Syntax,to clauses preceded by certainty adjectives,I always make sure to have a big bottle of wat...,To immediately followed by a verb and precede...,Words
ThJLIK,-0.211455,0.145889,-1.449427,0.926391,-0.210615,0.152396,-1.382021,0.916517,0.000000,0.000005,...,1,-0.210615,0.000000,0.267905,0.267905,Syntax,that subordinate clauses (other than relatives...,This makes it unlikely that such a fund will b...,That complement clause (THSC) tag preceded by ...,Finite verbs


In [25]:

html = st.dataframe_scattertext(
    biber_corpus,
    plot_df=plot_df,
    category='Dem',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    suppress_text_column='Display',
    metadata=lambda c: c.get_df()['speaker'],
    use_non_text_features=True,
    ignore_categories=False,
    use_offsets=True,
    unified_context=False,
    horizontal_line_y_position=0,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    y_label='Hedges G',
    x_label='Frequency Ranks',
    y_axis_labels=[f'More Rep: g=-{plot_df.hedges_g.abs().max():.3f}',
                   '0',
                   f'More Dem: g={plot_df.hedges_g.abs().max():.3f}'],
    tooltip_columns=['Frequency', 'hedges_g'],
    term_description_columns=['Feature', 'Category', 'Examples',
                              'hedges_g', 'hedges_g_p', 'Frequency', 'Operationalization'],
    term_description_column_names={'hedges_g': "Hedge's g",
                                   'hedges_g_p': "Hedge's g p-value"},
    header_names={'upper': 'Top Democratic', 'lower': 'Top Republican'},
    term_word_in_term_description='Biber Tag',
)

fn = 'demo_biber.html'
with open(fn, 'w') as of:
    of.write(html)
print(f'run open ./{fn}')


There are metadata in the corpus which are not in the index of plot_df. These will not be available in the visualization. These are: ['H', '.', 'XX', 'OMP', 'P', ',', 'COMP', ':', '``', 'ToThNSTNCall', 'PH', '-LRB-', '_SP', 'NFP', "''", 'PP1all', 'PP3t', 'YPH', 'AFX', 'MP', '$', 'HYPH', 'PP3all'].s
run open ./demo_biber.html


In [26]:
biber_category_corpus = biber_corpus.rename_metadata(
    stvl.get_biber_feature_df().reset_index()[['index','Category']].values
)

In [41]:
biber_category_stat_df = st.CohensD(biber_category_corpus).use_metadata().set_categories(
    'Dem', ['GOP']
).get_score_df(
).assign(
    Frequency = lambda df: df.count1+df.count2,
    X=lambda df: df.Frequency,
    Y=lambda df: df.hedges_g,
    Xpos=lambda df: st.Scalers.dense_rank(df.X),
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Y),
    ColorScore=lambda df: df.Ypos,
    Features = stvl.get_biber_feature_df().groupby('Category').apply(
        lambda gdf: ', '.join(gdf.Feature)
    )
)
plot_df = pd.merge(
    biber_category_stat_df,
    stvl.get_biber_feature_df().set_index('Category'),
    left_index=True,
    right_index=True
).reset_index().rename(columns={'index': 'term'}).set_index('term')

In [42]:
plot_df

['Adjective semantics', 'Adjectives', 'Adverb semantics', 'Adverbials', 'Adverbs', 'Determinatives', 'Discourse organisation', 'Lexis', 'Negation', 'Noun semantics', 'Nouns', 'Prepositions', 'Pronouns', 'Stance-taking devices', 'Stative forms', 'Syntax', 'Verb features', 'Verb semantics']


Unnamed: 0_level_0,cohens_d,cohens_d_se,cohens_d_z,cohens_d_p,hedges_g,hedges_g_se,hedges_g_z,hedges_g_p,m1,m2,...,X,Y,Xpos,Ypos,ColorScore,Features,Feature,Examples,Operationalization,NormalizationUnit
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adjective semantics,-0.140232,0.145661,-0.962728,0.832158,-0.139675,0.151964,-0.919132,0.820987,0.008229,0.008833,...,1977,-0.139675,0.176471,0.305760,0.305760,Attitudinal adjectives without a that clause a...,Attitudinal adjectives without a that clause a...,I was surprised when it arrived in a plastic w...,Any of the following words tagged as adjective...,Nouns
Adjective semantics,-0.140232,0.145661,-0.962728,0.832158,-0.139675,0.151964,-0.919132,0.820987,0.008229,0.008833,...,1977,-0.139675,0.176471,0.305760,0.305760,Attitudinal adjectives without a that clause a...,Adjectives related to color,"She had luxuriously long black hair, a sprinkl...",Any of the following words tagged as adjective...,Nouns
Adjective semantics,-0.140232,0.145661,-0.962728,0.832158,-0.139675,0.151964,-0.919132,0.820987,0.008229,0.008833,...,1977,-0.139675,0.176471,0.305760,0.305760,Attitudinal adjectives without a that clause a...,Epistemic adjectives without a that clause after,Tell me the true story then. Possible but unli...,Any of the following words tagged as adjective...,Nouns
Adjective semantics,-0.140232,0.145661,-0.962728,0.832158,-0.139675,0.151964,-0.919132,0.820987,0.008229,0.008833,...,1977,-0.139675,0.176471,0.305760,0.305760,Attitudinal adjectives without a that clause a...,Evaluative adjectives,"She was fine when she left them, she looked gr...",Any of the following words tagged as adjective...,Nouns
Adjective semantics,-0.140232,0.145661,-0.962728,0.832158,-0.139675,0.151964,-0.919132,0.820987,0.008229,0.008833,...,1977,-0.139675,0.176471,0.305760,0.305760,Attitudinal adjectives without a that clause a...,Relational adjectives,They were very different people. Her original ...,Any of the following words tagged as adjective...,Nouns
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Verb semantics,0.016570,0.145485,0.113893,0.454661,0.016504,0.151629,0.108844,0.456663,0.073426,0.073144,...,17571,0.016504,0.764706,0.522951,0.522951,"DO auxiliary, Necessity modals, Modal can, Mod...",factive verbs in other contexts,We're going to find her. It was a minute or tw...,Any word in the following list tagged as verb ...,Finite verbs
Verb semantics,0.016570,0.145485,0.113893,0.454661,0.016504,0.151629,0.108844,0.456663,0.073426,0.073144,...,17571,0.016504,0.764706,0.522951,0.522951,"DO auxiliary, Necessity modals, Modal can, Mod...",likelihood verbs in other contexts,"The joke was on him, we presume. To be able to...",Any word in the following list tagged as verb ...,Finite verbs
Verb semantics,0.016570,0.145485,0.113893,0.454661,0.016504,0.151629,0.108844,0.456663,0.073426,0.073144,...,17571,0.016504,0.764706,0.522951,0.522951,"DO auxiliary, Necessity modals, Modal can, Mod...",All modals of possibility,Can I give him a hint? You cannot. May I have ...,"Any word tagged as MDCA, MDCO, MDMM.",Finite verbs
Verb semantics,0.016570,0.145485,0.113893,0.454661,0.016504,0.151629,0.108844,0.456663,0.073426,0.073144,...,17571,0.016504,0.764706,0.522951,0.522951,"DO auxiliary, Necessity modals, Modal can, Mod...",All modals of prediction,It won’t do. Shall we see? Wouldn't you like t...,"Any word tagged as MDWS, MDWO, GTO.",Finite verbs


In [43]:

html = st.dataframe_scattertext(
    biber_category_corpus,
    plot_df=plot_df,
    category='Dem',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    suppress_text_column='Display',
    metadata=lambda c: c.get_df()['speaker'],
    use_non_text_features=True,
    ignore_categories=False,
    use_offsets=True,
    unified_context=False,
    horizontal_line_y_position=0,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    y_label='Hedges G',
    x_label='Frequency Ranks',
    y_axis_labels=[f'More Rep: g=-{plot_df.hedges_g.abs().max():.3f}',
                   '0',
                   f'More Dem: g={plot_df.hedges_g.abs().max():.3f}'],
    tooltip_columns=['Frequency', 'hedges_g'],
    term_description_columns=['Feature', 'Examples',
                              'hedges_g', 'hedges_g_p', 'Frequency', 'Operationalization'],
    term_description_column_names={'hedges_g': "Hedge's g",
                                   'hedges_g_p': "Hedge's g p-value"},
    header_names={'upper': 'Top Democratic', 'lower': 'Top Republican'},
    term_word_in_term_description='Biber Category',
)

fn = 'demo_biber_category.html'
with open(fn, 'w') as of:
    of.write(html)
print(f'run open ./{fn}')


ValueError: cannot reindex on an axis with duplicate labels

In [10]:
arglex_corpus = st.OffsetCorpusFactory(
    convention_df,
    category_col='Party',
    parsed_col='Parse',
    feat_and_offset_getter=stvl.get_arglex_offset_getter()
).build(show_progress=True)


  0%|          | 0/189 [00:00<?, ?it/s]

In [15]:
plot_df = st.HedgesG(
    arglex_corpus
).use_metadata().set_categories(
    category_name='Dem'
).get_score_df(
).assign(
    Frequency=lambda df: df.count1 + df.count2,
    X=lambda df: df.Frequency,
    Y=lambda df: df.hedges_g,
    Xpos=lambda df: st.Scalers.dense_rank(df.X),
    Ypos=lambda df: st.Scalers.scale_center_zero_abs(df.Y),
    ColorScore=lambda df: df.Ypos,
)

plot_df

Unnamed: 0_level_0,cohens_d,cohens_d_se,cohens_d_z,cohens_d_p,hedges_g,hedges_g_se,hedges_g_z,hedges_g_p,m1,m2,count1,count2,docs1,docs2,Frequency,X,Y,Xpos,Ypos,ColorScore
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
assessments,0.216977,0.14591,1.487063,0.068499,0.216115,0.152437,1.417736,0.078134,0.004899,0.001208,13,3,9,3,16,16,0.216115,0.214286,0.719432,0.719432
causation,-0.031535,0.145492,-0.216745,0.585797,-0.031409,0.151641,-0.207129,0.582046,0.493636,0.499779,1033,793,120,64,1826,1826,-0.031409,1.0,0.468109,0.468109
conditionals,0.066958,0.145523,0.460121,0.322715,0.066692,0.151702,0.439627,0.330103,0.013423,0.011327,29,26,23,17,55,55,0.066692,0.357143,0.567716,0.567716
difficulty,-0.04107,0.145498,-0.282272,0.611132,-0.040907,0.151653,-0.269739,0.606319,0.003784,0.004684,5,11,5,8,16,16,-0.040907,0.214286,0.458465,0.458465
emphasis,0.494406,0.147689,3.34763,0.000408,0.492442,0.155798,3.160776,0.000787,0.078903,0.030163,163,65,72,28,228,228,0.492442,0.785714,1.0,1.0
inconsistency,0.030399,0.145491,0.208941,0.417247,0.030278,0.15164,0.199672,0.420869,0.026857,0.025513,77,60,41,30,137,137,0.030278,0.5,0.530743,0.530743
necessity,-0.065892,0.145522,-0.4528,0.674654,-0.065631,0.151699,-0.432636,0.66736,0.12042,0.129556,309,219,89,47,528,528,-0.065631,0.928571,0.433362,0.433362
possibility,-0.326316,0.146448,-2.228212,0.987067,-0.32502,0.153456,-2.117996,0.982912,0.057513,0.082285,149,159,63,46,308,308,-0.32502,0.857143,0.169992,0.169992
priority,-0.128791,0.145633,-0.884348,0.811746,-0.128279,0.151911,-0.844434,0.800787,0.055955,0.066609,112,101,63,39,213,213,-0.128279,0.642857,0.369752,0.369752
structure,0.201623,0.145852,1.382386,0.083427,0.200822,0.152326,1.31837,0.09369,0.083742,0.059669,143,84,67,40,227,227,0.200822,0.714286,0.703905,0.703905


In [20]:

html = st.dataframe_scattertext(
    arglex_corpus,
    plot_df=plot_df,
    category='Dem',
    category_name='Democratic',
    not_category_name='Republican',
    width_in_pixels=1000,
    suppress_text_column='Display',
    metadata=lambda c: c.get_df()['speaker'],
    use_non_text_features=True,
    ignore_categories=False,
    use_offsets=True,
    unified_context=False,
    horizontal_line_y_position=0,
    color_score_column='ColorScore',
    left_list_column='ColorScore',
    y_label='Hedges G',
    x_label='Frequency Ranks',
    y_axis_labels=[f'More Rep: g=-{plot_df.hedges_g.abs().max():.3f}',
                   '0',
                   f'More Dem: g={plot_df.hedges_g.abs().max():.3f}'],
    tooltip_columns=['Frequency', 'hedges_g'],
    term_description_columns=['hedges_g', 'hedges_g_p', 'Frequency'],
    term_description_column_names={'hedges_g': "Hedge's g",
                                   'hedges_g_p': "Hedge's g p-value"},
    header_names={'upper': 'Top Democratic', 'lower': 'Top Republican'},
    term_word_in_term_description='Arglex Tag',
)

fn = 'demo_arglex.html'
with open(fn, 'w') as of:
    of.write(html)
print(f'run open ./{fn}')


run open ./demo_arglex.html


In [21]:
!open ./demo_arglex.html