In [1]:
import pandas as pd
import numpy as np
import hdbscan
from bertopic import BERTopic

2021-08-04 14:54:47.779605: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Test 1

In [3]:
def get_topic_documents(cluster_id, condensed_tree):
        result_points = np.array([])
        result_points_val = np.array([])
        
        #assert cluster_id > -1, "The topic's label should be greater than -1!"
        
        if cluster_id <= -1:
            return result_points.astype(np.int64), result_points_val.astype(np.float64)
            
        raw_tree = condensed_tree._raw_tree
        
        # Just the cluster elements of the tree, excluding singleton points
        cluster_tree = raw_tree[raw_tree['child_size'] > 1]
        
        # Get the leaf cluster nodes under the cluster we are considering
        leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
        
        # Now collect up the last remaining points of each leaf cluster (the heart of the leaf) 
        for leaf in leaves:
            #max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
            #points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
            #points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
            points = raw_tree['child'][(raw_tree['parent'] == leaf)]
            points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf)]
            result_points = np.hstack((result_points, points))
            result_points_val = np.hstack((result_points_val, points_val))   
        return result_points.astype(np.int64), result_points_val.astype(np.float64)

def generate_topic_documents(bert_model):
    clusterer = bert_model.hdbscan_model
    tree = clusterer.condensed_tree_
    clusters = tree._select_clusters()

    number_of_topics = len(clusters)

    relevant_columns = ['topic', 'document', 'lambda_val']
    df_rel_docs = pd.DataFrame(columns=relevant_columns)

    for i in range(0, number_of_topics):
        rel_docs, lambda_vals = get_topic_documents(clusters[i], tree)
        topic_name = bert_model.topic_names[i]
        for j in range(0, len(rel_docs)):
            new_doc_rel = {}
            new_doc_rel['topic'] = topic_name
            new_doc_rel['document'] = rel_docs[j]
            new_doc_rel['lambda_val'] = round(lambda_vals[j],6)
            df_rel_docs = df_rel_docs.append(new_doc_rel, ignore_index=True)
    del bert_model
    return df_rel_docs

In [3]:
def generate_topic_documents_hdbscan(bert_model):
        clusterer = bert_model.hdbscan_model

        doc_topic_columns = ['document', 'topic', 'probabilities']
        df_doc_topic = pd.DataFrame(columns=doc_topic_columns)

        for i, _ in enumerate(clusterer.labels_):
            new_doc_topic = {}
            new_doc_topic['document'] = i
            new_doc_topic['topic'] = clusterer.labels_[i]
            new_doc_topic['probabilities'] = clusterer.probabilities_[i]
            df_doc_topic = df_doc_topic.append(new_doc_topic, ignore_index=True)
        del bert_model
        return df_doc_topic

In [4]:
topic_model = BERTopic.load('../raw_data/proj_final/50_docs_per_topic/2016_12_BERTopic_model_2_2_raw_content')

In [5]:
df = generate_topic_documents_hdbscan(topic_model)

In [6]:
df.groupby(by=['topic']).count()

Unnamed: 0_level_0,document,probabilities
topic,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.0,1374,1374
0.0,127,127
1.0,118,118
2.0,173,173
3.0,123,123
4.0,74,74
5.0,60,60
6.0,118,118
7.0,73,73
8.0,125,125


In [67]:
df1 = generate_topic_documents(topic_model)

In [69]:
#df1.groupby(by=['topic']).count()
df1.shape

Unnamed: 0,topic,document,lambda_val
0,0_of the_in the_to the_and the,1172,1.458804
1,0_of the_in the_to the_and the,3495,1.568285
2,0_of the_in the_to the_and the,1418,1.602545
3,0_of the_in the_to the_and the,4008,1.605422
4,0_of the_in the_to the_and the,1314,1.640275
...,...,...,...
3343,20_the company_san francisco_the car_the model,889,2.242479
3344,20_the company_san francisco_the car_the model,536,2.242479
3345,20_the company_san francisco_the car_the model,4315,2.242479
3346,20_the company_san francisco_the car_the model,3265,2.242479


In [9]:
#topic_model = BERTopic.load('../raw_data/proj_final/50_docs_per_topic/2016_12_BERTopic_model_2_2_raw_content')
#topic_model = BERTopic.load('../raw_data/proj_final/50_docs_per_topic/2016_9_BERTopic_model_2_2_raw_content')
clusterer = topic_model.hdbscan_model
tree = clusterer.condensed_tree_
clusters = tree._select_clusters()

print(topic_model.topic_names[-1])
print(len(clusterer.labels_[clusterer.labels_==-1]))
print(topic_model.topic_names[0])
print(len(clusterer.labels_[clusterer.labels_==0]))
print(topic_model.topic_names[1])
print(len(clusterer.labels_[clusterer.labels_==1]))

-1_in the_to the_on the_to be
1374
0_of the_in the_to the_and the
127
1_the russian_the cia_the russians_that russia
118


In [10]:
len(clusterer.labels_)

5876

In [143]:
np.unique(clusterer.labels_)

array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20])

In [148]:
clusterer

bool

In [139]:
treexxxxx = clusterer.condensed_tree_.to_pandas()
cluster_treexxxx = treexxxxx[treexxxxx.child_size > 1]

In [144]:
cluster_treexxxx

Unnamed: 0,parent,child,lambda_val,child_size
105,5876,5877,0.658587,127
106,5876,5878,0.658587,5644
124,5878,5879,0.761768,118
125,5878,5880,0.761768,5518
458,5880,5881,0.910968,173
459,5880,5882,0.910968,5249
507,5882,5883,0.919004,123
508,5882,5884,0.919004,5103
559,5884,5885,0.936853,74
560,5884,5886,0.936853,5013


In [12]:
print(len(np.unique(clusterer.labels_)))
print(len(clusters))
print(topic_model.topic_names[1])

22
21
1_the russian_the cia_the russians_that russia


In [21]:
np.unique(clusters)

array([5877., 5879., 5881., 5883., 5885., 5887., 5889., 5891., 5894.,
       5895., 5899., 5901., 5903., 5906., 5907., 5909., 5910., 5911.,
       5915., 5917., 5918.])

In [129]:
df_dataset = pd.read_csv('../raw_data/proj_final/50_docs_per_topic/2016_12_dataset.csv')
print(len(clusterer.labels_))
print(df_dataset.shape)

5876
(5876, 7)


In [137]:
clusterer.prediction_data_

<hdbscan.prediction.PredictionData at 0x7f7593709790>

In [130]:
df[df['topic']==13]

Unnamed: 0,document,topic,probabilities
133,133.0,13.0,0.750616
145,145.0,13.0,0.991617
151,151.0,13.0,1.000000
159,159.0,13.0,0.917316
160,160.0,13.0,1.000000
...,...,...,...
5674,5674.0,13.0,1.000000
5732,5732.0,13.0,0.809462
5737,5737.0,13.0,0.612206
5809,5809.0,13.0,0.348455


In [158]:
print(clusterer.labels_[0])
topic_model.topic_names

10


{-1: '-1_in the_to the_on the_to be',
 0: '0_of the_in the_to the_and the',
 1: '1_the russian_the cia_the russians_that russia',
 2: '2_in the_the attack_islamic state_he was',
 3: '3_security council_the united_the un_the palestinians',
 4: '4_the syrian_in aleppo_of aleppo_in syria',
 5: '5_the trump_mr trump_conflicts of_trump organization',
 6: '6_the fed_the economy_interest rates_the dollar',
 7: '7_planned parenthood_she was_women who_that she',
 8: '8_climate change_the epa_energy department_transition team',
 9: '9_the united_united states_to mexico_donald trump',
 10: '10_the european_prime minister_european union_the eu',
 11: '11_one china_the chinese_of taiwan_with taiwan',
 12: '12_air force_china sea_the plane_force one',
 13: '13_the team_the nfl_the giants_week 15',
 14: '14_the military_secretary of_general mattis_donald trump',
 15: '15_secretary of_rex tillerson_exxon mobil_tillerson has',
 16: '16_affordable care_care act_health insurance_health care',
 17: '17_th

In [169]:
df_dataset[df_dataset['topic']==8]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,year,month,content,topic,doc_id
19,19,81463,20599,"Jo Cox, Member of British Parliament, Is Kille...",2016.0,12.0,"LEEDS, England — A member of Parliament was...",8,19
32,32,81477,20676,Britain’s Dreams of a ‘Swiss Miracle’ Look Mor...,2016.0,12.0,To help explain why the British voted to leave...,8,32
42,42,81490,20754,A Blunt Message After ‘Brexit’: Bolting Will C...,2016.0,12.0,BRUSSELS — Shaken by Britain’s vote to quit...,8,42
43,43,81491,20755,"Having Won, Boris Johnson and ‘Brexit’ Leaders...",2016.0,12.0,LONDON — With their giddy celebrations of “...,8,43
91,91,81548,22635,"Italy’s Premier, Matteo Renzi, Says He’ll Resi...",2016.0,12.0,ROME — Italy plunged into political and eco...,8,91
...,...,...,...,...,...,...,...,...,...
5405,5405,88982,217368,"Using similar tactics, Austrian nationalists h...",2016.0,12.0,VIENNA — He is a political disrupter supp...,8,5405
5428,5428,89009,217409,Italian prime minister resigns in populist revolt,2016.0,12.0,BRUSSELS — Europe’s embattled political...,8,5428
5456,5456,89039,217447,"Anti-immigrant, anti-euro populists gain groun...",2016.0,12.0,ROME — Italy’s parties vowed Monday to ...,8,5456
5531,5531,89122,217548,Anti-Islam Dutch politician Geert Wilders foun...,2016.0,12.0,LONDON — A Dutch court found politician...,8,5531


In [166]:
#docs_df = pd.DataFrame(data, columns=["Doc"])
#df_dataset['topic'] = clusterer.labels_
#df_dataset['doc_id'] = range(len(df_dataset['content']))
#docs_per_topic = df_dataset.groupby(['topic'], as_index = False).agg({'content': ' '.join})
#docs_per_topic
df_dataset

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,year,month,content,topic,doc_id
0,0,81444,17283,House Republicans Fret About Winning Their Hea...,2016.0,12.0,WASHINGTON — Congressional Republicans have...,10,0
1,1,81445,17291,"First, a Mixtape. Then a Romance. - The New Yo...",2016.0,12.0,"Just how is Hillary Kerr, the founder of ...",20,1
2,2,81446,17292,Calling on Angels While Enduring the Trials of...,2016.0,12.0,Angels are everywhere in the Muñiz family’s ap...,20,2
3,3,81447,20127,U.S. Plans to Step Up Military Campaign Agains...,2016.0,12.0,"ABU DHABI, United Arab Emirates — The Obama...",-1,3
4,4,81448,20131,272 Slaves Were Sold to Save Georgetown. What ...,2016.0,12.0,WASHINGTON — The human cargo was loaded on ...,20,4
...,...,...,...,...,...,...,...,...,...
5871,5871,89520,218078,An eavesdropping Uber driver saved his 16-year...,2016.0,12.0,Uber driver Keith Avila picked up a p...,20,5871
5872,5872,89521,218079,Plane carrying six people returning from a Cav...,2016.0,12.0,Crews on Friday continued to search L...,1,5872
5873,5873,89522,218080,After helping a fraction of homeowners expecte...,2016.0,12.0,When the Obama administration announced a...,10,5873
5874,5874,89523,218081,"Yes, this is real: Michigan just banned bannin...",2016.0,12.0,This story has been updated. A new law in...,19,5874


In [154]:
topic_model.topic_names[clusterer.labels_[0]]

'9_the united_united states_to mexico_donald trump'

In [53]:
df_dataset['content'].iloc[0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

In [39]:
xx = np.unique(clusterer.labels_)
for x in xx:
    print(topic_model.topic_names[x])

-1_in the_to the_on the_to be
0_of the_in the_to the_and the
1_the russian_the cia_the russians_that russia
2_in the_the attack_islamic state_he was
3_security council_the united_the un_the palestinians
4_the syrian_in aleppo_of aleppo_in syria
5_the trump_mr trump_conflicts of_trump organization
6_the fed_the economy_interest rates_the dollar
7_planned parenthood_she was_women who_that she
8_climate change_the epa_energy department_transition team
9_the united_united states_to mexico_donald trump
10_the european_prime minister_european union_the eu
11_one china_the chinese_of taiwan_with taiwan
12_air force_china sea_the plane_force one
13_the team_the nfl_the giants_week 15
14_the military_secretary of_general mattis_donald trump
15_secretary of_rex tillerson_exxon mobil_tillerson has
16_affordable care_care act_health insurance_health care
17_the pipeline_of the_north dakota_rock sioux
18_the united_united states_white house_according to
19_north carolina_the state_the governor_the 

In [70]:
for i, _ in enumerate(clusterer.labels_):
    print(df_dataset[['title']].iloc[i])

title    House Republicans Fret About Winning Their Hea...
Name: 0, dtype: object
title    First, a Mixtape. Then a Romance. - The New Yo...
Name: 1, dtype: object
title    Calling on Angels While Enduring the Trials of...
Name: 2, dtype: object
title    U.S. Plans to Step Up Military Campaign Agains...
Name: 3, dtype: object
title    272 Slaves Were Sold to Save Georgetown. What ...
Name: 4, dtype: object
title    Get to Know the Historical Figures on the $5, ...
Name: 5, dtype: object
title    Queen Elizabeth’s Party Planner Is Proud to We...
Name: 6, dtype: object
title    Small Rhode Island Suddenly Has Big Role in Pr...
Name: 7, dtype: object
title    Two-Hour Marathon. Can It Be Done? - The New Y...
Name: 8, dtype: object
title    How Kosovo Was Turned Into Fertile Ground for ...
Name: 9, dtype: object
title    Tony Fadell Steps Down Amid Tumult at Nest, a ...
Name: 10, dtype: object
title    Garbiñe Muguruza Upsets Serena Williams in Fre...
Name: 11, dtype: object
title    Panam

title    Beware, iPhone Users: Fake Retail Apps Are Sur...
Name: 104, dtype: object
title    The Rehabilitation of a Coaching Outcast - The...
Name: 105, dtype: object
title    Pat McCrory, North Carolina Governor, Concedes...
Name: 106, dtype: object
title    Bill Cosby’s Testimony on Drugs Can Be Evidenc...
Name: 107, dtype: object
title    The Hunt for Hatchimals, the Elusive Toy of th...
Name: 108, dtype: object
title    Keep Your Family Close and Your Roomba Closer ...
Name: 109, dtype: object
title    Finding a Home in the Bronx After Fleeing Anti...
Name: 110, dtype: object
title    Bob Dole Worked Behind the Scenes on Trump-Tai...
Name: 111, dtype: object
title    As Fake News Spreads Lies, More Readers Shrug ...
Name: 112, dtype: object
title    Donald Trump, Iran, Joe McKnight: Your Tuesday...
Name: 113, dtype: object
title    Cold Hands May Signal Raynaud’s Phenomenon - T...
Name: 114, dtype: object
title    Saudis Bankroll Taliban, Even as King Official...
Name: 115, dtype:

title    Hillary Clinton’s Team to Join Wisconsin Recou...
Name: 243, dtype: object
title    U.S. Seeks Death for Charleston Shooting Suspe...
Name: 244, dtype: object
title    Officials to Close Standing Rock Protest Camps...
Name: 245, dtype: object
title    More Than Coffee: New York’s Vanishing Diner C...
Name: 246, dtype: object
title    Two Black Women Embrace Their Chance to Be Hoc...
Name: 247, dtype: object
title    Ethics Office Praises Donald Trump for a Move ...
Name: 248, dtype: object
title    Why the Trump Team’s Economic Promises Will Be...
Name: 249, dtype: object
title    Colombia’s Congress Approves Peace Accord With...
Name: 250, dtype: object
title    A Dose of a Hallucinogen From a ‘Magic Mushroo...
Name: 251, dtype: object
title    Italian Monastery Seeks Salvation in Beer Afte...
Name: 252, dtype: object
title    Donald Trump, François Hollande, Tiger Woods: ...
Name: 253, dtype: object
title    Howard Schultz to Step Down as Starbucks Chief...
Name: 254, dtype:

title    Internal Inquiry Sealed the Fate of Roger Aile...
Name: 348, dtype: object
title    Scaling Up a Drug Trade, Straight Through ISIS...
Name: 349, dtype: object
title    Young Rural Women in India Chase Big-City Drea...
Name: 350, dtype: object
title    Where Even Nightmares Are Classified: Psychiat...
Name: 351, dtype: object
title    Angela Merkel to Seek 4th Term as Germany’s Le...
Name: 352, dtype: object
title    Trump Has Options for Undoing Obama’s Climate ...
Name: 353, dtype: object
title    In Havana, Castro’s Death Lays Bare a Generati...
Name: 354, dtype: object
title    At Home With the Macabre - The New York Times
Name: 355, dtype: object
title    Fidel Castro, Cuban Revolutionary Who Defied U...
Name: 356, dtype: object
title    Under Fidel Castro, Sport Symbolized Cuba’s St...
Name: 357, dtype: object
title    Trump Claims, With No Evidence, That ‘Millions...
Name: 358, dtype: object
title    Delta Air Lines Bans Disruptive Donald Trump S...
Name: 359, dtype: obj

title    Zumwalt: Obama’s Russian Hacking Shell Game - ...
Name: 459, dtype: object
title    49ers Give Kaepernick Prestigious Award for ‘I...
Name: 460, dtype: object
title    California Licenses 800K Illegal Aliens as Mot...
Name: 461, dtype: object
title    Washington Post’s Fake News of Russian Vermont...
Name: 462, dtype: object
title    Obama, Democratic Leaders to Plan Last Ditch E...
Name: 463, dtype: object
title    Trump: Putin ’Very Smart’ to Not Retaliate ove...
Name: 464, dtype: object
title    Trump’s Inaugural Parade Lineup: Border Patrol...
Name: 465, dtype: object
title    Brooks: Trump ’Siding With a Foreign Leader Ag...
Name: 466, dtype: object
title    Drudge, Downed by Cyberattack, Suspects Govern...
Name: 467, dtype: object
title    Kellogg’s ‘Values’? Corn Flakes Invented to Cu...
Name: 468, dtype: object
title    Two Texas Guardsmen Killed in Helicopter Crash
Name: 469, dtype: object
title    Portrait of Ronald Reagan Defaced During Break...
Name: 470, dtype: ob

title    Sen. Tom Cotton: Obama Set up Israel with Anti...
Name: 610, dtype: object
title    Glenn Beck and Samantha Bee Bond Over Their Fe...
Name: 611, dtype: object
title    Students Demand ’Sanctuary Campus’ for All Mig...
Name: 612, dtype: object
title    Outgoing Vice President Joe Biden: Trump’s Vic...
Name: 613, dtype: object
title    Belief in God Drops Ten Points Behind Atheism ...
Name: 614, dtype: object
title    Venezuela Denies Attorney Right to Visit U.S. ...
Name: 615, dtype: object
title    L.A. Proposal to Rename Freeway After Barack O...
Name: 616, dtype: object
title    Sex Shop ’Dildo Nativity Scene’ Sparks Anger
Name: 617, dtype: object
title    Senator Rand Paul’s 2016 Festivus Twitter Stor...
Name: 618, dtype: object
title    Hamas, Islamic Jihad Terror Factions Praise UN...
Name: 619, dtype: object
title    Dallas PD Loses 99 Officers in 75 Days
Name: 620, dtype: object
title    Donald Trump Shares ‘Very Nice’ Christmas Lett...
Name: 621, dtype: object
title   

title    Nancy Pelosi: I Don’t Think Democrats ’Want A ...
Name: 781, dtype: object
title    Frank Gaffney: Leaders like Merkel Still ‘Scra...
Name: 782, dtype: object
title    Colorado Judge Rules Against ’Faithless’ Elect...
Name: 783, dtype: object
title    Oklahoma Newspaper ’Hasn’t Been Forgiven’ Sinc...
Name: 784, dtype: object
title    FULL TEXT: ’A Merry MILO Christmas’ at Minneso...
Name: 785, dtype: object
title    Reid: Russian Involvement In Election ’A Hangi...
Name: 786, dtype: object
title    Sally Field: ’I Just Don’t Understand’ How Tru...
Name: 787, dtype: object
title    Tower Summit: Trump to Meet Silicon Valley Tec...
Name: 788, dtype: object
title    Jeffrey Wright: Trump Most ’Vulgar, Misogynist...
Name: 789, dtype: object
title    Five Facts About Donald Trump’s Expected Pick ...
Name: 790, dtype: object
title    Mark Steyn Backs #DumpKelloggs: Kellogg’s ‘Rea...
Name: 791, dtype: object
title    NYT: ’Public Safety’ At Risk if Donald Trump S...
Name: 792, dtype:

title    DELINGPOLE: ’Moronic, Self-Righteous, Disgusti...
Name: 903, dtype: object
title    Catholic Scholar Attacked for Catholic Views a...
Name: 904, dtype: object
title    MILO Calls Out Conservative-Bullying College P...
Name: 905, dtype: object
title    Politico: ’The Death of Clintonism’ - Breitbart
Name: 906, dtype: object
title    Sam Stein: ’I Don’t Think Fake News Contribute...
Name: 907, dtype: object
title    White Christmas in California as Mountain Snow...
Name: 908, dtype: object
title    Kellogg Foundation’s Director of Racial Equity...
Name: 909, dtype: object
title    Macy’s Halts Donations to Planned Parenthood: ...
Name: 910, dtype: object
title    FL Taxpayers Paid Pitbull $1M to Promote Touri...
Name: 911, dtype: object
title    Battle Brewing for Soul of American Economy: T...
Name: 912, dtype: object
title    Trump: ’I’m Very, Very Strong on Israel’ - Bre...
Name: 913, dtype: object
title    White House Blames Netanyahu for Anti-Israel U...
Name: 914, dtype: o

title    Trump on the ’Blue Wall’: ’We Shattered That S...
Name: 1007, dtype: object
title    Oakland Fire Leader: ’Love Child of Manson, Po...
Name: 1008, dtype: object
title    Dr. Sebastian Gorka on Trump’s Cabinet: ‘After...
Name: 1009, dtype: object
title    House Passes Gold Medal For WWII OSS Operators...
Name: 1010, dtype: object
title    Christmas Eve Stabbing Raises San Jose Murders...
Name: 1011, dtype: object
title    LA County Considers $1 Million in Taxpayer Fun...
Name: 1012, dtype: object
title    Buchanan: McCain, Graham and Rubio’s Tillerson...
Name: 1013, dtype: object
title    Krauthammer: Liberals Always Blame ’Isms’ Of O...
Name: 1014, dtype: object
title    DNC Chair Candidate Tom Perez Had His Own Clin...
Name: 1015, dtype: object
title    Candid Nails: Mets, Phils Great Lenny Dykstra ...
Name: 1016, dtype: object
title    Gene Simmons: Celebs Should ’Shut Their Pie Ho...
Name: 1017, dtype: object
title    German Media Fail to Report Girl Raped and Mur...
Name: 

Name: 1121, dtype: object
title    U.S.-Led Coalition Has Killed 50,000 Islamic S...
Name: 1122, dtype: object
title    NYT Gushes Over Churches Offering ’Sanctuary’ ...
Name: 1123, dtype: object
title    ’White Power’ Causes Islamism: Sweden’s New Ex...
Name: 1124, dtype: object
title    It’s Official: Trump Selects Sean Spicer as Wh...
Name: 1125, dtype: object
title    Watch — Wilders After ’Travesty’ Trial: ’I Wil...
Name: 1126, dtype: object
title    Israeli-Arab Pundit: Mosque Loudspeakers Distu...
Name: 1127, dtype: object
title    Schumer: Trump’s Trillion-Dollar Infrastructur...
Name: 1128, dtype: object
title    Walmart Selling Black Lives Matter Shirts with...
Name: 1129, dtype: object
title    Obama’s Advice For Precious Snowflakes: ‘Don’t...
Name: 1130, dtype: object
title    LGBT Magazine Adds Disclaimer After MILO, Mike...
Name: 1131, dtype: object
title    Flashback — Weekly Standard: Data Shows Politi...
Name: 1132, dtype: object
title    Seven Artists, Dissidents, Jou

title    Draining the Swamp: Ted Cruz and Ron DeSantis ...
Name: 1266, dtype: object
title    Netanyahu Lights Menorah in ’Illegal’ Jewish Q...
Name: 1267, dtype: object
title    Obama Tells CNN His Greatest Regret Was Failur...
Name: 1268, dtype: object
title    John Bolton: Normal People Don’t Become ’Lone ...
Name: 1269, dtype: object
title    T.J. Miller Mocks Donald Trump During 2016 Cri...
Name: 1270, dtype: object
title    Police Chief Blames Merkel Policy for Death of...
Name: 1271, dtype: object
title    7 Terror Attacks and Plots Foiled This Christm...
Name: 1272, dtype: object
title    Donald Trump Nominates Vincent Viola for Secre...
Name: 1273, dtype: object
title    Daily Mail: Trump Chief Strategist Stephen K. ...
Name: 1274, dtype: object
title    Report: Garth Brooks in Talks to Play Trump In...
Name: 1275, dtype: object
title    Orbán: EU Must Change After ’Christians Murder...
Name: 1276, dtype: object
title    Julian Assange: ’Our Source Is Not the Russian...
Name: 

title    Reich to Trump: Stop acting ’thin-skinned and ...
Name: 1417, dtype: object
title    UN: 60% of world’s hungry in Asia-Pacific
Name: 1418, dtype: object
title    The many incarnations of singer Junaid Jamshed
Name: 1419, dtype: object
title    Trump, Pence promise winter of wholesale change
Name: 1420, dtype: object
title    Mayors ask Trump to reconsider DACA stance
Name: 1421, dtype: object
title    China’s Xi Jinping keeps Iowa close to his heart
Name: 1422, dtype: object
title    Trump taps Iowa Gov. Branstad for China ambass...
Name: 1423, dtype: object
title    Trump describes Election Night jitters to dono...
Name: 1424, dtype: object
title    Dylann Roof trial: Jury seated in Charleston 
Name: 1425, dtype: object
title    Lindsey Graham, Democrats plan probes of hacking
Name: 1426, dtype: object
title    Opioid epidemic is getting worse, says CDC
Name: 1427, dtype: object
title    Petraeus, and the double standard on sex
Name: 1428, dtype: object
title    Thomas tempor

title    Duterte should be impeached for ’mass murder’:...
Name: 1541, dtype: object
title    Jerry Falwell, Jr.: Social views of Trump’s St...
Name: 1542, dtype: object
title    Trump’s business partners include controversia...
Name: 1543, dtype: object
title    Sex doll joke gift causes uproar in Chile
Name: 1544, dtype: object
title    Intel analysis shows Putin approved election h...
Name: 1545, dtype: object
title    How Russian playbook transformed war in Syria
Name: 1546, dtype: object
title    China drone seizure throws down gauntlet to Ob...
Name: 1547, dtype: object
title    Minnesota football players protest suspensions
Name: 1548, dtype: object
title    What Australia should learn from President Dut...
Name: 1549, dtype: object
title    Police-involved shooting settlement reached
Name: 1550, dtype: object
title    Trump says his supporters were ’violent’
Name: 1551, dtype: object
title    Obama all but names Putin as behind hacking
Name: 1552, dtype: object
title    Hedge f

title    Trump spokesman Jason Miller not taking White ...
Name: 1671, dtype: object
title    Typhoon expected to wallop Philippines during ...
Name: 1672, dtype: object
title    Graham: Defund UN after Israeli settlement vote
Name: 1673, dtype: object
title    Trump transition asks for names of those worki...
Name: 1674, dtype: object
title    Feds say Florida man threatened Trump in Faceb...
Name: 1675, dtype: object
title    Boko Haram camp falls to Nigerian troops
Name: 1676, dtype: object
title    Tiger and Trump pair up for round of golf
Name: 1677, dtype: object
title    ’’Santas’ march against South Korea’s impeache...
Name: 1678, dtype: object
title    Malta hijackers surrender after releasing Liby...
Name: 1679, dtype: object
title    How to give in solidarity, not charity, this s...
Name: 1680, dtype: object
title    US executions fall to record lows in 2016
Name: 1681, dtype: object
title    Duped by fake news story, Pakistani minister t...
Name: 1682, dtype: object
title  

title    Donald Trump could erase Obama’s legacy almost...
Name: 1813, dtype: object
title    The euro is diving as Italy’s prime minister r...
Name: 1814, dtype: object
title    Here’s how to manipulate Trump
Name: 1815, dtype: object
title    ’Why is it refreshing to make false statements...
Name: 1816, dtype: object
title    A pensions time bomb spells disaster for the U...
Name: 1817, dtype: object
title    Top Republican says he won’t back Trump’s plan...
Name: 1818, dtype: object
title    NY TIMES PUBLIC EDITOR: Some tweets from our p...
Name: 1819, dtype: object
title    These charts help explain Trump’s massive elec...
Name: 1820, dtype: object
title    Here’s what we know about Trump’s $50 billion ...
Name: 1821, dtype: object
title    Obama delivers passionate defense of his count...
Name: 1822, dtype: object
title    Democrats have no good outcomes if they try to...
Name: 1823, dtype: object
title    Turkey is hosting a new round of peace talks b...
Name: 1824, dtype: object

title    Trump asks Boeing to price out a comparable je...
Name: 1934, dtype: object
title    Trump says US must ’expand its nuclear capabil...
Name: 1935, dtype: object
title    Philippine president Duterte rages at UN human...
Name: 1936, dtype: object
title    ’Total catastrophe’: Experts say Trump’s posit...
Name: 1937, dtype: object
title    Trump names RNC’s Sean Spicer top White House ...
Name: 1938, dtype: object
title    I spent 3 months finding the perfect engagemen...
Name: 1939, dtype: object
title    The UN just put Israel on notice — and the US ...
Name: 1940, dtype: object
title    The ’grudge match’ between Obama and Israel ’f...
Name: 1941, dtype: object
title    Trump on Putin’s criticism of Hillary Clinton ...
Name: 1942, dtype: object
title    TRUMP ON UN VOTE: Things will be different whe...
Name: 1943, dtype: object
title    Democrats scorch Obama over UN vote condemning...
Name: 1944, dtype: object
title    Ex-Trump official’s 2017 wishes: Obama dies of...
Name: 

title    The Atlantic Daily: Following Leaders
Name: 2039, dtype: object
title    So, Why Can’t You Call Taiwan?
Name: 2040, dtype: object
title    Can Trump’s Defenders Stop Stein’s Recounts? 
Name: 2041, dtype: object
title    The Atlantic’s Week in Culture
Name: 2042, dtype: object
title    The Atlantic  Politics & Policy Daily: The One...
Name: 2043, dtype: object
title    Kafka in the Bull City
Name: 2044, dtype: object
title    Lessons From Trump’s ‘Fantastic’ Phone Call to...
Name: 2045, dtype: object
title    Always Shine’s Sobering Female Rivalry
Name: 2046, dtype: object
title    The Playground Where Babies Learn to Talk
Name: 2047, dtype: object
title    Grief’s Front Lines and Plastic Best Friends: ...
Name: 2048, dtype: object
title    How Trump Could Wage a War on Scientific Exper...
Name: 2049, dtype: object
title    The Dark Omen of Those Dow 19,000 Hats
Name: 2050, dtype: object
title    François Hollande’s Legacy
Name: 2051, dtype: object
title    Reading Literature W

title    Democrats Plan Fight to Save Obamacare
Name: 2161, dtype: object
title     More Americans Look to Congressional Democrat...
Name: 2162, dtype: object
title    The Novelty and Nostalgia of La La Land
Name: 2163, dtype: object
title    A Who’s Who of Congress in the Trump Era
Name: 2164, dtype: object
title    Severe Inequality Is Incompatible With the Ame...
Name: 2165, dtype: object
title    Carl’s Jr. Is the Fast-Food Chain of Trump’s A...
Name: 2166, dtype: object
title    García Márquez and Hollywood Hackers: The Week...
Name: 2167, dtype: object
title    A Cure for Post-Election Malaise
Name: 2168, dtype: object
title    Why Didn’t Obama Reveal Intel About Russia’s I...
Name: 2169, dtype: object
title    Trump Is Bringing Progressive Protestants Back...
Name: 2170, dtype: object
title    Should Anti-Trump Evangelicals Leave the Movem...
Name: 2171, dtype: object
title    Finding Wisdom in the Letters of Aging Writers
Name: 2172, dtype: object
title    The Atlantic Daily: P

title    Will Obama Retaliate Against Russia Before He ...
Name: 2268, dtype: object
title    The First Reply to a Trump Tweet Is Prime Medi...
Name: 2269, dtype: object
title    Lingua Quechua in Peru and Keeping Up With Kab...
Name: 2270, dtype: object
title    Busy Times at the World’s Largest Polar Bear P...
Name: 2271, dtype: object
title    Will Populism Kill Your Jetpack?
Name: 2272, dtype: object
title    The Day My Father Lost His Country
Name: 2273, dtype: object
title    A Blueprint to End Mass Incarceration
Name: 2274, dtype: object
title    The Actual War on Christians
Name: 2275, dtype: object
title    Patti Smith and the Mariah Carey â€˜Problemâ€™...
Name: 2276, dtype: object
title    Will the Minimum Wage Debate Ever Be Settled?
Name: 2277, dtype: object
title    Our Favorite 31 Songs of 2016
Name: 2278, dtype: object
title    Newt Gingrich Tries to Explain How Trump Will ...
Name: 2279, dtype: object
title    10 Elections to Watch in 2017
Name: 2280, dtype: object
titl

title    It’s Not About the Economy
Name: 2375, dtype: object
title    ‘What the Russians Did Was Utterly Unprecedented’
Name: 2376, dtype: object
title    Will Trump Break the Special Forces?
Name: 2377, dtype: object
title    How Comedy Became Education’s Best Critique
Name: 2378, dtype: object
title    Explorers Find Passage to Earth’s Dark Age
Name: 2379, dtype: object
title    The Atlantic Daily: Parting Words
Name: 2380, dtype: object
title    The Atlantic  Politics & Policy Daily: Kerry L...
Name: 2381, dtype: object
title    Sneakers Have Always Been Political Shoes 
Name: 2382, dtype: object
title    Has the Internet Killed Curly Quotes?
Name: 2383, dtype: object
title    How Trump Could Slow Medical Progress
Name: 2384, dtype: object
title    Why Some Companies Are Trying to Hire More Peo...
Name: 2385, dtype: object
title    The Year in Election Coverage
Name: 2386, dtype: object
title    How the Justice System Pushes Kids Out of Clas...
Name: 2387, dtype: object
title    Do

title    What did Stein gain from recount flop?
Name: 2515, dtype: object
title    Trump to pick Rep. Zinke, ex-SEAL, as Interior...
Name: 2516, dtype: object
title    Dems so far making few changes to playbook, le...
Name: 2517, dtype: object
title    Debate over source of Clinton email leaks is m...
Name: 2518, dtype: object
title    Obama hits Trump over intel briefings, alleged...
Name: 2519, dtype: object
title    Western Europe cracks down amid fears of holid...
Name: 2520, dtype: object
title    White House petition suggests naming next Navy...
Name: 2521, dtype: object
title    Fans boycott ’Star Wars’ over rumored anti-Tru...
Name: 2522, dtype: object
title    Chelsea Handler slams white female Trump voter...
Name: 2523, dtype: object
title    Fox News Poll:  Majority says Russian hacking ...
Name: 2524, dtype: object
title    Fox News Poll: Majority feels hopeful, yet low...
Name: 2525, dtype: object
title    Fox News Poll:  Trump and the Carrier deal
Name: 2526, dtype: objec

title     Civilians Are Under Fire As They Flee ISIS In...
Name: 2642, dtype: object
title     The US’s Election Surprise Has Sparked A Mad ...
Name: 2643, dtype: object
title     Mexico’s Immigration Program Has A Human Righ...
Name: 2644, dtype: object
title     This 3D Map Shows Exactly Where Trump Won The...
Name: 2645, dtype: object
title     Palantir Has A Well Placed Friend In Trumpland
Name: 2646, dtype: object
title     AT&T; Will Let You Stream Cable TV For $35 A ...
Name: 2647, dtype: object
title     Uber’s Company-Backed Guild Says It Supports ...
Name: 2648, dtype: object
title     Barnes & Noble Says People Stopped Reading Bo...
Name: 2649, dtype: object
title     Chip And Joanna Gaines’ Church Is Firmly Agai...
Name: 2650, dtype: object
title     What Scared Young LGBT People Need To Hear Ri...
Name: 2651, dtype: object
title     Kristen Stewart’s Complicated New Appeal
Name: 2652, dtype: object
title     Twitter Board Member: Twitter Helped Trump Wi...
Name: 2653, dtyp

title     Republicans Suddenly Think The Economy Is Gre...
Name: 2747, dtype: object
title     Is Weed A Medicine? Voters Say Yes But Doctor...
Name: 2748, dtype: object
title     The Future Of Starbucks Is Slower And More Ex...
Name: 2749, dtype: object
title     20 Of The Most Amazing Trans Moments Of 2016
Name: 2750, dtype: object
title     I Never Thought I’d Get Married — But Then I ...
Name: 2751, dtype: object
title     Silicon Valley Engineers Pledge To Never Buil...
Name: 2752, dtype: object
title     Trump Will Probably Be Able To Lift Sanctions...
Name: 2753, dtype: object
title     An Indiana Town’s Entire Police Force Has Qui...
Name: 2754, dtype: object
title     The Torture Lobby Is Excited For The Trump Years
Name: 2755, dtype: object
title     Washington Officers Justified In Fatally Shoo...
Name: 2756, dtype: object
title     Hospital Giant UHS Has Lost $2.4 Billion In V...
Name: 2757, dtype: object
title     Chinese Feminists Call Trump Out For His “Str...
Name: 2758

title    This Is the Moment for an Israeli Victory
Name: 2857, dtype: object
title    Thank You!
Name: 2858, dtype: object
title    Obama’s Belated Response to Russian Aggression
Name: 2859, dtype: object
title    2016: The Year Liberal Ideas Failed
Name: 2860, dtype: object
title    Making Guns Great Again
Name: 2861, dtype: object
title    16 Most Ridiculously PC Moments on College Cam...
Name: 2862, dtype: object
title    The Obama ‘Settlements’ Crisis
Name: 2863, dtype: object
title    ‘Safe Transfer Act’ Would Brand Transcripts of...
Name: 2864, dtype: object
title    Sense vs. Nonsense
Name: 2865, dtype: object
title    Defund the United Nations
Name: 2866, dtype: object
title    Obama’s Late Hit on Israel
Name: 2867, dtype: object
title    Americans Are No Longer on the Move
Name: 2868, dtype: object
title    Thomas Becket and Religious Freedom in the Twe...
Name: 2869, dtype: object
title    GW’s Decision to Ditch U.S. History Exemplifie...
Name: 2870, dtype: object
title    Th

title    After Its Ascendency Was Proclaimed, the Polit...
Name: 2975, dtype: object
title    Don’t Let Trump’s Sideshows Distract You from ...
Name: 2976, dtype: object
title    How President Trump Can Make American Intellig...
Name: 2977, dtype: object
title    In France, a Defeat for Free Speech and the Ri...
Name: 2978, dtype: object
title    To Understand Why the White Working Class Vote...
Name: 2979, dtype: object
title    Did Democrats Learn Anything From Their Attack...
Name: 2980, dtype: object
title    The Party of Workers
Name: 2981, dtype: object
title    A How-to Guide for Rolling Back Obama’s Regula...
Name: 2982, dtype: object
title    Repealing Obamacare: Let’s Get This Done
Name: 2983, dtype: object
title    Damien Chazelle Peddles Gimmicks. Huppert Reig...
Name: 2984, dtype: object
title    Mr. Farage Goes to Washington
Name: 2985, dtype: object
title    Trump’s Transition: So Far, So Good
Name: 2986, dtype: object
title    On Immigration, Andy Puzder Should Put Amer

title    Stop Partisan Cheerleading, and More Christmas...
Name: 3121, dtype: object
title    For We Have Seen His Star
Name: 3122, dtype: object
title    Christmas and Living beyond Fear 
Name: 3123, dtype: object
title    The Underestimated Value of ‘Happy Holidays’
Name: 3124, dtype: object
title    ‘Merry Christmas’ — Say It Loud and Proud, Bec...
Name: 3125, dtype: object
title    Does Anyone Besides Jeff Sessions Defend Today...
Name: 3126, dtype: object
title    Obama’s Betrayal of Israel Is a Black Day for ...
Name: 3127, dtype: object
title    The Return of ‘Street Corner Conservatism’
Name: 3128, dtype: object
title    How Russia Will Shape the 2018 Midterm Elections
Name: 3129, dtype: object
title    Scrap the Iran Nuclear Deal
Name: 3130, dtype: object
title    Trump’s Superman Style of Politics
Name: 3131, dtype: object
title    Why Is the Government Telling Us How to Raise ...
Name: 3132, dtype: object
title    Berlin Truck Massacre Shows the Soundness of T...
Name: 3133,

title    Pfizer is auditioning brokers to sell its head...
Name: 3260, dtype: object
title    Deal reached to complete Aleppo evacuations
Name: 3261, dtype: object
title    The video France doesn’t want you to see
Name: 3262, dtype: object
title    You have to be a multimillionaire to buy a Man...
Name: 3263, dtype: object
title    Team Hillary lands on ‘least influential’ list...
Name: 3264, dtype: object
title    Kris Kringles are forking over cash to take ‘S...
Name: 3265, dtype: object
title    Trump is shirtless Putin’s pawn on ‘SNL’
Name: 3266, dtype: object
title    Billionaires who backed Clinton have grown ric...
Name: 3267, dtype: object
title    Cyndi Lauper’s mom feared her daughter might g...
Name: 3268, dtype: object
title    Ex-mistress ‘shocked’ Petraeus is being consid...
Name: 3269, dtype: object
title    My baby bump is netting me thousands of dollars
Name: 3270, dtype: object
title    Wall St. CEOs have made an insane amount of mo...
Name: 3271, dtype: object
title 

Name: 3372, dtype: object
title    What the Trump administration could mean for d...
Name: 3373, dtype: object
title    Hero cop finds autistic boy after school didn’...
Name: 3374, dtype: object
title    Remembering Broadway’s Dick Latessa
Name: 3375, dtype: object
title    This ESPN/MSNBC power couple has a home perfec...
Name: 3376, dtype: object
title    Obama’s ugly bid to snub voters and tie Trump’...
Name: 3377, dtype: object
title    Twitter gets ‘bounced’ from Trump’s tech titan...
Name: 3378, dtype: object
title    Examining why Yankees hope these young starter...
Name: 3379, dtype: object
title    Here’s why New York is shrinking
Name: 3380, dtype: object
title    Donald Trump makes surprise call to Al Sharpton
Name: 3381, dtype: object
title    Cuomo commutes sentence of Brinks robbery geta...
Name: 3382, dtype: object
title    Cuomo’s campaign to ‘bigfoot’ de Blasio is sta...
Name: 3383, dtype: object
title    Electors are being harassed, threatened in bid...
Name: 3384, d

title    CIA says Russia intervened to help Trump win e...
Name: 3479, dtype: object
title    How next year can be better — and even worse —...
Name: 3480, dtype: object
title    Ryan Kavanaugh is out as Relativity CEO after ...
Name: 3481, dtype: object
title    Carmelo doesn’t sound over it as he and Phil m...
Name: 3482, dtype: object
title    Clive James: ‘Even the most trite Netflix dram...
Name: 3483, dtype: object
title    Whale spotted in New York’s East river thought...
Name: 3484, dtype: object
title    DRC parties reach deal denying third term for ...
Name: 3485, dtype: object
title    Hidden Figures review – black women Nasa boffi...
Name: 3486, dtype: object
title    Paul Pogba caps dramatic fightback to save Man...
Name: 3487, dtype: object
title    Readers’ books of the year 2016
Name: 3488, dtype: object
title    Silence review: the last temptation of Liam Ne...
Name: 3489, dtype: object
title    Do you care about the truth? Help the Guardian...
Name: 3490, dtype: objec

title    From glamour to gunfire: the tourist city of A...
Name: 3600, dtype: object
title    Common ground on Syria unites Russia and Turke...
Name: 3601, dtype: object
title    Slavoj Žižek: ‘We are all basically evil, egot...
Name: 3602, dtype: object
title    Islamic State retakes historic city of Palmyra
Name: 3603, dtype: object
title    ’Save us’: Aleppo civilians plead for help as ...
Name: 3604, dtype: object
title    Climate deniers, conspiracists and one-percent...
Name: 3605, dtype: object
title    Defense secretary Ash Carter meets leaders in ...
Name: 3606, dtype: object
title    Barack Obama’s presidency will be defined by h...
Name: 3607, dtype: object
title    The horrors of 2016 could have been stopped – ...
Name: 3608, dtype: object
title    Heisman Trophy winner Rashaan Salaam found dea...
Name: 3609, dtype: object
title    Greece on collision course with lenders as ESM...
Name: 3610, dtype: object
title    The 50 best podcasts of 2016
Name: 3611, dtype: object
titl

Name: 3701, dtype: object
title    In a world of fake news, real journalism must ...
Name: 3702, dtype: object
title    The child refugees of Calais: ‘The journey is ...
Name: 3703, dtype: object
title    Will you be financially better off than your p...
Name: 3704, dtype: object
title    Trump’s response to recent attacks risks addin...
Name: 3705, dtype: object
title    Hawaii’s new homeless regulations could cut sh...
Name: 3706, dtype: object
title    Iran hails victory in Aleppo as Shia militias ...
Name: 3707, dtype: object
title    Petra Kvitova could return to tennis within si...
Name: 3708, dtype: object
title    Critics say Trump’s call with Taiwan may alter...
Name: 3709, dtype: object
title    Last Tango’s abuse reveals the broken promise ...
Name: 3710, dtype: object
title    Action needed now to prevent South Sudan genoc...
Name: 3711, dtype: object
title    South Park finale: trolls create chaos by ’bri...
Name: 3712, dtype: object
title    Romania’s Social Democrats eas

title    Charity appeal raising over £1m sends powerful...
Name: 3813, dtype: object
title    Barack Obama bans oil and gas drilling in most...
Name: 3814, dtype: object
title    Stop worrying about fake news. What comes next...
Name: 3815, dtype: object
title    Andrew Puzder criticized as ’cruel and bafflin...
Name: 3816, dtype: object
title    Donald Trump accuses China of ’unpresidented’ ...
Name: 3817, dtype: object
title    The return of Sherlock: ‘Being a hero isn’t ab...
Name: 3818, dtype: object
title    Trump’s billionaire cabinet could be the wealt...
Name: 3819, dtype: object
title    Raymond Briggs: ‘There could be another world ...
Name: 3820, dtype: object
title    Labour plays to Jeremy Corbyn’s radicalism in ...
Name: 3821, dtype: object
title    Trump: Boeing should lose Air Force One contra...
Name: 3822, dtype: object
title    ’Performing is a political statement’: who wil...
Name: 3823, dtype: object
title    AT&T and Time Warner chiefs grilled on $85.4bn...
Name: 

title    As U.S. Confronts Russia, Trump’s Admiration O...
Name: 3916, dtype: object
title    Putin Says Russia Won’t Expel Diplomats In Res...
Name: 3917, dtype: object
title    (Some Of) Our Favorite Visual Stories of 2016
Name: 3918, dtype: object
title    A Skeptic Fact-Checks Yoga’s Health Claims And...
Name: 3919, dtype: object
title    From Psychedelics To Alzheimer’s, 2016 Was A G...
Name: 3920, dtype: object
title    By Returning To Farming’s Roots, He Found His ...
Name: 3921, dtype: object
title    Reading The Game: The Last Of Us
Name: 3922, dtype: object
title    Reading The Game: No Man’s Sky
Name: 3923, dtype: object
title    High Demand, Low Supply: Colorado River Water ...
Name: 3924, dtype: object
title    Ben Johnston Hears The Notes Between The Notes
Name: 3925, dtype: object
title    On The Men Who Rattled Pop’s Gender Rules — An...
Name: 3926, dtype: object
title    U.S. Ethics Chief Was Behind Those Tweets Abou...
Name: 3927, dtype: object
title    As A Rough Yea

title    Mental Health Care Gets A Boost From 21st Cent...
Name: 4033, dtype: object
title    Winners And Losers With The 21st Century Cures...
Name: 4034, dtype: object
title    A Brighter Outlook Could Translate To A Longer...
Name: 4035, dtype: object
title    What Former Employees Say ITT Tech Did To Scam...
Name: 4036, dtype: object
title    Consumer Cost Emerges As Key Issue In Grilling...
Name: 4037, dtype: object
title    Trump’s Victory Tour Stump Speech, Annotated
Name: 4038, dtype: object
title    Trump Touts Military Strength In N.C. On Secon...
Name: 4039, dtype: object
title    North Platte Canteen: Where The Heartland Open...
Name: 4040, dtype: object
title    Ohio Legislature Moves To Ban Abortion As Earl...
Name: 4041, dtype: object
title    At The Heart Of A Polish Christmas Ad, A Hard ...
Name: 4042, dtype: object
title    Reporter’s Notebook: What It Was Like As A Mus...
Name: 4043, dtype: object
title    As Syrian Government Forces Advance, The War C...
Name: 4044,

title    To Fight Malaria, Scientists Try Genetic Engin...
Name: 4148, dtype: object
title    Partisan Divide Colors Obama’s Place In History 
Name: 4149, dtype: object
title    Study Offers Clues To Risk Of Zika Birth Defec...
Name: 4150, dtype: object
title    Should Trump Foes See Tillerson At State As La...
Name: 4151, dtype: object
title    PHOTOS: Dutch Court Decides Crimean Treasures ...
Name: 4152, dtype: object
title    Postelection Solidarity: A Woman Cooks For Her...
Name: 4153, dtype: object
title    An Obama-Backed Change At Voice Of America Has...
Name: 4154, dtype: object
title    For CIA Nominee Mike Pompeo, ’Not A Good Situa...
Name: 4155, dtype: object
title    How Fast Could GOP Congress Get Obamacare Repe...
Name: 4156, dtype: object
title    Taking Stock Of The State Of The Universe
Name: 4157, dtype: object
title    A Year Later, Unfiltered Flint Tap Water Is St...
Name: 4158, dtype: object
title    ’Politically Correct’: The Phrase Has Gone Fro...
Name: 4159, dty

title    ’Elena Of Avalor’ Takes The Throne As Disney’s...
Name: 4272, dtype: object
title    Fearing Arrest At Home, Turkish Military Offic...
Name: 4273, dtype: object
title    ’Normal’: The Word Of The Year (In A Year That...
Name: 4274, dtype: object
title    Facing Criticism, Eric Trump Will Stop Activel...
Name: 4275, dtype: object
title    Trump’s Businesses Could Be Tripped Up By A 20...
Name: 4276, dtype: object
title    Trump And Gingrich Disagree On That Whole Swam...
Name: 4277, dtype: object
title    Marzipan, The Sweet Taste Of The Holidays In M...
Name: 4278, dtype: object
title    To Cook A Holiday Turkey in Kolkata, It Takes ...
Name: 4279, dtype: object
title    The Classroom Where Fake News Fails
Name: 4280, dtype: object
title    Medicare Penalizes Hospitals In Crackdown On A...
Name: 4281, dtype: object
title    Medicare Pays For A Kidney Transplant, But Not...
Name: 4282, dtype: object
title    The 2016 NPR Music Jazz Critics Poll
Name: 4283, dtype: object
title  

title    In Pro-Brexit English City, A Jobs Crisis Is A...
Name: 4376, dtype: object
title    Trump praises Putin for holding back in U.S.-R...
Name: 4377, dtype: object
title    Russians leave country retreats in the U.S., o...
Name: 4378, dtype: object
title    Investors brace for 2017 shocks after surprise...
Name: 4379, dtype: object
title    Congo rulng party, opposition sign deal for Ka...
Name: 4380, dtype: object
title    Syria rebels: ceasefire ’void’ if government v...
Name: 4381, dtype: object
title    Caution marks Iraqi army advance against Islam...
Name: 4382, dtype: object
title    China steps up scrutiny on individual forex pu...
Name: 4383, dtype: object
title    Italy to seek to track down and deport migrant...
Name: 4384, dtype: object
title    Viewsroom Predictions 2017: Part 1
Name: 4385, dtype: object
title    Baghdad blasts kill 29 as Mosul fighting inten...
Name: 4386, dtype: object
title    Winter storm socks U.S. New England region, sn...
Name: 4387, dtype: ob

title    Airbnb, New York City settle rental law lawsuit
Name: 4477, dtype: object
title    Austrians roundly reject far right in presiden...
Name: 4478, dtype: object
title    Sioux chief asks protesters to disband, Trump ...
Name: 4479, dtype: object
title    Renzi to resign after referendum rout, leaving...
Name: 4480, dtype: object
title    Death toll rises to 36 from California warehou...
Name: 4481, dtype: object
title    Netanyahu to discuss ’bad’ Iran deal with Trum...
Name: 4482, dtype: object
title    New Zealand PM Key announces shock resignation...
Name: 4483, dtype: object
title    Euro gains as Italian vote goes as expected by...
Name: 4484, dtype: object
title    Wall Street rises with banks, investors shrug ...
Name: 4485, dtype: object
title    Fidel Castro interred in rock, closing last ch...
Name: 4486, dtype: object
title    BMW seeks to be ’coolest’ ride-hailing firm wi...
Name: 4487, dtype: object
title    U.S. seeks to reassure Beijing after Trump cal...
Name: 44

title    Syrian government forces press attack in east ...
Name: 4615, dtype: object
title    In Sky deal, Murdochs shield Fox from sagging ...
Name: 4616, dtype: object
title    Gambia President Jammeh rejects outcome of Dec...
Name: 4617, dtype: object
title    Trump team memo on climate change alarms Energ...
Name: 4618, dtype: object
title    Trump floats ban on defense firms hiring milit...
Name: 4619, dtype: object
title    Wisconsin judge rejects bid to stop election r...
Name: 4620, dtype: object
title    The Murdochs’ new reach for Sky is well timed
Name: 4621, dtype: object
title    Data Dive: Advice to the next secretary of sta...
Name: 4622, dtype: object
title    Obama orders review of 2016 election cyber att...
Name: 4623, dtype: object
title    Coke CEO Muhtar Kent hands reins to Quincey in...
Name: 4624, dtype: object
title    Trump’s threats chill corporate investment pla...
Name: 4625, dtype: object
title    Iraq says it destroys Mosul car bomb factories
Name: 4626, d

title    Wall St. slides after Fed raises rates; energy...
Name: 4735, dtype: object
title    Trump team disavows survey seeking names of cl...
Name: 4736, dtype: object
title    China installs weapons systems on artificial i...
Name: 4737, dtype: object
title    NZ to spend around $1.4 billion on rebuilding ...
Name: 4738, dtype: object
title    Shippers, online retailers seek way around ris...
Name: 4739, dtype: object
title    South Korea’s powerful business lobby group de...
Name: 4740, dtype: object
title    Abe, Putin agree to revive Japan-Russia securi...
Name: 4741, dtype: object
title    Factbox: The islands keeping Japan and Russia ...
Name: 4742, dtype: object
title    U.S. may target weapons seized by Islamic Stat...
Name: 4743, dtype: object
title    House Republicans accelerate efforts on tax re...
Name: 4744, dtype: object
title    Buses evacuate thousands of exhausted Aleppo r...
Name: 4745, dtype: object
title    Dollar, bond yields advance on Fed rate outlook
Name: 47

title    British companies absorb Brexit shock, get on ...
Name: 4845, dtype: object
title    Virtual assistants expected to top virtual rea...
Name: 4846, dtype: object
title    German courts should go after fake news on Fac...
Name: 4847, dtype: object
title    With water cannons and Southern belles, Trump ...
Name: 4848, dtype: object
title    Under threat in Washington, first lady’s food ...
Name: 4849, dtype: object
title    Germany says Tunisian’s fingerprints found in ...
Name: 4850, dtype: object
title    Gunman wounds three in Zurich mosque rampage, ...
Name: 4851, dtype: object
title    Exclusive: U.S. proposed $5 billion - 7 billio...
Name: 4852, dtype: object
title    Wall Street extends rally but Germany truck de...
Name: 4853, dtype: object
title    Boeing airplane unit to cut more jobs in 2017,...
Name: 4854, dtype: object
title    Gunfire erupts in Kinshasa as Congolese protes...
Name: 4855, dtype: object
title    Protests erupt in Congo as Kabila’s mandate ex...
Name: 

title    Trump, taking a break from White House prep, t...
Name: 4958, dtype: object
title    Actress Carrie Fisher suffers medical emergenc...
Name: 4959, dtype: object
title    U.S. housing, consumer confidence data bolster...
Name: 4960, dtype: object
title    How Deutsche’s big bet on Wall Street turned t...
Name: 4961, dtype: object
title    Icahn regulatory role gives activist investors...
Name: 4962, dtype: object
title    Stocks could suffer as Trump trade policy take...
Name: 4963, dtype: object
title    Malta hijack ends peacefully as Gaddafi loyali...
Name: 4964, dtype: object
title    U.S. forces embedding more to help Iraqis reta...
Name: 4965, dtype: object
title    Amazon starts flexing muscle in new space: air...
Name: 4966, dtype: object
title    Under scrutiny, Trump decides to dissolve his ...
Name: 4967, dtype: object
title    Exclusive: Trump team seeks names of officials...
Name: 4968, dtype: object
title    French-Swiss aid worker kidnapped in Malian ci...
Name: 

title    President Obama just took unprecedented steps ...
Name: 5055, dtype: object
title    The DNC race has become another fight over Ber...
Name: 5056, dtype: object
title    A simple guide to CRISPR, one of the biggest s...
Name: 5057, dtype: object
title    I quit alcohol to save my life. But I miss the...
Name: 5058, dtype: object
title    In 2016, the world lost an entire tier of prog...
Name: 5059, dtype: object
title    The real reason for Netanyahu’s showdown with ...
Name: 5060, dtype: object
title    The UN’s resolution on Israel doesn’t include ...
Name: 5061, dtype: object
title    9 questions about the UN vote on Israeli settl...
Name: 5062, dtype: object
title    New Year’s Eve will last one second longer tha...
Name: 5063, dtype: object
title    The 2016 culture war, as illustrated by the al...
Name: 5064, dtype: object
title    Donald Trump confirmed yesterday that his vete...
Name: 5065, dtype: object
title    One of 2016’s best shows and one of its best b...
Name: 

title    Trump’s governing strategy is taking shape — a...
Name: 5175, dtype: object
title    A pollster on the racial panic Obama’s preside...
Name: 5176, dtype: object
title    Trump is trying to use Taiwan as a bargaining ...
Name: 5177, dtype: object
title    Oil prices keep rising after Russia joins OPEC...
Name: 5178, dtype: object
title    Hillary Clinton’s campaign wants the Electoral...
Name: 5179, dtype: object
title    The FDA is slowly getting weaker. Here’s why t...
Name: 5180, dtype: object
title    We’re about to see states’ rights used defensi...
Name: 5181, dtype: object
title    Russian hackers, Donald Trump, and the 2016 el...
Name: 5182, dtype: object
title    Donald Trump producing The Celebrity Apprentic...
Name: 5183, dtype: object
title    Methane levels in the atmosphere are now risin...
Name: 5184, dtype: object
title    What becomes of Black Lives Matter in the age ...
Name: 5185, dtype: object
title    Golden Globe nominees 2017: complete nominatio...
Name: 

title    Berlin and Ankara: a tale of two terror attacks
Name: 5296, dtype: object
title    The 11 best albums you might have missed this ...
Name: 5297, dtype: object
title    Skip Die Hard this year. Black Christmas is th...
Name: 5298, dtype: object
title    Washington, DC, passed one of the nation’s mos...
Name: 5299, dtype: object
title    Electoral integrity in all 50 US states, ranke...
Name: 5300, dtype: object
title    How to use taxes to close the trade deficit
Name: 5301, dtype: object
title    Why are there so few Hanukkah movies?
Name: 5302, dtype: object
title    Obama just took a parting shot at Israel — and...
Name: 5303, dtype: object
title    Patriots Day is a stirring tale of ordinary he...
Name: 5304, dtype: object
title    What the holidays are like for a recovering al...
Name: 5305, dtype: object
title    Hidden Figures, about 3 black women at NASA in...
Name: 5306, dtype: object
title    Sense8 Christmas special review: Netflix’s ser...
Name: 5307, dtype: object


title    Trump, the pleaser president
Name: 5398, dtype: object
title    Donald Trump’s Cabinet assembly instructions
Name: 5399, dtype: object
title    Sorry, Lt. Gen. Flynn, it’s unrealistic to ‘wi...
Name: 5400, dtype: object
title    Romney’s groveling before Trump is horrible to...
Name: 5401, dtype: object
title    After a toddler accidentally shot and killed h...
Name: 5402, dtype: object
title    Washington Monument to remain closed for at le...
Name: 5403, dtype: object
title    One year after the San Bernardino attack, poli...
Name: 5404, dtype: object
title    Using similar tactics, Austrian nationalists h...
Name: 5405, dtype: object
title    Second-chance law for young criminals puts vio...
Name: 5406, dtype: object
title    U.S. economy added 178,000 jobs in November; u...
Name: 5407, dtype: object
title    When Americans land in trouble abroad, these e...
Name: 5408, dtype: object
title    Trump’s emerging Cabinet is looking less Trump...
Name: 5409, dtype: object
title 

title    Annie Glenn: ‘When I called John, he cried. Pe...
Name: 5532, dtype: object
title    A teen fired up Facebook Live from the highway...
Name: 5533, dtype: object
title    A rare half-male, half-female butterfly — and ...
Name: 5534, dtype: object
title    Trump expected to pick ExxonMobil chief Rex Ti...
Name: 5535, dtype: object
title    Who is Rex Tillerson, the ExxonMobil chairman ...
Name: 5536, dtype: object
title    Secret CIA assessment says Russia was trying t...
Name: 5537, dtype: object
title    The CIA concluded that Russia worked to elect ...
Name: 5538, dtype: object
title    Schumer calls for full investigation by Congre...
Name: 5539, dtype: object
title    The best movies of 2016: ‘Moonlight,’ ‘Manches...
Name: 5540, dtype: object
title    ISIS is back in the ancient Syrian city of Pal...
Name: 5541, dtype: object
title    Taiwan is a country with history and people. I...
Name: 5542, dtype: object
title    Five myths about U.S.-China relations
Name: 5543, dtype:

Name: 5640, dtype: object
title    Amid outcry, N.C. GOP’s plan to curb Democrati...
Name: 5641, dtype: object
title    Ivanka Trump could be the most powerful first ...
Name: 5642, dtype: object
title    Trump Grill’s star rating is plummeting, and Y...
Name: 5643, dtype: object
title    After first bitter blast, U.S. faces second wa...
Name: 5644, dtype: object
title    It took three years for Yahoo to tell us about...
Name: 5645, dtype: object
title    Defying skeptics, Kim Jong Un marks five years...
Name: 5646, dtype: object
title    Saddam Hussein should have been left to run Ir...
Name: 5647, dtype: object
title    In last-shot bid, thousands urge electoral col...
Name: 5648, dtype: object
title    Trump is stoking his base on his pre-inaugural...
Name: 5649, dtype: object
title    For a president-elect who touts ‘America first...
Name: 5650, dtype: object
title    Trump names Rep. Mick Mulvaney, a fiscal hawk,...
Name: 5651, dtype: object
title    In Pakistan, five girls were k

title    On Christmas, and ‘taming the savageness of men’
Name: 5766, dtype: object
title    Some lessons from Jesus, for all of us
Name: 5767, dtype: object
title    Evangelicals side with Israel. That’s hurting ...
Name: 5768, dtype: object
title    What the Hanukkah story teaches us about the T...
Name: 5769, dtype: object
title    Europe may face a grim future with terrorism a...
Name: 5770, dtype: object
title    Grace, unwrapped
Name: 5771, dtype: object
title    They’ll be home for Christmas: Five homeless c...
Name: 5772, dtype: object
title    Rockettes aren’t being forced to perform at Tr...
Name: 5773, dtype: object
title    Best and worst moments from the Redskins’ win ...
Name: 5774, dtype: object
title    NFL Week 16: Carr and Mariota suffer broken fi...
Name: 5775, dtype: object
title    5 mistakes Americans are making with their money
Name: 5776, dtype: object
title    5 reasons December is a horrible time to date
Name: 5777, dtype: object
title    9 souvenirs you’d be 

In [75]:
bert_model = topic_model
clusterer = bert_model.hdbscan_model

In [76]:
tree = clusterer.condensed_tree_
clusters = tree._select_clusters()

In [83]:
cluster_id = clusters[0]
condensed_tree = tree
raw_tree = condensed_tree._raw_tree

In [91]:
raw_tree[raw_tree['child_size'] > 1]

array([(5876, 5877, 0.65858744,  127), (5876, 5878, 0.65858744, 5644),
       (5878, 5879, 0.76176836,  118), (5878, 5880, 0.76176836, 5518),
       (5880, 5881, 0.91096753,  173), (5880, 5882, 0.91096753, 5249),
       (5882, 5883, 0.91900434,  123), (5882, 5884, 0.91900434, 5103),
       (5884, 5885, 0.93685279,   74), (5884, 5886, 0.93685279, 5013),
       (5886, 5887, 0.9903625 ,   60), (5886, 5888, 0.9903625 , 4891),
       (5888, 5889, 1.09116835,  118), (5888, 5890, 1.09116835, 4622),
       (5890, 5891, 1.24556222,   73), (5890, 5892, 1.24556222, 4238),
       (5892, 5893, 1.25615491, 4099), (5892, 5894, 1.25615491,  125),
       (5893, 5895, 1.32179313,  426), (5893, 5896, 1.32179313, 3548),
       (5896, 5897, 1.32484042,  742), (5896, 5898, 1.32484042, 2801),
       (5898, 5899, 1.34028009,   77), (5898, 5900, 1.34028009, 2703),
       (5897, 5901, 1.40872835,  145), (5897, 5902, 1.40872835,  561),
       (5902, 5903, 1.54409042,  151), (5902, 5904, 1.54409042,  357),
      

In [82]:
cluster_tree = raw_tree[raw_tree['child_size'] > 1]
cluster_tree

array([(5876, 5877, 0.65858744,  127), (5876, 5878, 0.65858744, 5644),
       (5878, 5879, 0.76176836,  118), (5878, 5880, 0.76176836, 5518),
       (5880, 5881, 0.91096753,  173), (5880, 5882, 0.91096753, 5249),
       (5882, 5883, 0.91900434,  123), (5882, 5884, 0.91900434, 5103),
       (5884, 5885, 0.93685279,   74), (5884, 5886, 0.93685279, 5013),
       (5886, 5887, 0.9903625 ,   60), (5886, 5888, 0.9903625 , 4891),
       (5888, 5889, 1.09116835,  118), (5888, 5890, 1.09116835, 4622),
       (5890, 5891, 1.24556222,   73), (5890, 5892, 1.24556222, 4238),
       (5892, 5893, 1.25615491, 4099), (5892, 5894, 1.25615491,  125),
       (5893, 5895, 1.32179313,  426), (5893, 5896, 1.32179313, 3548),
       (5896, 5897, 1.32484042,  742), (5896, 5898, 1.32484042, 2801),
       (5898, 5899, 1.34028009,   77), (5898, 5900, 1.34028009, 2703),
       (5897, 5901, 1.40872835,  145), (5897, 5902, 1.40872835,  561),
       (5902, 5903, 1.54409042,  151), (5902, 5904, 1.54409042,  357),
      

In [106]:
print(df_dataset.shape)
leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
print(leaves)

(5876, 7)
[5877.0]


In [108]:
len(clusters)

21

In [109]:
for cl in range(0, len(clusters)):
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, clusters[cl])
    print(leaves)

[5877.0]
[5879.0]
[5881.0]
[5883.0]
[5885.0]
[5887.0]
[5889.0]
[5891.0]
[5894.0]
[5913, 5914]
[5899.0]
[5901.0]
[5903.0]
[5906.0]
[5907.0]
[5909.0]
[5910.0]
[5911.0]
[5915.0]
[5917.0]
[5927, 5928, 5931, 5932, 5925, 5929, 5930, 5922]


In [115]:
points_list = []
for cl in range(0, len(clusters)):
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, clusters[cl])
    result_points = np.array([])
    result_points_val = np.array([])
    for leaf in leaves:
        #max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
        #points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        #points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        points = raw_tree['child'][(raw_tree['parent'] == leaf)]
        points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf)]
        
        result_points = np.hstack((result_points, points))
        result_points_val = np.hstack((result_points_val, points_val))   
        points_list.append(result_points)        

In [122]:
xx = np.array(points_list)
kk = np.array([])
for x in xx:
    kk = np.hstack((kk, x))
kk    

array([1172., 3495., 1418., ..., 4315., 3265., 3376.])

In [170]:
len(np.unique(kk))

3348

In [173]:
topic_model = BERTopic.load('../raw_data/proj_final/10_docs_per_topic/2016_9_BERTopic_model_2_2_raw_content')
df_dataset = pd.read_csv('../raw_data/proj_final/10_docs_per_topic/2016_9_dataset.csv')
df_topic = pd.read_csv('../raw_data/proj_final/10_docs_per_topic/2016_9_BERTopic_Info.csv')
docs = df_dataset['content'].values
topics  = df_topic['Topic'].values
topics, probs = topic_model.fit_transform(docs)

In [177]:
new_topics, new_probs = topic_model.reduce_topics(docs, topics, probabilities=probs, nr_topics=18)
new_topics

IndexError: index 129 is out of bounds for axis 1 with size 129

In [191]:
probs

array([[6.86264621e-003, 3.22731969e-003, 7.14934155e-003, ...,
        1.02438619e-002, 1.00701062e-002, 8.59272550e-003],
       [1.01948770e-003, 1.64285811e-003, 1.06858406e-003, ...,
        2.06660344e-003, 1.99601114e-003, 2.08616176e-003],
       [7.25168866e-004, 9.23610481e-004, 6.87608392e-004, ...,
        1.30461630e-003, 1.34605736e-003, 1.61309061e-003],
       ...,
       [1.72938117e-307, 1.36372769e-307, 2.08835531e-307, ...,
        3.14309238e-307, 3.73655166e-307, 2.68425320e-307],
       [1.72145197e-307, 1.36134629e-307, 2.08603864e-307, ...,
        3.10159120e-307, 3.69930204e-307, 2.66582545e-307],
       [8.67219956e-308, 1.79779451e-307, 9.45775809e-308, ...,
        1.31951923e-307, 1.49547094e-307, 1.55844380e-307]])

In [180]:
df_dataset.index.values

array([   0,    1,    2, ..., 5680, 5681, 5682])

In [187]:
#df_news = pd.DataFrame(data={'doc_id': df_dataset.index.values, 'topic': topics, 'probabilities': probs})
df_news = pd.DataFrame(data={'doc_id': df_dataset.index.values, 'topic': topics})
df_news

Unnamed: 0,doc_id,topic
0,0,12
1,1,-1
2,2,-1
3,3,20
4,4,-1
...,...,...
5678,5678,-1
5679,5679,100
5680,5680,56
5681,5681,56


In [None]:
df_news['topic'].min()

## Test 2

In [6]:
#topic_model = BERTopic.load('../raw_data/proj_final/10_docs_per_topic/2016_9_BERTopic_model_2_2_raw_content')
#df_dataset = pd.read_csv('../raw_data/proj_final/10_docs_per_topic/2016_9_dataset.csv')
#df_topic = pd.read_csv('../raw_data/proj_final/10_docs_per_topic/2016_9_BERTopic_Info.csv')
#docs = df_dataset['content'].values
#topics  = df_topic['Topic'].values
#topics, probs = topic_model.fit_transform(docs)

In [6]:
df_topic_docs = pd.read_csv('../raw_data/proj_final/10_docs_per_topic/2016_12_BERTopic_TopicDocuments_reduction.csv')
print(df_topic_docs.shape)
#df_temp = df_topic_documents_hdbscan[df_topic_docs['topic']==-1].sort_values(by=['probabilities'], ascending=True).reset_index(drop=True)
df_temp = df_topic_docs[df_topic_docs['topic']==-1].reset_index(drop=True)

(5876, 2)


In [5]:
df_temp

Unnamed: 0,document_id,topic
0,2,-1
1,4,-1
2,5,-1
3,6,-1
4,7,-1
...,...,...
1651,5862,-1
1652,5864,-1
1653,5868,-1
1654,5870,-1
