## Imports

In [2]:
import pandas as pd
import numpy as np
import hdbscan
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Load data

In [33]:
%%time
df = pd.read_csv('../raw_data/proj_final/political_dataset.csv')
df['title'].fillna('no title', inplace = True)
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)

print(df.shape)
df.head()

(99778, 6)
CPU times: user 4.45 s, sys: 683 ms, total: 5.13 s
Wall time: 6.16 s


Unnamed: 0.1,Unnamed: 0,id,title,year,month,content
0,415,36361,2015: Sold Out South Carolina Tea Party Conven...,2015.0,1.0,"MYRTLE BEACH, South Carolina — The South Ca..."
1,417,57593,Narendra Modi Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of India’s P...
2,418,59225,Little Richard Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of ”Archit...
3,420,60219,"Cycling’s marathon man attempts 75,000 miles i...",2015.0,1.0,(CNN) While many people are recovering from a...
4,422,60223,Cops: Georgia police chief on leave after wife...,2015.0,1.0,(CNN) Magazines and websites regularly rank P...


## Split dataset

In [18]:
df_split = df[['id', 'year', 'month']].groupby(['year', 'month']).count().reset_index()
df_split.head(2)

Unnamed: 0,year,month,id
0,2015.0,1.0,285
1,2015.0,2.0,139


In [22]:
df[(df['year'] == df_split['year'].iloc[0]) & (df['month'] == df_split['month'].iloc[1])]

Unnamed: 0.1,Unnamed: 0,id,title,year,month,content
285,889,58852,Marco Rubio Fast Facts,2015.0,2.0,(CNN) Here is a look at the life of Republica...
286,891,60931,Super Bowl: New England Patriots beat Seattle ...,2015.0,2.0,"(CNN) Where Katy Perry needed a big cat, the ..."
287,893,60934,Is U.S. coalition winning war against ISIS? (O...,2015.0,2.0,"(CNN) Nearly five months ago, President Obama..."
288,895,60936,’The Evil Hours’: Lessons of PTSD,2015.0,2.0,"(CNN) During World War II, a soldier under U...."
289,897,60938,"Huckabee compares gay marriage to drinking, sw...",2015.0,2.0,Washington (CNN) Republican presidential hopef...
...,...,...,...,...,...,...
419,1171,162390,Guess How Much Of Uncle Sam’s Money Goes To Fo...,2015.0,2.0,How much of the federal budget goes to foreign...
420,1172,163338,How NAFTA Changed American (And Mexican) Food ...,2015.0,2.0,If you were to try and list the biggest for ...
421,1173,163577,Beyond Rash And Fever: How Measles Can Kill,2015.0,2.0,"In 1962, children’s book author Roald Dahl los..."
422,1175,198817,The Supreme Court just took the case that will...,2015.0,2.0,The Supreme Court has agreed to take up the ...


## BERTopic

In [78]:
def generate_topic_info(bert_model, file_name):
    df_topic_info = bert_model.get_topic_info()
    df_topic_info.to_csv(file_name, header=True, index=False, encoding='utf-8')
    del df_topic_info

def generate_terms(bert_model, file_name):
    topics = bert_model.get_topics()
    number_of_topics = len(topics)-1
    num_of_terms = len(topics[0])

    topic_columns = ['topic', 'term', 'weight']

    df_topics = pd.DataFrame(columns=topic_columns)
    for i in range(-1,number_of_topics): 
        for j in range(num_of_terms):
            new_topic = {}
            new_topic['topic'] = topic_model.topic_names[i]
            new_topic['term'] = topics[i][j][0]
            new_topic['weight'] = round(topics[i][j][1],6)
            df_topics = df_topics.append(new_topic, ignore_index=True)
    df_topics.to_csv(file_name, header=True, index=False, encoding='utf-8')
    del df_topics

def correlation_matrix_to_df(df_corr):
    list_done = []
    lits_item1 = []
    lits_item2 = []
    list_corr = []

    for k in range(1,df_corr.shape[1]):
        for i, j in df_corr.iterrows():
            #if (df_corr.columns[k] != j[0]) and (j[0] not in list_done):
            #if (j[0] not in list_done):
            lits_item1.append(df_corr.columns[k])
            lits_item2.append(j[0])
            list_corr.append(j[k])
        list_done.append(df_corr.columns[k])

    corr_dict = {'topic1': lits_item1,
                 'topic2': lits_item2,
                 'similarity': list_corr}
    df_res = pd.DataFrame(corr_dict)
    df_res = df_res.sort_values(by='similarity', ascending=False).copy()
    df_res.reset_index(inplace=True,drop=True)
    return df_res.copy() 

def generate_topic_similarity(bert_model, file_name):
    corr_matrix = bert_model.topic_sim_matrix

    topics = bert_model.get_topics()
    number_of_topics = len(topics)-1

    topic_columns = ['topic']
    for i in range(-1,number_of_topics):
        topic_columns.append(bert_model.topic_names[i])

    df_similarity = pd.DataFrame(columns=topic_columns)
    for i in range(-1,number_of_topics):
        new_topic = {}
        new_topic['topic'] = bert_model.topic_names[i]
        for j in range(-1,number_of_topics):
            new_topic[bert_model.topic_names[j]] = round(corr_matrix[i,j],6)
        df_similarity = df_similarity.append(new_topic, ignore_index=True)
        
    df_topic_similarity = correlation_matrix_to_df(df_similarity)
    df_topic_similarity.to_csv(file_name, header=True, index=False, encoding='utf-8')
    del df_topic_similarity

def get_topic_documents(cluster_id, condensed_tree):
    result_points = np.array([])
    result_points_val = np.array([])
    
    #assert cluster_id > -1, "The topic's label should be greater than -1!"
    
    if cluster_id <= -1:
        return result_points.astype(np.int64), result_points_val.astype(np.float64)
        
    raw_tree = condensed_tree._raw_tree
    
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]
    
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf) 
    for leaf in leaves:
        #max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
        #points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        #points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        points = raw_tree['child'][(raw_tree['parent'] == leaf)]
        points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf)]
        result_points = np.hstack((result_points, points))
        result_points_val = np.hstack((result_points_val, points_val))   
    return result_points.astype(np.int64), result_points_val.astype(np.float64)

def generate_topic_documents(bert_model, file_name):
    clusterer = bert_model.hdbscan_model
    tree = clusterer.condensed_tree_
    clusters = tree._select_clusters()

    number_of_topics = len(clusters)

    relevant_columns = ['topic', 'document', 'lambda_val']
    df_rel_docs = pd.DataFrame(columns=relevant_columns)

    for i in range(-1, number_of_topics):
        rel_docs, lambda_vals = get_topic_documents(clusters[i], tree)
        topic_name = bert_model.topic_names[i]
        for j in range(0, len(rel_docs)):
            new_doc_rel = {}
            new_doc_rel['topic'] = topic_name
            new_doc_rel['document'] = rel_docs[j]
            new_doc_rel['lambda_val'] = round(lambda_vals[j],6)
            df_rel_docs = df_rel_docs.append(new_doc_rel, ignore_index=True)
    df_rel_docs.to_csv(file_name, header=True, index=False, encoding='utf-8')
    del df_rel_docs

def generate_topic_documents_hdbscan(bert_model, file_name):
    clusterer = bert_model.hdbscan_model

    doc_topic_columns = ['document', 'topic', 'probabilities']
    df_doc_topic = pd.DataFrame(columns=doc_topic_columns)

    for i, _ in enumerate(clusterer.labels_):
        new_doc_topic = {}
        new_doc_topic['document'] = i
        new_doc_topic['topic'] = clusterer.labels_[i]
        new_doc_topic['probabilities'] = clusterer.probabilities_[i]
        df_doc_topic = df_doc_topic.append(new_doc_topic, ignore_index=True)
    df_doc_topic.to_csv(file_name, header=True, index=False, encoding='utf-8')
    del df_doc_topic
    
def generate_documents_similarity(bert_model, docs, file_name):
    emb_model = bert_model.embedding_model
    # Create documents embeddings
    embeddings = emb_model.embedding_model.encode(docs)
    doc_sim_matrix = cosine_similarity(embeddings, embeddings)
    np.save(file_name, doc_sim_matrix)
    del doc_sim_matrix


In [79]:
%%time

df = pd.read_csv('../raw_data/proj_final/political_dataset.csv')
df['title'].fillna('no title', inplace = True)
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
df_split = df[['id', 'year', 'month']].groupby(['year', 'month']).count().reset_index()

list_prefix = []
for row_id, row in df_split.iterrows():
    print(str(row['year']) + ' - ' + str(row['month']))
    
    file_path = '../raw_data/proj_final/per_year_month/'
    prefix = f'{str(int(row["year"]))}_{str(int(row["month"]))}_'
    list_prefix.append(prefix)
    
    df_temp = df[(df['year'] == df_split['year'].iloc[row_id]) & (df['month'] == df_split['month'].iloc[row_id])].copy()
    df_temp = df_temp.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
    docs = df_temp['content'].values
    
    file_name = file_path + prefix + 'dataset.csv' 
    df_temp.to_csv(file_name, header=True, index=True, encoding='utf-8')
    del df_temp
        
    sentence_model = SentenceTransformer("paraphrase-mpnet-base-v2")
    topic_model = BERTopic(min_topic_size=10, language='english', calculate_probabilities=True, n_gram_range=(2,2), embedding_model=sentence_model)
    topic_model.fit_transform(docs)
    print('fit_transform done ...')
    
    file_name = file_path + prefix + 'BERTopic_model_2_2_raw_content' 
    topic_model.save(file_name)
    
    file_name = file_path + prefix + 'BERTopic_DocumentsSimilarity.npy'
    generate_documents_similarity(topic_model, docs, file_name)
    print('documents_similarit done ...')

    file_name = file_path + prefix + 'BERTopic_Info.csv'    
    generate_topic_info(topic_model, file_name)
    
    file_name = file_path + prefix + 'BERTopic_Terms.csv'  
    generate_terms(topic_model, file_name)
    
    #file_name = file_path + prefix + 'BERTopic_TopicSimilarity.csv'
    #generate_topic_similarity(topic_model, file_name)
    file_name = file_path + prefix + 'BERTopic_TopicSimilarity.npy'
    np.save(file_name, topic_model.topic_sim_matrix)
    
    file_name = file_path + prefix + 'BERTopic_TopicDocuments.csv'  
    generate_topic_documents(topic_model, file_name)
    
    file_name = file_path + prefix + 'HDBSCAN_TopicDocuments.csv'  
    generate_topic_documents_hdbscan(topic_model, file_name)
    
    del docs
    del topic_model
    del sentence_model    
    break
    
df_prefix = pd.DataFrame(list_prefix, columns=['year_month'])
file_name = file_path + 'list_of_years_months.csv' 
df_prefix.to_csv(file_name, header=True, index=True, encoding='utf-8')

2015.0 - 1.0
fit_transform done ...
documents_similarit done ...
CPU times: user 8min 1s, sys: 20.3 s, total: 8min 21s
Wall time: 1min 29s


In [53]:
df_prefix

Unnamed: 0,year_month
0,2015_1_


In [61]:
file_path = '../raw_data/proj_final/per_year_month/'
file_name = file_path + '2015_1_BERTopic_DocumentsSimilarity.npy'
xx = np.load(file_name)
xx.shape

(285, 285)

In [65]:
file_path = '../raw_data/proj_final/per_year_month/'
file_name = file_path + '2015_1_BERTopic_TopicSimilarity.npy'
xx = np.load(file_name)
xx.shape

(11, 11)

## sentiment analysis

In [3]:
from LeWagon_FinalProject.sentiment import Sentiment

file_path = '../raw_data/proj_final/per_year_month/'
file_name = file_path + '2015_1_dataset.csv'
df_news = pd.read_csv(file_name)
df_news

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,title,year,month,content
0,0,415,36361,2015: Sold Out South Carolina Tea Party Conven...,2015.0,1.0,"MYRTLE BEACH, South Carolina — The South Ca..."
1,1,417,57593,Narendra Modi Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of India’s P...
2,2,418,59225,Little Richard Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of ”Archit...
3,3,420,60219,"Cycling’s marathon man attempts 75,000 miles i...",2015.0,1.0,(CNN) While many people are recovering from a...
4,4,422,60223,Cops: Georgia police chief on leave after wife...,2015.0,1.0,(CNN) Magazines and websites regularly rank P...
...,...,...,...,...,...,...,...
280,280,882,166279,Going Dry: The Benefits Of A Month Without Booze,2015.0,1.0,"As New Year’s resolutions go, cutting back on ..."
281,281,883,171140,Diabetes Technology Inches Closer To An Artifi...,2015.0,1.0,Every person who uses insulin to manage diabet...
282,282,886,174240,"Low-Key, Real-Life Heroism In ’March: Book Two’",2015.0,1.0,Some media are for heroes. Ava DuVernay’s gr...
283,283,887,198904,How America got addicted to road salt — and wh...,2015.0,1.0,The US economy doesn’t just grind to a halt e...


In [None]:
%%time
file_path = '../raw_data/proj_final/per_year_month/'

file_name = file_path + 'list_of_years_months.csv'
df_prefix = pd.read_csv(file_name)
for _, row in df_prefix.iterrows():
    print(row["year_month"])
    prefix = row["year_month"]
    
    file_name = file_path + prefix + 'dataset.csv'
    df_news = pd.read_csv(file_name)
    sentiment = Sentiment(df_news['content'])
    sentiment.sentiment_analisys(1300)
    file_name = file_path + prefix + 'sentiment.csv'
    sentiment.pred.to_csv(file_name, header=True, index=False, encoding='utf-8')
    


In [10]:
%%time
sentiment = Sentiment(df_news['content'])
#1300
sentiment.sentiment_analisys(1300)

1300
CPU times: user 12min 57s, sys: 3.13 s, total: 13min
Wall time: 2min 25s


In [11]:
sentiment.pred

Unnamed: 0,negative,neutral,positive
0,0.170203,0.614661,0.215137
1,0.068293,0.792225,0.139481
2,0.024812,0.656596,0.318592
3,0.094022,0.575034,0.330944
4,0.316410,0.631359,0.052231
...,...,...,...
280,0.031203,0.432074,0.536723
281,0.053679,0.458965,0.487355
282,0.056236,0.542271,0.401493
283,0.683346,0.267782,0.048872


In [17]:
len(df_news['content'].iloc[0])
df_news['content'].iloc[0]

'MYRTLE BEACH, South Carolina  —   The South Carolina Tea Party Coalition Convention is underway, and event organizers tell Breitbart News that they’ve been overwhelmed by an unexpectedly high level of interest among activists statewide.  The event is sold out and the main ballroom, which seats more than 700 people, isn’t large enough for the full crowd. South Carolina Tea Party activist Joe Dugan, the convention’s executive producer, tells Breitbart News the group has created an extra, overflow room that will broadcast the major speeches from the ballroom —  and speakers will head down there to mingle with event attendees after their speeches. There’s overwhelming interest outside the convention as well. Event organizers tell Breitbart News that the group’s website has cratered under immense traffic many times leading up to this weekend, due to the widespread statewide and national interest in the convention. An estimated crowd of about 1, 100 to 1, 200 will hear some potential presid

In [None]:
from LeWagon_FinalProject.sentiment import SentimentLarge

sentiment = SentimentLarge(df_news['content'].head(1))
sentiment.predict(3347)

In [None]:
sentiment.pred

In [5]:
df_news['content'].iloc[0]

'MYRTLE BEA'

In [4]:
from transformers import pipeline
model = pipeline("sentiment-analysis",model="siebert/sentiment-roberta-large-english")
pred = model(df_news['content'].iloc[0][0:2400])
pred

[{'label': 'POSITIVE', 'score': 0.9979955554008484}]

In [9]:
pred[0]['label']
pred[0]['score']

0.9979955554008484

In [15]:
for _, row in df_news.iterrows():
    print(row['content'][0:10])
    #pred = model(row['content'][0:text_size])
    break

MYRTLE BEA
