## Imports

In [1]:
import pandas as pd
import numpy as np

## Prepare data

In [None]:
def prepare_most_important_center(df_topic, keep_documents_no_topic=False):
    ''' Reorder topics so the most important one ends up in the center.

    The first row of ``df_topic`` is treated as the most important topic,
    optionally preceded by one "no topic" row (``Topic == -1``).
    NOTE(review): presumably the rows arrive sorted by number of news,
    most frequent first - confirm with the caller.

    Resulting order:
    every other remaining topic (sorted by ``Topic`` descending), then the
    most important topic, then the other remaining topics in input order,
    then - only when requested and present - the ``-1`` row.

    Parameters
    ----------
    df_topic : pandas.DataFrame
        Topic table with at least a ``Topic`` column. It is not modified.
    keep_documents_no_topic : bool, default False
        Append the ``Topic == -1`` row at the end of the result. Ignored
        when the first row is not topic ``-1``.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh ``0..n-1`` index. An empty or single-row
        input is handled without raising.
    '''
    # Work positionally on a fresh RangeIndex so the caller's index labels
    # (e.g. after filtering or sorting) cannot break the row selection.
    df_topic = df_topic.reset_index(drop=True)

    # split off topic -1 when it leads the table
    df_no_topic = df_topic.iloc[:0]
    if not df_topic.empty and df_topic['Topic'].iloc[0] == -1:
        df_no_topic = df_topic.iloc[:1]
        df_topic = df_topic.iloc[1:].reset_index(drop=True)
    else:
        keep_documents_no_topic = False

    # most important topic, and everything that comes after it
    df_first = df_topic.iloc[:1]
    df_rest = df_topic.iloc[1:].reset_index(drop=True)

    # even positions go before the center, odd positions after it
    df_even = df_rest.iloc[::2].sort_values(by='Topic', ascending=False)
    df_odd = df_rest.iloc[1::2]

    parts = [df_even, df_first, df_odd]
    if keep_documents_no_topic:
        parts.append(df_no_topic)

    return pd.concat(parts, ignore_index=True)

def prepare_higher_probabilities_center(df_docs):
    ''' Reorder documents so the most likely ones end up in the center.

    Documents with ``probabilities == 1`` form the center block (sorted by
    ``document`` ascending). The remaining rows are split by position:
    even positions go before the center sorted by probability ascending,
    odd positions go after it sorted by probability descending.

    Parameters
    ----------
    df_docs : pandas.DataFrame
        Needs the columns ``document`` and ``probabilities``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Reordered frame with a fresh ``0..n-1`` index. When no document has
        probability 1 an unchanged copy of the input is returned (original
        index kept).
    '''
    df_docs = df_docs.copy()

    # evaluate the "probability is exactly 1" condition only once
    is_center = df_docs['probabilities'] == 1
    if not is_center.any():
        return df_docs

    # most likely documents
    df_center = df_docs[is_center].sort_values(by='document', ascending=True)

    # everything else, renumbered so even/odd refers to position
    df_rest = df_docs[~is_center].reset_index(drop=True)

    # Non-inplace sorts: sorting an ``iloc`` slice in place raises pandas'
    # SettingWithCopyWarning.
    df_even = df_rest.iloc[::2].sort_values(by='probabilities', ascending=True)
    df_odd = df_rest.iloc[1::2].sort_values(by='probabilities', ascending=False)

    return pd.concat([df_even, df_center, df_odd], ignore_index=True)

In [None]:
# Build one `<prefix>graph_data.csv` for every year/month prefix listed in
# `list_of_years_months.csv`: topics are ordered with the most important one
# in the center, documents of each topic are ordered with the most likely
# ones in the center, and every document row is enriched with similarity
# and sentiment information.
file_path = '../raw_data/proj_final/per_year_month/'

# A document's `document1_size` is the number of entries in its similarity
# row that reach this value.
similarity_threshold = 0.6

file_name = file_path + 'list_of_years_months.csv'
df_prefix = pd.read_csv(file_name)
for _, row in df_prefix.iterrows():
    print(row["year_month"])
    prefix = row["year_month"]

    # A period is processed only when its topic table has more than one row.
    topic_info_exists = False
    file_name = file_path + prefix + 'BERTopic_Info.csv'
    try:
        df_topic_info = pd.read_csv(file_name)
    except (OSError, ValueError):
        # Missing, unreadable or unparsable topic file (pandas' parser
        # errors derive from ValueError): skip this period.
        print(f'There are no topics for {prefix}')
    else:
        if df_topic_info.shape[0] > 1:
            topic_info_exists = True
        else:
            print(f'There are no topics for {prefix}')

    if topic_info_exists:
        # Sort topics, most important one in the middle of the Hilbert curve
        df_topic_info = prepare_most_important_center(df_topic_info, keep_documents_no_topic=False)
        df_topic_info['similarity_previous_topic'] = 0.0

        # Cosine similarity to previous topic
        # NOTE(review): the "+1" offsets presumably skip a leading row/column
        # for topic -1 in the matrix - confirm against the file's producer.
        file_name = file_path + prefix + 'BERTopic_TopicSimilarity.npy'
        matrix_topics_similarity = np.load(file_name)
        for ind_row, topic_row in df_topic_info.iterrows():
            if ind_row > 0:
                # `.loc[row, column]` writes into the frame itself; the chained
                # `df[column].iloc[row] = ...` form warns and is not applied
                # under pandas Copy-on-Write.
                df_topic_info.loc[ind_row, 'similarity_previous_topic'] = matrix_topics_similarity[int(topic_row['Topic'])+1, int(previous_topic)+1]
            previous_topic = topic_row['Topic']

        # Topic documents
        file_name = file_path + prefix + 'BERTopic_DocumentsSimilarity.npy'
        matrix_documents_similarity = np.load(file_name)

        file_name = file_path + prefix + 'HDBSCAN_TopicDocuments.csv'
        df_topic_documents_hdbscan = pd.read_csv(file_name)

        file_name = file_path + prefix + 'sentiment.csv'
        df_semtiment = pd.read_csv(file_name)

        # One frame per topic, concatenated once at the end.
        graph_parts = []
        for _, topic_row in df_topic_info.iterrows():
            top = topic_row['Topic']
            df_temp = df_topic_documents_hdbscan[df_topic_documents_hdbscan['topic']==top].sort_values(by=['probabilities'], ascending=True).reset_index(drop=True)
            df_temp = prepare_higher_probabilities_center(df_temp)

            # New columns, created in the order they appear in the CSV.
            df_temp['document1_size'] = 1
            df_temp['document2'] = 0.0
            df_temp['similarity_previous_document'] = 0.0
            # Topic-level values are the same for every document of the topic.
            df_temp['similarity_previous_topic'] = topic_row['similarity_previous_topic']
            df_temp['topic_name'] = topic_row['Name']
            df_temp['sentimet_classification'] = 'neutral'
            df_temp['sentimet_score'] = 0.5

            # df_temp has a 0..n-1 index here, so the label doubles as position.
            for ind_temp_row, temp_row in df_temp.iterrows():
                doc_ind = int(temp_row['document'])
                if ind_temp_row > 0:
                    df_temp.loc[ind_temp_row, 'similarity_previous_document'] = matrix_documents_similarity[doc_ind, int(previous_document)]
                    df_temp.loc[ind_temp_row, 'document2'] = previous_document
                else:
                    # the first document of a topic points at itself
                    df_temp.loc[ind_temp_row, 'document2'] = temp_row['document']
                previous_document = temp_row['document']

                # Sentiment classification: the label with the highest score;
                # neutral wins ties, and positive wins a negative/positive tie.
                negative = df_semtiment['negative'].iloc[doc_ind]
                neutral = df_semtiment['neutral'].iloc[doc_ind]
                positive = df_semtiment['positive'].iloc[doc_ind]
                if (neutral >= negative) and (neutral >= positive):
                    sentimet_classification = 'neutral'
                    sentimet_score = neutral
                elif (negative > neutral) and (negative > positive):
                    sentimet_classification = 'negative'
                    sentimet_score = negative
                else:
                    sentimet_classification = 'positive'
                    sentimet_score = positive
                df_temp.loc[ind_temp_row, 'sentimet_classification'] = sentimet_classification
                df_temp.loc[ind_temp_row, 'sentimet_score'] = sentimet_score

                # number of entries in this document's similarity row that
                # reach the threshold
                doc_size = int(np.count_nonzero(matrix_documents_similarity[doc_ind] >= similarity_threshold))
                df_temp.loc[ind_temp_row, 'document1_size'] = doc_size
            df_temp.rename(columns={"document": "document1"}, inplace=True)
            graph_parts.append(df_temp)
            del df_temp
        df_graph = pd.concat(graph_parts, ignore_index=True)
        del graph_parts
        del df_topic_documents_hdbscan
        del df_topic_info
        del df_semtiment
        file_name = file_path + prefix + 'graph_data.csv'
        df_graph.to_csv(file_name, header=True, index=False, encoding='utf-8')
        del df_graph


In [None]:
# Scratch cell: display `df_graph` (alternative inspections are commented out).
# NOTE(review): the processing cell ends each processed period with
# `del df_graph`, so this presumably only works after an interrupted run - confirm.
#df_topic_info
#df_graph
#df_topic_documents_hdbscan
#df_graph[df_graph['document1_size'] > 2]
df_graph

In [None]:
# Scratch cell: display `df_graph`; the commented lines are earlier inspections
# of the topic table and of topic 0's documents sorted by probability.
#df_topic_info
#df_temp = df_topic_documents_hdbscan[df_topic_documents_hdbscan['topic']==0].sort_values(by=['probabilities'], ascending=False).reset_index(drop=True)
#df_temp
df_graph

In [None]:
# Load the document-similarity matrix of one period (prefix `2017_1_`) and
# show its shape.
file_path = '../raw_data/proj_final/per_year_month/'

file_name = file_path + '2017_1_BERTopic_DocumentsSimilarity.npy'
matrix_documents_similarity = np.load(file_name) 
matrix_documents_similarity.shape

In [None]:
# Show the type of the first row of the similarity matrix.
type(matrix_documents_similarity[0])

In [None]:
# Number of rows in the similarity matrix.
matrix_documents_similarity.shape[0]

In [None]:
# Disabled code kept as a bare string literal (it is not executed): it would
# print, for every row of the similarity matrix, how many entries are >= 0.6.
'''for i in range(0, matrix_documents_similarity.shape[0]):
    doc_size = len(matrix_documents_similarity[i][matrix_documents_similarity[i] >= 0.6])
    print(doc_size)
'''

## Prepare data with Python class

In [2]:
# Generate the graph dataset through the project's PrepareGraphDataset class
# (presumably the packaged version of the cells above - confirm in
# LeWagon_FinalProject.topic_prepare_graph).
from LeWagon_FinalProject.topic_prepare_graph import PrepareGraphDataset

prep = PrepareGraphDataset(file_path = '../raw_data/proj_final/Bolsonaro/')
# Two runs with the same similarity threshold: first without, then with the
# documents that have no topic.
# NOTE(review): whether the second run overwrites the first run's output
# cannot be told from here - confirm.
prep.generate_graph_dataset(keep_documents_no_topic=False, similarity_threshold=0.6)
prep.generate_graph_dataset(keep_documents_no_topic=True, similarity_threshold=0.6)

2021_2_
There are no topics for 2021_2_
2021_3_
There are no topics for 2021_3_
2021_5_
There are no topics for 2021_5_
2021_6_


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


2021_7_
2021_2_
There are no topics for 2021_2_
2021_3_
There are no topics for 2021_3_
2021_5_
There are no topics for 2021_5_
2021_6_
2021_7_


## Sentiment

In [20]:
# Load the sentiment scores of period `2015_1_` (one row per document with
# `negative`, `neutral` and `positive` columns) and display them.
df_semtiment = pd.read_csv('../raw_data/proj_final/10_docs_per_topic_preproced/2015_1_sentiment.csv')
df_semtiment

Unnamed: 0.1,Unnamed: 0,negative,neutral,positive
0,0,0.077699,0.850818,0.071483
1,1,0.070702,0.903399,0.025899
2,2,0.047796,0.877077,0.075127
3,3,0.042970,0.773606,0.183425
4,4,0.104974,0.851049,0.043977
...,...,...,...,...
280,280,0.070491,0.827416,0.102093
281,281,0.033435,0.556131,0.410434
282,282,0.198184,0.714169,0.087647
283,283,0.478211,0.488389,0.033399


In [21]:
# Raw sentiment value per document: neutral + positive - negative,
# plus its extremes for the min-max scaling in the next cell.
df_semtiment['temp'] = df_semtiment['neutral']+df_semtiment['positive']-df_semtiment['negative']
temp_max = df_semtiment['temp'].max()
temp_min = df_semtiment['temp'].min()

In [22]:
# Min-max scale the raw value into [0, 1] (0 = lowest, 1 = highest raw value).
# NOTE(review): if every document has the same raw value, temp_max == temp_min
# and the division yields NaN for the whole column.
df_semtiment['setiment_scale'] = (df_semtiment['temp'] - temp_min) / (temp_max - temp_min)
df_semtiment

Unnamed: 0.1,Unnamed: 0,negative,neutral,positive,temp,setiment_scale
0,0,0.077699,0.850818,0.071483,0.844601,0.915806
1,1,0.070702,0.903399,0.025899,0.858596,0.923966
2,2,0.047796,0.877077,0.075127,0.904408,0.950679
3,3,0.042970,0.773606,0.183425,0.914061,0.956307
4,4,0.104974,0.851049,0.043977,0.790052,0.883999
...,...,...,...,...,...,...
280,280,0.070491,0.827416,0.102093,0.859018,0.924213
281,281,0.033435,0.556131,0.410434,0.933129,0.967426
282,282,0.198184,0.714169,0.087647,0.603633,0.775300
283,283,0.478211,0.488389,0.033399,0.043577,0.448738


In [23]:
# The helper column is no longer needed once `setiment_scale` exists.
df_semtiment.drop(['temp'], axis=1, inplace=True)

In [24]:
# Display the sentiment table with the scaled column.
df_semtiment

Unnamed: 0.1,Unnamed: 0,negative,neutral,positive,setiment_scale
0,0,0.077699,0.850818,0.071483,0.915806
1,1,0.070702,0.903399,0.025899,0.923966
2,2,0.047796,0.877077,0.075127,0.950679
3,3,0.042970,0.773606,0.183425,0.956307
4,4,0.104974,0.851049,0.043977,0.883999
...,...,...,...,...,...
280,280,0.070491,0.827416,0.102093,0.924213
281,281,0.033435,0.556131,0.410434,0.967426
282,282,0.198184,0.714169,0.087647,0.775300
283,283,0.478211,0.488389,0.033399,0.448738
