## Imports

In [1]:
import pandas as pd
import numpy as np
from LeWagon_FinalProject.data import DataProcessor
from bertopic import BERTopic
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity

2021-07-20 19:00:52.755665: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


## Functions

In [None]:
def generate_docs(df_, number_of_docs):
    df_ = df_[['date', 'content']][0:number_of_docs].copy().reset_index(drop=True)
    df_.to_csv(f'../raw_data/BERTDocsContent_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_.copy()

In [None]:
def generate_topic_info(bert_model, number_of_docs):
    df_topic_info = bert_model.get_topic_info()

    df_topic_info.to_csv(f'../raw_data/BERTopicInfo_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topic_info.copy()

In [None]:
def generate_terms(bert_model, number_of_docs):
    topics = bert_model.get_topics()
    number_of_topics = len(topics)-1    

    topic_columns = ['topic', 'term', 'weight']

    df_topics = pd.DataFrame(columns=topic_columns)
    for i in range(-1,number_of_topics):
        num_of_terms = len(topics[i])
        for j in range(num_of_terms):
            new_topic = {}
            new_topic['topic'] = topic_model.topic_names[i]
            new_topic['term'] = topics[i][j][0]
            new_topic['weight'] = round(topics[i][j][1],6)
            df_topics = df_topics.append(new_topic, ignore_index=True)

    df_topics.to_csv(f'../raw_data/BERTopicTerms_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topics.copy()

In [None]:
def correlation_matrix_to_df(df_corr):
    list_done = []
    lits_item1 = []
    lits_item2 = []
    list_corr = []

    for k in range(1,df_corr.shape[1]):
        for i, j in df_corr.iterrows():
            #if (df_corr.columns[k] != j[0]) and (j[0] not in list_done):
            #if (j[0] not in list_done):
            lits_item1.append(df_corr.columns[k])
            lits_item2.append(j[0])
            list_corr.append(j[k])
        list_done.append(df_corr.columns[k])

    corr_dict = {'topic1': lits_item1,
                 'topic2': lits_item2,
                 'similarity': list_corr}
    df_res = pd.DataFrame(corr_dict)
    df_res = df_res.sort_values(by='similarity', ascending=False).copy()
    df_res.reset_index(inplace=True,drop=True)
    return df_res.copy()

In [None]:
def generate_topic_similarity(bert_model, number_of_docs):
    corr_matrix = bert_model.topic_sim_matrix

    topics = bert_model.get_topics()
    number_of_topics = len(topics)-1

    topic_columns = ['topic']
    for i in range(-1,number_of_topics):
        topic_columns.append(bert_model.topic_names[i])

    df_similarity = pd.DataFrame(columns=topic_columns)
    for i in range(-1,number_of_topics):
        new_topic = {}
        new_topic['topic'] = bert_model.topic_names[i]
        for j in range(-1,number_of_topics):
            new_topic[bert_model.topic_names[j]] = round(corr_matrix[i,j],6)
        df_similarity = df_similarity.append(new_topic, ignore_index=True)
        
    df_topic_similarity = correlation_matrix_to_df(df_similarity)
    df_topic_similarity.to_csv(f'../raw_data/BERTopicSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_topic_similarity.copy()

In [None]:
def get_topic_documents(cluster_id, condensed_tree):
    result_points = np.array([])
    result_points_val = np.array([])
    
    #assert cluster_id > -1, "The topic's label should be greater than -1!"
    
    if cluster_id <= -1:
        return result_points.astype(np.int64), result_points_val.astype(np.float64)
        
    raw_tree = condensed_tree._raw_tree
    
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]
    
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf) 
    for leaf in leaves:
        #max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
        #points = raw_tree['child'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        #points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf) & (raw_tree['lambda_val'] == max_lambda)]
        points = raw_tree['child'][(raw_tree['parent'] == leaf)]
        points_val = raw_tree['lambda_val'][(raw_tree['parent'] == leaf)]
        result_points = np.hstack((result_points, points))
        result_points_val = np.hstack((result_points_val, points_val))
        
    return result_points.astype(np.int64), result_points_val.astype(np.float64)

In [None]:
def generate_topic_documents(bert_model, number_of_docs):
    clusterer = bert_model.hdbscan_model
    tree = clusterer.condensed_tree_
    clusters = tree._select_clusters()

    number_of_topics = len(clusters)

    relevant_columns = ['topic', 'document', 'lambda_val']
    df_rel_docs = pd.DataFrame(columns=relevant_columns)
        
    if number_of_topics == len(bert_model.get_topics()):
        start_ind = -1
    else:
        start_ind = 0

    for i in range(0, number_of_topics):
        rel_docs, lambda_vals = get_topic_documents(clusters[i], tree)
        if len(rel_docs) > 0:
            if start_ind < 0:
                topic_name = bert_model.topic_names[i-1]
            else:
                topic_name = bert_model.topic_names[i]
                
            for j in range(0, len(rel_docs)):
                new_doc_rel = {}
                new_doc_rel['topic'] = topic_name
                new_doc_rel['document'] = rel_docs[j]
                new_doc_rel['lambda_val'] = round(lambda_vals[j],6)
                df_rel_docs = df_rel_docs.append(new_doc_rel, ignore_index=True)

    df_rel_docs.to_csv(f'../raw_data/BERTopicDocuments_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')
    return df_rel_docs.copy()

In [None]:
def get_cosine_similarity(feature_vec_1, feature_vec_2):
    return cosine_similarity(feature_vec_1.reshape(1, -1), feature_vec_2.reshape(1, -1))[0][0]

In [None]:
def generate_documents_similarity(bert_model, docs, number_of_docs):
    emb_model = bert_model.embedding_model
    
    # Create documents embeddings
    embeddings = emb_model.embedding_model.encode(docs)
    doc_sim_matrix = cosine_similarity(embeddings, embeddings)
    np.savetxt(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', doc_sim_matrix, delimiter=',')
    np.save(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.npy', doc_sim_matrix)
    return doc_sim_matrix
'''
    sim_columns = ['cosine_similarity', 'document1', 'document2']
    df_sim_docs = pd.DataFrame(columns=sim_columns)
    temp_columns = ['cosine_similarity']
    for i in range(0, len(docs)):
        docs_sim = df_documents_similarity[i]
        df_sim_docs_temp = pd.DataFrame(data = docs_sim, columns=temp_columns)
        df_sim_docs_temp['document1'] = i
        df_sim_docs_temp['document2'] = df_sim_docs_temp.index
        df_sim_docs = df_sim_docs.append(df_sim_docs_temp, ignore_index=True)
    
    #df_sim_docs.to_csv(f'../raw_data/BERTopicDocumentsSimilarity_{str(number_of_docs)}.csv', header=True, index=False, encoding='utf-8')     
    return df_sim_docs.copy()
'''

In [2]:
def get_topic_start_end_dates(mode_index, topic_index):
    df_topic = pd.read_csv(f'../raw_data/BERTopicInfo_{str(mode_index)}.csv')
    topic_name = df_topic[df_topic['Topic']==topic_index]['Name'].values[0]

    df_documents = pd.read_csv(f'../raw_data/BERTopicDocuments_{str(mode_index)}.csv')
    df_documents = df_documents[df_documents['topic']==topic_name]

    df_docscontent = pd.read_csv(f'../raw_data/BERTopicDocsContent_{str(mode_index)}.csv', parse_dates=True)
    df_docscontent = df_docscontent[df_docscontent.index.isin(df_documents['document'].values)]
    #start_date = df_docscontent['date'].min()
    #end_date = df_docscontent['date'].max()
    min_year = df_docscontent['year'].min()
    min_month = df_docscontent[df_docscontent['year']==min_year]['month'].min()
    
    max_year = df_docscontent['year'].max()
    max_month = df_docscontent[df_docscontent['year']==max_year]['month'].max()
    
    number_topic_docs = len(df_documents)
    #return start_date, end_date, number_topic_docs
    return min_year, min_month, max_year, max_month, topic_name, number_topic_docs

## Generate data

In [None]:
'''dp1 = DataProcessor(csv_path='../raw_data/', csv_name='articles1')
df1 = dp1.load_dataset()

dp2 = DataProcessor(csv_path='../raw_data/', csv_name='articles2')
df2 = dp2.load_dataset()

dp3 = DataProcessor(csv_path='../raw_data/', csv_name='articles3')
df3 = dp3.load_dataset()
'''

In [None]:
'''df_all = df1.copy()
df_all = df_all.append(df2, ignore_index=True)
df_all = df_all.append(df3, ignore_index=True)
df_all = df_all.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
'''

In [None]:
'''print(df_all.shape)
#df_all[(df_all['year'].isna())].dropna(inplace=True)
df_all.dropna(subset=['year', 'month'], inplace=True)
print(df_all.shape)
df_all
'''

In [None]:
'''df_all = df_all[['id', 'title', 'year', 'month', 'content']].copy()
df_all = df_all[df_all['year'] >= 2015].copy()
df_all
'''

In [None]:
#df_all.to_csv(f'../raw_data/dataset_work.csv', header=True, index=True, encoding='utf-8')

In [None]:
'''import requests
url = 'https://bucketapipython-guadc7haza-uc.a.run.app/data'

params = {'filename': 'dataset_work.csv', 'data': df_all.to_json()}

x = requests.post(url, params=params)
print(x.text)
'''

In [None]:
'''import requests
url = 'https://bucketapipython-guadc7haza-uc.a.run.app/data'
params = {'filename': 'dataset_work', 'extension': 'csv'}
x = requests.get(url, params=params)
x'''

In [11]:
number_of_docs = 3_000

dp = DataProcessor(csv_path='../raw_data/', csv_name='dataset_work')
df = dp.load_dataset()
#df = df.sort_values(by='date', ascending=True).reset_index(drop=True)
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
df.shape

(139514, 6)

In [None]:
%%time

can_execute = False

if can_execute:
    number_of_docs = 3000
    number_of_docs_back = 1500
    number_of_iterations = df.shape[0]//number_of_docs

    start_pos = 0
    end_pos = number_of_docs-1
    for i in range(0, number_of_iterations):
        if i == (number_of_iterations-1):
            if end_pos < (df.shape[0]-1):
                end_pos += (df.shape[0]-1) - end_pos

        if i > 0:
            df_docs = df[(start_pos-number_of_docs_back):end_pos].sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)            
        else:
            df_docs = df[start_pos:end_pos].sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)

        #if i > 12:    
        docs = df_docs['content'].values
        print('starting transform...')
        topic_model = BERTopic(min_topic_size=30, language='english', calculate_probabilities=False, n_gram_range=(3,3))
        topic_model.fit_transform(docs)
        #print(len(topics))
        #print(topics)
        #break
        topic_model.save('../raw_data/BERTopic_model_3_3_run_'+str(i))
        print('end transform...')

        df_docs.to_csv(f'../raw_data/BERTopicDocsContent_{str(i)}.csv', header=True, index=False, encoding='utf-8')

        df_topics_info = generate_topic_info(topic_model, i)

        df_terms = generate_terms(topic_model, i)

        df_topic_similarity = generate_topic_similarity(topic_model, i)

        df_topic_documents = generate_topic_documents(topic_model, i)

        matrix_documents_similarity = generate_documents_similarity(topic_model, docs, i)

        start_pos += number_of_docs
        end_pos += number_of_docs

        del docs
        del df_docs
        del df_topics_info
        del df_terms
        del df_topic_similarity
        del df_topic_documents
        del matrix_documents_similarity

        print(f'done {str(i)} of {str(number_of_iterations-1)}')

## Check term

In [None]:
%%time

number_of_iterations = 46

models = []
for i in range(0,number_of_iterations):
    topic_model = BERTopic.load('../raw_data/BERTopic_model_3_3_run_'+str(i))
    models.append(topic_model)
    print(f'{i} done')

In [8]:
%%time

number_of_iterations = 46
top_n_sim = 50
search_terms = ['trump', 'climate change', 'biden', 'hillary']

#term_similarity_columns = ['mode_index', 'bert_model', 'search_term', 'topic', 'topic_docs', 'topic_start_date', 'topic_end_date', 'similarity']
term_similarity_columns = ['mode_index', 'bert_model', 'search_term', 'topic_name', 'topic_docs', 'topic_start_year', 'topic_start_month', 'topic_end_year', 'topic_end_month', 'similarity']
df_term_similarity = pd.DataFrame(columns=term_similarity_columns)

model_ind = 0
#for topic_model in models:
for i in range(0,number_of_iterations):
    topic_model = BERTopic.load('../raw_data/BERTopic_model_3_3_run_'+str(i))
    topic_ind = -1010
    for search_term in search_terms:
        similar_topics, similarity = topic_model.find_topics(search_term, top_n=top_n_sim)
        if len(similar_topics) > 0:
            for i in range(0,len(similar_topics)):
                if similarity[i] < 0.7:
                    break
                if topic_ind != similar_topics[i]:
                    #start_date, end_date, number_topic_docs = get_topic_start_end_dates(model_ind, similar_topics[i])
                    min_year, min_month, max_year, max_month, topic_name, number_topic_docs = get_topic_start_end_dates(model_ind, similar_topics[i])
                    topic_ind = similar_topics[i]
                
                new_term_sim = {}
                new_term_sim['mode_index'] = model_ind
                new_term_sim['bert_model'] = 'BERTopic_model_3_3_run_'+str(model_ind)
                new_term_sim['search_term'] = search_term
                new_term_sim['topic_name'] = topic_name
                new_term_sim['topic_docs'] = number_topic_docs
                #new_term_sim['topic_start_date'] = start_date
                #new_term_sim['topic_end_date'] = end_date
                
                new_term_sim['topic_start_year'] = min_year
                new_term_sim['topic_start_month'] = min_month
                new_term_sim['topic_end_year'] = max_year
                new_term_sim['topic_end_month'] = max_month
                
                new_term_sim['similarity'] = round(similarity[i],6)
                df_term_similarity = df_term_similarity.append(new_term_sim, ignore_index=True)
    model_ind += 1
    del topic_model

#df_term_similarity = df_term_similarity.sort_values(by=['search_term', 'topic_start_date'], ascending=True).reset_index(drop=True)
df_term_similarity = df_term_similarity.sort_values(by=['search_term', 'topic_start_year', 'topic_start_month'], ascending=True).reset_index(drop=True)
df_term_similarity.to_csv(f'../raw_data/BERTopicTermModelSimilarity.csv', header=True, index=False, encoding='utf-8')
print(df_term_similarity.shape)

(41, 10)
CPU times: user 7min 59s, sys: 1min 32s, total: 9min 31s
Wall time: 8min 26s


In [4]:
df_term_similarity = df_term_similarity.sort_values(by=['search_term', 'topic_start_year', 'topic_start_month'], ascending=True).reset_index(drop=True)
df_term_similarity.to_csv(f'../raw_data/BERTopicTermModelSimilarity.csv', header=True, index=False, encoding='utf-8')
print(df_term_similarity.shape)

(41, 10)


In [10]:
df_term_similarity

Unnamed: 0,mode_index,bert_model,search_term,topic_name,topic_docs,topic_start_year,topic_start_month,topic_end_year,topic_end_month,similarity
0,8,BERTopic_model_3_3_run_8,climate change,15_sea level rise_the paris agreement_of clima...,116,2016.0,3.0,2016.0,4.0,0.777599
1,9,BERTopic_model_3_3_run_9,climate change,16_clean power plan_of climate change_on clima...,33,2016.0,4.0,2016.0,5.0,0.907843
2,10,BERTopic_model_3_3_run_10,climate change,19_the solar industry_the environmental protec...,33,2016.0,4.0,2016.0,5.0,0.717646
3,11,BERTopic_model_3_3_run_11,climate change,7_the university of_great barrier reef_of clim...,42,2016.0,5.0,2016.0,6.0,0.716946
4,24,BERTopic_model_3_3_run_24,climate change,21_the paris agreement_on climate change_the p...,94,2016.0,10.0,2016.0,11.0,0.863306
5,28,BERTopic_model_3_3_run_28,climate change,17_climate change is_environmental protection ...,67,2016.0,12.0,2016.0,12.0,0.858147
6,30,BERTopic_model_3_3_run_30,climate change,26_of climate change_climate change and_on cli...,84,2016.0,12.0,2017.0,1.0,0.87841
7,31,BERTopic_model_3_3_run_31,climate change,20_environmental protection agency_of climate ...,198,2017.0,1.0,2017.0,1.0,0.771006
8,32,BERTopic_model_3_3_run_32,climate change,19_on climate change_environmental protection ...,250,2017.0,1.0,2017.0,2.0,0.929588
9,39,BERTopic_model_3_3_run_39,climate change,12_daylight saving time_march for science_clea...,41,2017.0,3.0,2017.0,4.0,0.848293


In [None]:
# search_terms = ['trump', 'climate change', 'biden', 'hillary']
df_term_similarity[df_term_similarity['search_term']=='climate change']

In [None]:
parei aqui

## Validate n_gram

In [3]:
dp = DataProcessor(csv_path='../raw_data/', csv_name='dataset_work')
df = dp.load_dataset()
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)

In [None]:
df_ = df[df['year'] == 2015].copy()
print(df_.shape)
df_

In [None]:
n_grams = []

n_grams.append((1,1))
n_grams.append((1,2))
n_grams.append((1,3))
n_grams.append((1,4))
n_grams.append((1,5))
n_grams.append((1,6))
n_grams.append((1,7))
n_grams.append((1,8))
n_grams.append((1,9))

n_grams.append((2,2))
n_grams.append((2,3))
n_grams.append((2,4))
n_grams.append((2,5))
n_grams.append((2,6))
n_grams.append((2,7))
n_grams.append((2,8))
n_grams.append((2,9))

n_grams.append((3,3))
n_grams.append((3,4))
n_grams.append((3,5))
n_grams.append((3,6))
n_grams.append((3,7))
n_grams.append((3,8))
n_grams.append((3,9))

n_grams.append((4,4))
n_grams.append((4,5))
n_grams.append((4,6))
n_grams.append((4,7))
n_grams.append((4,8))
n_grams.append((4,9))

n_grams.append((5,5))
n_grams.append((5,6))
n_grams.append((5,7))
n_grams.append((5,8))
n_grams.append((5,9))

n_grams.append((6,6))
n_grams.append((6,7))
n_grams.append((6,8))
n_grams.append((6,9))

n_grams.append((7,7))
n_grams.append((7,8))
n_grams.append((7,9))

n_grams.append((8,8))
n_grams.append((8,9))

n_grams.append((9,9))

In [None]:
%%time

can_execute = False
if can_execute:    
    for n_gram in n_grams:
        n_gram_txt = f'{str(n_gram[0])}_{str(n_gram[1])}'

        df_docs = df_.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
        docs = df_docs['content'].values
        print('starting transform...')
        topic_model = BERTopic(min_topic_size=30, language='english', calculate_probabilities=False, n_gram_range=n_gram)
        topic_model.fit_transform(docs)

        topic_model.save('../raw_data/BERTopic_model_'+str(n_gram_txt))
        print('end transform...')

        df_docs.to_csv(f'../raw_data/BERTopicDocsContent_{n_gram_txt}.csv', header=True, index=False, encoding='utf-8')

        df_topics_info = generate_topic_info(topic_model, n_gram_txt)

        df_terms = generate_terms(topic_model, n_gram_txt)

        del topic_model
        del docs
        del df_docs
        del df_topics_info
        del df_terms

        print(f'done n_gram: {n_gram_txt}')

In [None]:
can_execute = False
if can_execute:
    icount = 0
    for n_gram in n_grams:         
        n_gram_txt = f'{str(n_gram[0])}_{str(n_gram[1])}'
        df_topic_info = pd.read_csv(f'../raw_data/BERTopicInfo_{n_gram_txt}.csv')
        df_topic_info = df_topic_info.head(1).copy()
        df_topic_info['n_gram'] = n_gram_txt
        if icount == 0:
            df_topic_nones = df_topic_info.copy()
        else:
            df_topic_nones = df_topic_nones.append(df_topic_info, ignore_index=True)

        del df_topic_info
        icount += 1

    df_topic_nones.to_csv('../raw_data/validate_n_gram/BERTopicResult_n_gram.csv', header=True, index=False, encoding='utf-8')
    df_topic_nones

In [4]:
df_topic_nones = pd.read_csv('../raw_data/validate_n_gram/BERTopicResult_n_gram.csv')
df_topic_nones.sort_values(by=['Count'], ascending=True).reset_index(drop=True)

Unnamed: 0,Topic,Count,Name,n_gram
0,-1,1403,-1_the united states_the white house_in the un...,3_3
1,-1,1409,-1_follow us on twitter cnnopinion join_follow...,6_9
2,-1,1419,-1_follow us on twitter cnnopinion_twitter cnn...,5_5
3,-1,1434,-1_the white house_us on facebook_join us on_j...,3_8
4,-1,1442,-1_follow us on twitter cnnopinion_us on twitt...,4_5
5,-1,1442,-1_on twitter cnnopinion join us on_us on twit...,5_9
6,-1,1443,-1_us on twitter cnnopinion join us_follow us ...,4_8
7,-1,1447,-1_follow us on twitter cnnopinion join_on twi...,6_6
8,-1,1447,-1_follow us on twitter cnnopinion_us on twitt...,5_6
9,-1,1458,-1_follow us on twitter cnnopinion join us_us ...,7_7


## Test

In [None]:
df_term_model_sim = pd.read_csv('../raw_data/BERTopicTermModelSimilarity.csv')
#df_topic_nones.sort_values(by=['Count'], ascending=True).reset_index(drop=True)
#df_term_model_sim

## Generate political dataset

In [18]:
political_words = ['hillary',
         'clinton',
         'trump',
         'donald'
         'barack',
         'obama',
         'election',
         'democracy',
         'democrats',
         'republicans',
         'president',
         'candidate',
         'elector',
         'vote',
         'voting',
         'voter',
         'elector',
         'government',
         'big government',
         'bipartisan',
         'bleeding heart',
         'bully pulpit',
         'campaign',
         'caucus',
         'checks and balances',
         'coattails',
         'convention',
         'dark horse',
         'white house',
         'delegate',
         'demagogue',
         'fence mending',
         'filibuster',
         'fishing expedition',
         'front burner',
         'gerrymander',
         'GOP',
         'grass roots',
         'ideology',
         'incumbent',
         'inside the beltway',
         'lame duck',
         'left-wing',
         'lobby',
         'machine politics',
         'mccarthyism',
         'muckraker',
         'nomination',
         'nominee',
         'photo-op',
         'platform',
         'political party',
         'political suicide',
         'political',
         'poll',
         'politicians',
         'pork barrel',
         'primary',
         'pundit',
         'reactionary',
         'red tape',
         'rubber chicken circuit',
         'silent majority',
         'slate',
         'smoke-filled room',
         'spin',
         'stump',
         'swing vote',
         'trial balloon',
         'whip',
         'whistle-stopping:',
         'witch hunt'
        ]

In [21]:
import nltk
from nltk.corpus import wordnet

political_words_new = []
for pol_word in political_words:
    political_words_new.append(pol_word.lower())
    
    for syn in wordnet.synsets(pol_word):
        for lm in syn.lemmas():
            political_words_new.append(lm.name().replace('_', ' ').lower())#adding into synonyms
            if lm.antonyms():
                political_words_new.append(lm.antonyms()[0].name().replace('_', ' ').lower()) #adding into antonyms    
    
political_words_new = set(political_words_new)

In [42]:
print(len(political_words))
print(len(political_words_new))
#political_words_new

71
246


In [24]:
dp = DataProcessor(csv_path='../raw_data/', csv_name='dataset_work')
df = dp.load_dataset()
df = df.sort_values(by=['year', 'month'], ascending=True).reset_index(drop=True)
df.shape
print(df.shape)

(139514, 6)


In [43]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,id,title,year,month,content
0,415,36361,2015: Sold Out South Carolina Tea Party Conven...,2015.0,1.0,"MYRTLE BEACH, South Carolina — The South Ca..."


In [44]:
contents = []
political_columns = ['Unnamed: 0', 'id', 'title', 'year', 'month', 'content']
df_political = pd.DataFrame(columns=political_columns)
for _, row in df.iterrows():
    row_content = row['content'].lower()
    count_words = 0
    for political_word in political_words_new:        
        if political_word in row_content:
            count_words += 1
            if count_words >= 5:
                contents.append(row_content)
                
                new_news = {}
                new_news['Unnamed: 0'] = row['Unnamed: 0']
                new_news['id'] = row['id']
                new_news['title'] = row['title']
                new_news['year'] = row['year']
                new_news['month'] = row['month']
                new_news['content'] = row['content']
                df_political = df_political.append(new_news, ignore_index=True)
                break
                
print(len(contents))
df_political.to_csv(f'../raw_data/political_dataset.csv', header=True, index=False, encoding='utf-8')

99778


In [45]:
df_political

Unnamed: 0.1,Unnamed: 0,id,title,year,month,content
0,415,36361,2015: Sold Out South Carolina Tea Party Conven...,2015.0,1.0,"MYRTLE BEACH, South Carolina — The South Ca..."
1,417,57593,Narendra Modi Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of India’s P...
2,418,59225,Little Richard Fast Facts,2015.0,1.0,(CNN) Here is a look at the life of ”Archit...
3,420,60219,"Cycling’s marathon man attempts 75,000 miles i...",2015.0,1.0,(CNN) While many people are recovering from a...
4,422,60223,Cops: Georgia police chief on leave after wife...,2015.0,1.0,(CNN) Magazines and websites regularly rank P...
...,...,...,...,...,...,...
99773,139924,208849,Trump administration to name Georgia health of...,2017.0,7.0,The Trump administration plans to appoint...
99774,139925,208850,Trump tweets a video with a very unfortunate F...,2017.0,7.0,Fox News is probably happy to have the preside...
99775,139926,208851,"U.S. hospital offers to admit Charlie Gard, th...",2017.0,7.0,A major New York hospital has offered...
99776,139927,208852,How the new ‘Spider-Man’ is really a John Hugh...,2017.0,7.0,THE WORDS “superhero fatigue” have been l...
