In [None]:
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers
! pip install sentence_transformers
! pip install -U sentence-transformers
# ! pip install numpy
! pip install torch
! pip install torchtext
! pip install torchmetrics
! pip install pytorch-lightning
! pip install time
! pip install ipykernel
! pip install spacy
! pip install "grpcio>=1.37.0,<2.0" "h5py>=3.6.0,<3.7" "numpy>=1.22.3,<1.23.0"

In [None]:
import json
import pandas as pd
import collections
import torch
import torch.nn as nn
import numpy as np
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import random
import re
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import math
from google.colab import drive
drive.mount('/content/drive')

In [None]:
class BERT_Data:
    '''
    Class that cleans and formats the bills and news datasets for the BERT
    model, tokenizes the data, and creates then saves the text embeddings.
    '''
    def __init__(
        self,
        random_seed = 5,
        bert_model = 'bert-base-uncased',
        date_range_begin = None,
        date_range_end = '2018-04-01',
        bills_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/115th.csv',
        clean_bills_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/115th_clean.csv',
        minimal_clean_bills_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/115th_clean_minimal.csv',
        foxnews_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/fox.csv',
        clean_foxnews_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/fox_clean.csv',
        minimal_clean_foxnews_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/fox_clean_minimal.csv',
        breitbart_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart.csv',
        clean_breitbart_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart_clean.csv',
        minimal_clean_breitbart_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart_clean_minimal.csv',
        cnn_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn.csv',
        clean_cnn_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn_clean.csv',
        minimal_clean_cnn_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn_clean_minimal.csv',
        nytimes_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nyt.csv',
        clean_nytimes_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nyt_clean.csv',
        minimal_clean_nytimes_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nyt_clean_minimal.csv',
        truncated_minimal_clean_nytimes_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nyt_clean_minimal_truncated.csv',
        wapo_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/washington_post_with_date.csv',
        clean_wapo_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/washington_post_with_date_clean.csv',
        minimal_clean_wapo_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/washington_post_with_date_clean_minimal.csv',
        truncated_minimal_clean_wapo_csvpath = '/content/drive/Shareddrives/PulseofPolicy_data/news_data/washington_post_with_date_clean_minimal_truncated.csv'
    ):
        '''
        Record the configuration, seed the random number generators, load the
        prepared bills table and load the tokenizer/model.

        random_seed: seed handed to random_seed_function.
        bert_model: checkpoint name passed to AutoTokenizer/AutoModel
            ('bert-base-uncased' or 'bert-large-uncased').
        date_range_begin, date_range_end: default bounds used by
            dates_clean_news when the caller passes none.
        *_csvpath: raw, cleaned and minimal CSV locations.  The five
            clean_<outlet>_csvpath arguments (fox, breitbart, cnn, nytimes,
            wapo) are accepted but not stored on the instance.

        Side effects: reads minimal_clean_bills_csvpath into
        self.df_bills_prepared and keeps a copy of its first five rows in
        self.practice_bill.
        '''
        # The device has to exist before seeding: random_seed_function reads it.
        self.random_seed = random_seed
        self.device = self.cuda_mps_cpu()
        self.random_seed_function()

        self.date_range_begin, self.date_range_end = (
            date_range_begin, date_range_end
        )

        # Bills: remember the three locations, then load the prepared table.
        self.bills_csvpath = bills_csvpath
        self.clean_bills_csvpath = clean_bills_csvpath
        self.minimal_clean_bills_csvpath = minimal_clean_bills_csvpath
        self.df_bills_prepared = pd.read_csv(minimal_clean_bills_csvpath)

        # News outlets: raw input and minimal cleaned output locations.
        news_paths = {
            'foxnews_csvpath': foxnews_csvpath,
            'minimal_clean_foxnews_csvpath': minimal_clean_foxnews_csvpath,
            'breitbart_csvpath': breitbart_csvpath,
            'minimal_clean_breitbart_csvpath': minimal_clean_breitbart_csvpath,
            'cnn_csvpath': cnn_csvpath,
            'minimal_clean_cnn_csvpath': minimal_clean_cnn_csvpath,
            'nytimes_csvpath': nytimes_csvpath,
            'minimal_clean_nytimes_csvpath': minimal_clean_nytimes_csvpath,
            'truncated_minimal_clean_nytimes_csvpath': truncated_minimal_clean_nytimes_csvpath,
            'wapo_csvpath': wapo_csvpath,
            'minimal_clean_wapo_csvpath': minimal_clean_wapo_csvpath,
            'truncated_minimal_clean_wapo_csvpath': truncated_minimal_clean_wapo_csvpath,
        }
        for attribute_name, csv_location in news_paths.items():
            setattr(self, attribute_name, csv_location)

        # Tokenizer and encoder come from the same checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
        self.bert_base = AutoModel.from_pretrained(bert_model)

        # Small sample used for quick experiments.
        self.practice_bill = self.df_bills_prepared.head(5).copy()


    def clean_bills(self, only_2017 = False, only_2018 = False,
                save = True # only_bills = False,
    ):
        '''
        Clean and format the bills dataset
        '''
        df = pd.read_csv(self.bills_csvpath)
        df.loc[:, ['new_index']] = df.index
        df.loc[:, 'cleaned_text'] = df.loc[:, 'raw_text'].apply(
                self.clean_bill_text, args=()
        )
        df.loc[:, ['date']] = pd.to_datetime(
                    df.loc[:, 'introduced_date'], format='%Y-%m-%d'
        )
        df.loc[:, ['house_passage_binary']] = df.loc[:, 'house_passage'].fillna(0, inplace=True)

        df.loc[:, ['house_passage_binary']] = np.where(
                df.loc[:, 'house_passage_binary'] != 0, 1, 0
        )

        if only_2017:
            df = df.loc[(df.loc[:, 'date'] >= '2017-01-01'
                    & df.loc[:, 'date'] < '2018-01-01'
            ), :]

        if only_2018:
            df = df.loc[(df.loc[:, 'date'] >= '2018-01-01'
                    & df.loc[:, 'date'] < '2019-01-01'
            ), :]

        df.loc[:, ['cleaned_text']] = df.loc[:, 'cleaned_text'].apply(
                self.clean_generalnews_text, args=()
        )
        df = df.loc[:, ['bill_id', 'new_index', 'cleaned_text', 'date', 'house_passage_binary', 'bill_type']]

        if save:
            df.to_csv(self.clean_bills_csvpath, index=False)
        # if only_bills:
        #     df = df.loc[(df.loc[:, 'bill_type'] == | df.loc[:, 'bill_type'] == ), :]

        return df


    def dates_clean_news(self, df_whole, Date_version, long_date_version,
            start_date = None, end_date = None,
            minimal_columns = False
            # minimal_columns = ['index', 'date', 'cleaned_text']
    ):
        '''
        Format the date of the news articles to match the date of the bills
        '''
        if start_date is None:
            start_date = self.date_range_begin
        if end_date is None:
            end_date = self.date_range_end

        df = df_whole.copy()
        if Date_version:
            df.loc[:, ['date']] = pd.to_datetime(
                    df.loc[:, 'Date'], format='%Y-%m-%d'
            )
        elif long_date_version:
            df.loc[:, ['date']] = pd.to_datetime(
                    df.loc[:, 'date'].str[:10], format='%Y-%m-%d'
            )
        else:
            df.loc[:, ['date']] = pd.to_datetime(
                    df.loc[:, 'date'], format='%Y-%m-%d'
            )

        if start_date is not None:
            df = df.loc[df.loc[:, 'date'] >= start_date, :]
        if end_date is not None:
            df = df.loc[df.loc[:, 'date'] <= end_date, :]

        if minimal_columns is not False:
            df = df.loc[:, minimal_columns]

        return df


    def clean_foxnews(self, save = True):
        '''
        Clean and format the Fox News data
        '''
        df_fox = pd.read_csv(self.foxnews_csvpath)
        df_fox.loc[:, ['new_index']] = df_fox.index
        df_fox_dated = self.dates_clean_news(df_fox, Date_version = True,
                long_date_version = False
        )
        df_fox_dated.loc[:, 'cleaned_text'] = df_fox_dated.loc[:, 'article_text'].apply(
                self.clean_foxnews_text, args=()
        )
        df_fox_dated = df_fox_dated.loc[:, ['uuid', 'new_index', 'cleaned_text', 'date']]

        if save:
            df_fox_dated.to_csv(self.minimal_clean_foxnews_csvpath, index=False)

        return df_fox_dated


    def clean_breitbart(self, save = True):
        '''
        Clean and format the Breitbart data
        '''
        df_breitbart = pd.read_csv(self.breitbart_csvpath)
        df_breitbart.loc[:, ['new_index']] = df_breitbart.index
        df_breitbart_dated = self.dates_clean_news(df_breitbart,
                Date_version = True, long_date_version = False
        )
        df_breitbart_dated.loc[:, 'cleaned_text'] = df_breitbart_dated.loc[:, 'article_text'].apply(
                self.clean_generalnews_text, args=()
        )
        df_breitbart_dated = df_breitbart_dated.loc[:, ['uuid', 'new_index', 'cleaned_text', 'date']]
        # df_breitbart_dated.replace('', np.nan, inplace=True)
        # df_breitbart_dated.dropna(subset= ['cleaned_text'], inplace=True)

        if save:
            df_breitbart_dated.to_csv(self.minimal_clean_breitbart_csvpath,
                    index=False
            )

        return df_breitbart_dated


    def clean_cnn(self, save = True):
        '''
        Clean and format the CNN data
        '''
        df_cnn = pd.read_csv(self.cnn_csvpath)
        df_cnn.loc[:, ['new_index']] = df_cnn.index
        df_cnn_dated = self.dates_clean_news(df_cnn, Date_version = False,
                long_date_version = True
        )
        df_cnn_dated.loc[:, 'cleaned_text'] = df_cnn_dated.loc[:, 'text'].apply(
                self.clean_generalnews_text, args=()
        )
        df_cnn_dated = df_cnn_dated.loc[:, ['title', 'new_index', 'cleaned_text', 'date']]

        if save:
            df_cnn_dated.to_csv(self.minimal_clean_cnn_csvpath, index=False)

        return df_cnn_dated


    def clean_nytimes(self, save = True):
        '''
        Clean and format the New York Times data
        '''
        df_nytimes = pd.read_csv(self.nytimes_csvpath)
        df_nytimes.loc[:, ['new_index']] = df_nytimes.index
        df_nytimes_dated = self.dates_clean_news(df_nytimes,
                Date_version = False, long_date_version = True
        )
        df_nytimes_dated.loc[:, 'cleaned_text'] = df_nytimes_dated.loc[:, 'text'].apply(
                self.clean_generalnews_text, args=()
        )
        df_nytimes_dated = df_nytimes_dated.loc[:, ['title', 'new_index', 'cleaned_text', 'date']]

        if save:
            df_nytimes_dated.to_csv(self.minimal_clean_nytimes_csvpath,
                    index=False
            )

        return df_nytimes_dated


    def clean_wapo(self, save = True):
        '''
        Clean and format the Washington Post data
        '''
        df_wapo = pd.read_csv(self.wapo_csvpath)
        df_wapo.loc[:, ['new_index']] = df_wapo.index
        df_wapo_dated = self.dates_clean_news(df_wapo, Date_version = False,
                long_date_version = False
        )
        df_wapo_dated.loc[:, 'cleaned_text'] = df_wapo_dated.loc[:, 'text'].apply(
                self.clean_generalnews_text, args=()
        )
        df_wapo_dated = df_wapo_dated.loc[:, ['title', 'new_index', 'cleaned_text', 'date']]

        if save:
            df_wapo_dated.to_csv(self.minimal_clean_wapo_csvpath, index=False)

        return df_wapo_dated


    def clean_bill_text(self, text, glove=None, cbows=None, lemma=None):
        '''
        Intakes the text of a single bill and removes formating unique to the
        bill text, eliminates html tags, separates sentences,
        lower cases, and correctly formats the text
        '''
        text = text.lower()
        text = re.compile('<.*?>').sub('', text)
        text = re.compile(r"[_]").sub(" ", text)
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"\d+\sU\.S\.C\.\s\d+[a-z]?(\(\d+\))?( \([a-z]+\))?", "",
                text
        )
        text = re.sub(r'(sec\.\s+\d+\.?)|(section\s+\d+\.)', '', text)
        text = re.sub(r'\(\d+ u\.s\.c\. \d+\([a-z]\)(\(\d+\))*\)', '', text)
        text = re.sub(r'\(\d+\s+u\.s\.c\.\s+\d+\)', '', text)
        text = re.sub(r'\(\d+ u\.s\.c\. \d+\)', '', text)
        text = re.sub(r'\(\d+\)', '', text)
        text = re.sub(r'\([ivxlcdm]+\)', '', text)
        text = re.sub(r'\(\w\)', '', text)
        text = re.sub(r'\[\d+\]', '', text)
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'([a-z])\.--([a-z])', r'\1. -- \2', text)
        text = text.replace('``', '"').replace("''", '"')
        text = text.strip()

        return text


    def clean_foxnews_text(self, text):
        '''
        Intakes a single article and removes formating unique to the Fox News
        text, lower cases, and correctly formating the text
        '''
        text = text.replace("Get the latest updates from the 2024 campaign trail, exclusive interviews and more Fox News politics content.SubscribedYou've successfully subscribed to this newsletter!", " ")
        text = text.lower()
        text = re.sub(r"([.!?;:\"“”])(?=[^\s])|”(?=\w)", r"\1 ", text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return text


    def clean_generalnews_text(self, text):
        '''
        Intakes a single article and removes formatting, lower cases, separates
        connected end of sentences, and correctly formating the text
        '''
        text = re.sub(r"http\S+", " ", text)
        text = re.sub(r"@\S+", " ", text)
        text = re.sub(r'([a-z])\.”([A-Z])', r'\1. ” \2', text)
        text = text.lower()
        text = re.sub(r"([.!?;:\"“”])(?=[^\s])|”(?=\w)", r"\1 ", text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        return text


    def long_texts_dim_reduction(self, embedding_tensor,
                dim_reduction_strategy
    ): # Pooling, max, (potential: PCA)
        '''
        If tokenized text is untruncated and longer than 510 tokens, adds a
        secondary measure to reduce the dimensionality of the embedding
        tensor
        '''
        if dim_reduction_strategy == 'mean':
            embedding_tensor = embedding_tensor.mean(dim=0)
        elif dim_reduction_strategy == 'max':
            embedding_tensor = embedding_tensor.max(dim=0).values

        print(embedding_tensor.shape)
        return embedding_tensor


    def bert_embed_single_row(self, text, index = None, pooling = 'mean',
            max_len = 510, attention_mask_onpadding = True,
            long_dim_reduction_strategy = 'max',
            skip_long_texts = False, truncate_text = False,
    ): #512 tokens BERT # max, mean, CLS token embedding
        '''
        Intakes an entire bill or article, tokenizens the text, embeds the text
        using the BERT model, applies pooling, and returns the embedding tensor.
        Text may be truncated to max_len tokens.
        '''
        print(f'Bill/News index: {index}')
        if truncate_text:
            text_tokenized = self.tokenizer(text, return_tensors='pt',
                add_special_tokens=False, truncation=True, max_length = max_len
            )
        else:
            text_tokenized = self.tokenizer(text, return_tensors='pt',
                add_special_tokens=False, truncation=False
            )
        text_len = len(text_tokenized.input_ids[0])
        if skip_long_texts:
            if text_len > max_len:
                return None
        # print(text_tokenized.input_ids[0])
        text_segment_count = math.ceil(text_len / max_len)
        embedding_tensor = torch.tensor([], dtype = torch.float32)
        for segment in range(text_segment_count):
            start_token = segment * max_len
            end_token = min(start_token + max_len, text_len)
            # print(f'Segment {segment}, start_token {start_token}, end_token {end_token}')
            # print(self.tokenizer.cls_token_id, self.tokenizer.sep_token_id)
            segment_ids_tensor = text_tokenized.input_ids[0, start_token:end_token]
            padded_token_count = (max_len) - (end_token - start_token)
            if padded_token_count == 0:
                segment_ids_tensor = torch.cat([
                        torch.tensor([self.tokenizer.cls_token_id]),
                        segment_ids_tensor,
                        torch.tensor([self.tokenizer.sep_token_id])
                ]).unsqueeze(0)
            else:
                padding_tokens_tensor = torch.tensor(
                        [self.tokenizer.pad_token_id] * padded_token_count
                )
                segment_ids_tensor = torch.cat([
                        torch.tensor([self.tokenizer.cls_token_id]),
                        segment_ids_tensor,
                        padding_tokens_tensor,
                        torch.tensor([self.tokenizer.sep_token_id])
                ]).unsqueeze(0)
            if attention_mask_onpadding:
                attention_mask = (segment_ids_tensor !=
                        self.tokenizer.pad_token_id).to(dtype=torch.int64
                )
                with torch.no_grad():
                    segment_embeddings_tensor = self.bert_base(
                            input_ids = segment_ids_tensor,
                            attention_mask=attention_mask
                    ).last_hidden_state
            else:
                with torch.no_grad():
                    segment_embeddings_tensor = self.bert_base(
                            segment_ids_tensor
                    ).last_hidden_state

            if segment == 0:
                total_embeddings_tensor = segment_embeddings_tensor
            else:
                # print('total_embeddings_tensor: ', total_embeddings_tensor.shape)
                # print('segment_embeddings_tensor: ', segment_embeddings_tensor.shape)
                total_embeddings_tensor = torch.cat([total_embeddings_tensor,
                        segment_embeddings_tensor
                ])

        if pooling == 'mean':
            output_embedding_tensor = total_embeddings_tensor.mean(dim=1)
        elif pooling == 'max':
            output_embedding_tensor = total_embeddings_tensor.max(dim=1).values
        else: # None or CLS implies CLS
            output_embedding_tensor = total_embeddings_tensor[:, 0, :]

        if not skip_long_texts:
            if not truncate_text:
                output_embedding_tensor = self.long_texts_dim_reduction(
                        output_embedding_tensor, long_dim_reduction_strategy
                )

        print('output_embedding_tensor: ', output_embedding_tensor.shape)
        return output_embedding_tensor


    def total_bert_embeddings(self, df, text_column, pooling = 'mean',
            max_len = 510, attention_mask_onpadding = True,
            long_dim_reduction_strategy = 'max',
            skip_long_texts = False, truncate_text = False
    ):
        '''
        Intakes a dataframe, facilitates the tokenization and embedding process
        using the BERT model for each row in the dataframe.  Returns the
        total embedding tensor.  Text may be truncated to max_len tokens.
        '''
        embeddings_lst = []

        for index, row in df.iterrows():
            ## Used in testing
            # if index == 10:
            #     break
            embedded_row = self.bert_embed_single_row(row.loc[text_column],
                            index, pooling, max_len, attention_mask_onpadding,
                            long_dim_reduction_strategy, skip_long_texts,
                            truncate_text
            )
            # If skip_long_texts is True and text after tokenization longer than 512 than text skipped
            if embedded_row is not None:
                embeddings_lst.append(embedded_row)

        embeddings_tensor = torch.stack(embeddings_lst)

        return embeddings_tensor


    def cuda_mps_cpu(self):
        '''
        States the computational device in use
        '''
        if torch.cuda.is_available():  # use GPU if available
            print('Using GPU')
            return torch.device('cuda')
        # https://www.linkedin.com/pulse/how-use-gpu-tensorflow-pytorch-libraries-macbook-pro-m2apple-kashyap/
        elif torch.backends.mps.is_available():
            print('Using MPS')
            return torch.device('mps')
        else:
            print('Using CPU')
            return torch.device('cpu')


    def random_seed_function(self):
        '''
        Sets the random seed for replicability
        '''
        torch.manual_seed(self.random_seed)

        if self.device == 'cuda':
            torch.cuda.manual_seed_all(self.random_seed)

        random.seed(self.random_seed)

        return None

In [None]:
# Load the raw 115th-Congress bills JSON from Drive into memory.
with open(f'/content/drive/MyDrive/30255_data/input_data/115th.json', 'r') as file:
    data_base_2 = json.load(file)

In [None]:
# Turn the loaded JSON object into a dataframe.
df = pd.DataFrame(data_base_2)

In [None]:
# Write the bills dataframe to CSV without the index column.
# NOTE(review): BERT_Data's default bills_csvpath points at a different
# folder and file name — confirm how this export reaches that location.
df.to_csv('/content/drive/MyDrive/30255_data/input_data/csv_115th.csv',
        index=False
)

In [None]:
# Build the helper with its default paths. Construction picks the device,
# seeds the RNGs, reads the minimal cleaned bills CSV and loads the
# tokenizer and model.
data_class = BERT_Data()


Using GPU


In [None]:
# Clean every dataset. Each call uses save=True by default, so it also writes
# its cleaned CSV to the path configured on data_class.
# NOTE(review): clean_bills writes clean_bills_csvpath, while the embedding
# cells read minimal_clean_bills_csvpath and the truncated NYT/WaPo CSVs;
# none of those files is written by this cell — confirm where they come from.
df_clean_improved = data_class.clean_bills()
df_foxnews_clean = data_class.clean_foxnews()
df_breitbart_clean = data_class.clean_breitbart()
df_cnn_clean = data_class.clean_cnn()
df_nytimes_clean = data_class.clean_nytimes()
df_wapo_clean = data_class.clean_wapo()

In [None]:
######################################################
#   Conservative EMBEDDINGs Mean then Mean Pooling   #
######################################################
# Fox News: mean pooling inside each 510-token segment, then the mean of the
# segment vectors.
df_foxnews_clean = pd.read_csv(data_class.minimal_clean_foxnews_csvpath)
foxnews_embeddings_tensor = data_class.total_bert_embeddings(
    df_foxnews_clean,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/foxnews_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/foxnews_embeddings_mean_mean.pt',
):
    try:
        torch.save(foxnews_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# Breitbart: mean pooling inside each 510-token segment, then the mean of the
# segment vectors.
df_breitbart_clean = pd.read_csv(data_class.minimal_clean_breitbart_csvpath)
breitbart_embeddings_tensor = data_class.total_bert_embeddings(
    df_breitbart_clean,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/breitbart_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart_embeddings_mean_mean.pt',
):
    try:
        torch.save(breitbart_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
#############################################
#   Left EMBEDDINGS Mean then Mean Pooling  #
#############################################
# CNN: mean pooling inside each 510-token segment, then the mean of the
# segment vectors.
cnn_data = pd.read_csv(data_class.minimal_clean_cnn_csvpath)
cnn_embeddings_tensor = data_class.total_bert_embeddings(
    cnn_data,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/cnn_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn_embeddings_mean_mean.pt',
):
    try:
        torch.save(cnn_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# New York Times (truncated CSV): mean pooling inside each 510-token segment,
# then the mean of the segment vectors.
# NOTE(review): the input is the truncated CSV, but the output file names do
# not carry the "truncated_" prefix used by the other truncated outputs —
# confirm the intended name.
nytimes_data = pd.read_csv(data_class.truncated_minimal_clean_nytimes_csvpath)
nytimes_embeddings_tensor = data_class.total_bert_embeddings(
    nytimes_data,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/nytimes_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nytimes_embeddings_mean_mean.pt',
):
    try:
        torch.save(nytimes_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# Washington Post (truncated CSV): mean pooling inside each 510-token
# segment, then the mean of the segment vectors.
wapo_data = pd.read_csv(data_class.truncated_minimal_clean_wapo_csvpath)
wapo_embeddings_tensor = data_class.total_bert_embeddings(
    wapo_data,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/truncated_wapo_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/truncated_wapo_embeddings_mean_mean.pt',
):
    try:
        torch.save(wapo_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
##############################################
#   Bill EMBEDDINGS Mean then Mean Pooling   #
##############################################
# Bills: mean pooling inside each 510-token segment, then the mean of the
# segment vectors.
df_bills_prepared = pd.read_csv(data_class.minimal_clean_bills_csvpath)
bills_embeddings_tensor = data_class.total_bert_embeddings(
    df_bills_prepared,
    'cleaned_text',
    pooling='mean',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy='mean',
    skip_long_texts=False,
    truncate_text=False,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_embeddings/115th_embeddings_mean_mean.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/115th_embeddings_mean_mean.pt',
):
    try:
        torch.save(bills_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
#####################################################
#   Conservative EMBEDDINGs Mean then Max Pooling   #
#####################################################
# Fox News with the method defaults: pooling='mean' inside each segment and
# long_dim_reduction_strategy='max' across segments.
df_foxnews_clean = pd.read_csv(data_class.minimal_clean_foxnews_csvpath)
foxnews_embeddings_tensor = data_class.total_bert_embeddings(
    df_foxnews_clean,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/foxnews_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/foxnews_embeddings_mean_max.pt',
):
    try:
        torch.save(foxnews_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# Breitbart with the method defaults: pooling='mean' inside each segment and
# long_dim_reduction_strategy='max' across segments.
df_breitbart_clean = pd.read_csv(data_class.minimal_clean_breitbart_csvpath)
breitbart_embeddings_tensor = data_class.total_bert_embeddings(
    df_breitbart_clean,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/breitbart_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart_embeddings_mean_max.pt',
):
    try:
        torch.save(breitbart_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
############################################
#   Left EMBEDDINGS Mean then Max Pooling  #
############################################
# CNN with the method defaults: pooling='mean' inside each segment and
# long_dim_reduction_strategy='max' across segments.
cnn_data = pd.read_csv(data_class.minimal_clean_cnn_csvpath)
cnn_embeddings_tensor = data_class.total_bert_embeddings(
    cnn_data,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/cnn_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn_embeddings_mean_max.pt',
):
    try:
        torch.save(cnn_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# New York Times (truncated CSV) with the method defaults: pooling='mean'
# inside each segment and long_dim_reduction_strategy='max' across segments.
nytimes_data = pd.read_csv(data_class.truncated_minimal_clean_nytimes_csvpath)
nytimes_embeddings_tensor = data_class.total_bert_embeddings(
    nytimes_data,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/truncated_nytimes_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/truncated_nytimes_embeddings_mean_max.pt',
):
    try:
        torch.save(nytimes_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# Washington Post (truncated CSV) with the method defaults: pooling='mean'
# inside each segment and long_dim_reduction_strategy='max' across segments.
wapo_data = pd.read_csv(data_class.truncated_minimal_clean_wapo_csvpath)
wapo_embeddings_tensor = data_class.total_bert_embeddings(
    wapo_data,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/truncated_wapo_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/truncated_wapo_embeddings_mean_max.pt',
):
    try:
        torch.save(wapo_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
#############################################
#   Bill EMBEDDINGS Mean then Max Pooling   #
#############################################
# Bills with the method defaults: pooling='mean' inside each segment and
# long_dim_reduction_strategy='max' across segments.
df_bills_prepared = pd.read_csv(data_class.minimal_clean_bills_csvpath)
bills_embeddings_tensor = data_class.total_bert_embeddings(
    df_bills_prepared,
    'cleaned_text',
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_embeddings/115th_embeddings_mean_max.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/115th_embeddings_mean_max.pt',
):
    try:
        torch.save(bills_embeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
######################################################
#   Practice EMBEDDINGs Truncated CLS Only Segment   #
######################################################
# Dry run on the five-row practice sample: truncate each text to 510 tokens
# and keep the [CLS] vector. The result is not saved.
# The bare expression below is not the last statement of the cell, so it
# displays nothing.
data_class.practice_bill
practice_bill_clsembeddings_tensor = data_class.total_bert_embeddings(
        data_class.practice_bill,
        'cleaned_text', pooling = 'CLS', max_len = 510,
        attention_mask_onpadding = True, long_dim_reduction_strategy = None,
        skip_long_texts = False, truncate_text = True
)


In [None]:
##########################################################
#   Conservative EMBEDDINGs Truncated CLS Only Segment   #
##########################################################
# Fox News: truncate each article to 510 tokens and keep the [CLS] vector of
# that single segment.
df_foxnews_clean = pd.read_csv(data_class.minimal_clean_foxnews_csvpath)
foxnews_clsembeddings_tensor = data_class.total_bert_embeddings(
    df_foxnews_clean,
    'cleaned_text',
    pooling='CLS',
    max_len=510,
    attention_mask_onpadding=True,
    long_dim_reduction_strategy=None,
    skip_long_texts=False,
    truncate_text=True,
)

# Save to both Drive folders; a failed write is printed and the next
# destination is still attempted.
for save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/foxnews_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/foxnews_embeddings_cls.pt',
):
    try:
        torch.save(foxnews_clsembeddings_tensor, save_path)
    except Exception as e:
        print(e)

In [None]:
# Breitbart: read the minimally cleaned CSV and embed 'cleaned_text' with
# pooling = 'CLS', requesting truncation (truncate_text = True, max_len = 510)
# rather than skipping long texts.
df_breitbart_clean = pd.read_csv(data_class.minimal_clean_breitbart_csvpath)
breitbart_clsembeddings_tensor = data_class.total_bert_embeddings(
    df_breitbart_clean,
    'cleaned_text',
    pooling = 'CLS',
    max_len = 510,
    attention_mask_onpadding = True,
    long_dim_reduction_strategy = None,
    skip_long_texts = False,
    truncate_text = True,
)

# Best-effort save to both Drive folders; each failure is printed and the
# remaining destination is still attempted.
for _embeddings_save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/breitbart_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/breitbart_embeddings_cls.pt',
):
    try:
        torch.save(breitbart_clsembeddings_tensor, _embeddings_save_path)
    except Exception as e:
        print(e)

In [None]:
##################################################
#   Left EMBEDDINGs Truncated CLS Only Segment   #
##################################################
# CNN: read the minimally cleaned CSV and embed 'cleaned_text' with
# pooling = 'CLS', requesting truncation (truncate_text = True, max_len = 510)
# rather than skipping long texts.
cnn_data = pd.read_csv(data_class.minimal_clean_cnn_csvpath)
cnn_clsembeddings_tensor = data_class.total_bert_embeddings(
    cnn_data,
    'cleaned_text',
    pooling = 'CLS',
    max_len = 510,
    attention_mask_onpadding = True,
    long_dim_reduction_strategy = None,
    skip_long_texts = False,
    truncate_text = True,
)

# Best-effort save to both Drive folders; each failure is printed and the
# remaining destination is still attempted.
for _embeddings_save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/cnn_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/cnn_embeddings_cls.pt',
):
    try:
        torch.save(cnn_clsembeddings_tensor, _embeddings_save_path)
    except Exception as e:
        print(e)

In [None]:
# NYTimes: read the truncated minimally cleaned CSV and embed 'cleaned_text'
# with pooling = 'CLS', requesting truncation (truncate_text = True,
# max_len = 510) rather than skipping long texts.
nytimes_data = pd.read_csv(data_class.truncated_minimal_clean_nytimes_csvpath)
nytimes_clsembeddings_tensor = data_class.total_bert_embeddings(
    nytimes_data,
    'cleaned_text',
    pooling = 'CLS',
    max_len = 510,
    attention_mask_onpadding = True,
    long_dim_reduction_strategy = None,
    skip_long_texts = False,
    truncate_text = True,
)

# Best-effort save to both Drive folders; each failure is printed and the
# remaining destination is still attempted.
for _embeddings_save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/nytimes_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/nytimes_embeddings_cls.pt',
):
    try:
        torch.save(nytimes_clsembeddings_tensor, _embeddings_save_path)
    except Exception as e:
        print(e)

In [None]:
# Washington Post: read the truncated minimally cleaned CSV and embed
# 'cleaned_text' with pooling = 'CLS', requesting truncation
# (truncate_text = True, max_len = 510) rather than skipping long texts.
wapo_data = pd.read_csv(data_class.truncated_minimal_clean_wapo_csvpath)
wapo_clsembeddings_tensor = data_class.total_bert_embeddings(
    wapo_data,
    'cleaned_text',
    pooling = 'CLS',
    max_len = 510,
    attention_mask_onpadding = True,
    long_dim_reduction_strategy = None,
    skip_long_texts = False,
    truncate_text = True,
)

# Best-effort save to both Drive folders; each failure is printed and the
# remaining destination is still attempted.
for _embeddings_save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/news_embeddings/wapo_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/news_data/wapo_embeddings_cls.pt',
):
    try:
        torch.save(wapo_clsembeddings_tensor, _embeddings_save_path)
    except Exception as e:
        print(e)

In [None]:
##################################################
#   Bill EMBEDDINGs Truncated CLS Only Segment   #
##################################################
# Bills: read the minimally cleaned CSV and embed 'cleaned_text' with
# pooling = 'CLS', requesting truncation (truncate_text = True, max_len = 510)
# rather than skipping long texts.
bill_data = pd.read_csv(data_class.minimal_clean_bills_csvpath)
bill_clsembeddings_tensor = data_class.total_bert_embeddings(
    bill_data,
    'cleaned_text',
    pooling = 'CLS',
    max_len = 510,
    attention_mask_onpadding = True,
    long_dim_reduction_strategy = None,
    skip_long_texts = False,
    truncate_text = True,
)

# Best-effort save to both Drive folders; each failure is printed and the
# remaining destination is still attempted.
for _embeddings_save_path in (
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_embeddings/bill_embeddings_cls.pt',
    '/content/drive/Shareddrives/PulseofPolicy_data/bills_data/bill_embeddings_cls.pt',
):
    try:
        torch.save(bill_clsembeddings_tensor, _embeddings_save_path)
    except Exception as e:
        print(e)


In [None]:
# Run the single-row embedding method on the 'cleaned_text' entry labelled 0
# of the practice bill frame.
# NOTE(review): `a` is not defined in the nearby cells, which all use
# `data_class` -- confirm this name still exists after Restart & Run All.
a.bert_embed_single_row(
    a.practice_bill.loc[:, 'cleaned_text'][0]
)