# Summarize

## Import Libraries

In [15]:
import boto3
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import importlib.util

import pandas as pd

import configparser
import sys
import os

## Functions

### Create an AWS session

In [None]:
def create_aws_session(profile_name='py_admin'):
    """
    Build a boto3 session for a named AWS profile.
    profile_name (str): name of the profile handed to boto3.Session

    Returns the session, or None when creating it raised an exception
    (the error is printed instead of being propagated).
    """
    aws_session = None
    try:
        aws_session = boto3.Session(profile_name=profile_name)
    except Exception as err:
        # Best effort: report the problem and let the caller deal with None.
        print(f"Error creating session for profile {profile_name}: {err}")
    return aws_session

# Open the notebook-wide session with the py_admin profile
session = create_aws_session(profile_name='py_admin')

### Import custom functions

In [18]:
# Initialize S3 client
#s3 = boto3.client('s3')
# create_aws_session() returns None when the session could not be created;
# fail here with a clear message instead of an AttributeError on `None.client`.
if session is None:
    raise RuntimeError("AWS session is not available (create_aws_session() returned None)")
s3 = session.client('s3')


def import_module_from_s3(bucket, key, module_name, s3_client=None):
    """
    Downloads a Python source file from S3, executes it as a new module and
    registers that module so it can be imported by name.
    bucket (str): S3 bucket that holds the source file
    key (str): object key of the source file inside the bucket
    module_name (str): name under which the module is stored in sys.modules
    s3_client (optional): object exposing get_object(Bucket=..., Key=...);
        when omitted, the notebook-level `s3` client is used

    Returns the newly created module object.

    SECURITY: the downloaded source runs with the full privileges of this
    kernel. Only use it with buckets and keys you control.
    """
    client = s3 if s3_client is None else s3_client

    obj = client.get_object(Bucket=bucket, Key=key)
    source = obj['Body'].read()

    # Compile with the S3 location as file name so that tracebacks raised by
    # the loaded code name the object they came from instead of "<string>".
    compiled = compile(source, f"s3://{bucket}/{key}", "exec")

    # Create an empty module and run the downloaded code inside its namespace
    spec = importlib.util.spec_from_loader(module_name, loader=None)
    module = importlib.util.module_from_spec(spec)
    exec(compiled, module.__dict__)

    # Register the module so it can be imported; this happens only after the
    # code ran without raising, so a broken module is never registered.
    sys.modules[module_name] = module

    return module


# get common methods location
# The CM_BUCKET / CM_KEY environment variables override the defaults below, so
# the notebook can be pointed at another location without editing this cell.
# `or` also covers variables that are set but empty.
cm_bucket = os.getenv('CM_BUCKET') or 'cloudius-aurelius-config'
cm_key = os.getenv('CM_KEY') or 'common_methods/csv_methods.py'

cm = import_module_from_s3(bucket=cm_bucket,
                           key=cm_key,
                           module_name='csv_methods')


Module csv_methods imported.


## Load Data

In [19]:
# Location of the CSV object to load
aws_bucket = "py-secnews-landing"
aws_key = "news/1/ArchiveNews_2024-06-14:13:23_split.csv"

# Fetch the CSV content through the shared csv_methods helper
csv_content = cm.download_csv_content_from_s3(
    bucket=aws_bucket,
    key=aws_key,
    encoding='utf-8',
    s3_client=s3,
)

In [None]:
# aggregate data (but ideally this is only done in the data lake)
# The snippet below is wrapped in a triple-quoted string, so none of it is
# executed; it is kept as a reference for merging the CSV files found under
# news/1/ ... news/10/ into a single DataFrame.
# NOTE(review): before re-enabling it, confirm two things -- list_objects_v2
# returns a limited number of keys per call (pagination may be needed), and
# pd.concat raises when the list of DataFrames is empty.
"""
import boto3
import pandas as pd
from io import BytesIO

# Initialize your S3 client
session = create_aws_session()
s3_client = session.client('s3')

# Define bucket name and base folder path
bucket_name = 'py-secnews-landing'
base_folder = 'news/'

def merge_csvs_in_folders(bucket, base_folder):
    merged_dataframes = []
    
    # List all folders in the base path
    for folder_num in range(1, 11):  # Folders 1 to 10
        folder_path = f"{base_folder}{folder_num}/"
        
        # List all CSV files in the current folder
        response = s3_client.list_objects_v2(Bucket=bucket, Prefix=folder_path)
        
        # Iterate through the CSV files in the folder
        for obj in response.get('Contents', []):
            if obj['Key'].endswith('.csv'):
                # Download and load the CSV file into a DataFrame
                csv_obj = s3_client.get_object(Bucket=bucket, Key=obj['Key'])
                csv_content = csv_obj['Body'].read()
                df = pd.read_csv(BytesIO(csv_content))
                
                # Append to the list of dataframes to merge
                merged_dataframes.append(df)

    # Concatenate all DataFrames into one
    combined_df = pd.concat(merged_dataframes, ignore_index=True)
    return combined_df

# Execute the function and merge the CSVs
merged_df = merge_csvs_in_folders(bucket_name, base_folder)

# Optional: Save to a CSV (local or upload to S3)
# merged_df.to_csv('merged_output.csv', index=False)
"""

In [23]:
# Turn the downloaded CSV content into a DataFrame; fields are comma-separated
meta_dict = {'delimiter': ','}
df = cm.csv_str_to_df(
    csv_content=csv_content,
    metadata_dict=meta_dict,
)

In [25]:
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,source_id,time_publish,author,title,news_summary,news_text,news_url,crawl_id
0,6,,"[""Jun"", ""The Hacker News""]",Learn to Secure Petabyte-Scale Data in a Webin...,,Data is growing faster than ever. Remember whe...,https://thehackernews.com/2024/06/learn-to-sec...,1
1,6,,"[""Jun"", ""The Hacker News""]",Why Regulated Industries are Turning to Milita...,,As cyber threats loom large and data breaches ...,https://thehackernews.com/2024/06/why-regulate...,1
2,6,,"[""Jun""]",ZKTeco Biometric System Found Vulnerable to 24...,,An analysis of a hybrid biometric access syste...,https://thehackernews.com/2024/06/zkteco-biome...,1
3,6,,"[""Jun""]",North Korean Hackers Target Brazilian Fintech ...,,Threat actors linked to North Korea have accou...,https://thehackernews.com/2024/06/north-korean...,1
4,6,,"[""Jun""]",Microsoft Delays AI-Powered Recall Feature for...,,Microsoft on Thursday revealed that it's delay...,https://thehackernews.com/2024/06/microsoft-de...,1


In [39]:
# store input data: the text of the row labelled 3 is the article to summarize
article = df.loc[3, 'news_text']

## Apply Model

In [27]:
# initialize model
def model_init(model_name="t5-small"):
    """
    Loads a pretrained sequence-to-sequence model
    model_name (str): model name or path passed to AutoModelForSeq2SeqLM.from_pretrained

    Returns the loaded model.
    """
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)


def tokenizer_init(model_name="t5-small"):
    """
    Loads the tokenizer that belongs to a pretrained model
    model_name (str): model name or path passed to AutoTokenizer.from_pretrained

    Returns the loaded tokenizer.
    """
    return AutoTokenizer.from_pretrained(model_name)


# Keep both objects as notebook-level names: later cells pass `model` to
# apply_model, and the summarization step also needs the matching `tokenizer`.
# Previously the return value was discarded and the tokenizer never left
# model_init, so neither name existed after a fresh kernel start.
model = model_init("t5-small")
tokenizer = tokenizer_init("t5-small")

In [36]:
def apply_model(input_text, model, tokenizer=None, max_input_length=512, max_summary_length=150):
    """
    Creates a summary of a given input text with a given model
    input_text (str): input text to apply the model on
    model (transformers.models): the model initialized with model_init
    tokenizer (optional): tokenizer matching the model. When omitted, a
        notebook-level `tokenizer` is used if one exists; otherwise the
        tokenizer is loaded from the checkpoint recorded in the model config
        (this happens on every such call, so pass a tokenizer for repeated use)
    max_input_length (int): inputs are truncated to this many tokens
    max_summary_length (int): max_length passed to model.generate

    Returns the decoded summary (str).
    """
    if tokenizer is None:
        # Backward compatible with callers that rely on a global `tokenizer`
        tokenizer = globals().get("tokenizer")
    if tokenizer is None:
        # No tokenizer anywhere: load the one belonging to the model's checkpoint
        tokenizer = AutoTokenizer.from_pretrained(model.config.name_or_path)

    # The "summarize: " prefix selects the summarization task
    input_ids = tokenizer.encode(
        "summarize: " + input_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    summary_ids = model.generate(input_ids, max_length=max_summary_length)

    # generate() returns one sequence per input; only one input was encoded
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [40]:
# Summarize the selected article with the notebook-level `model`
summary = apply_model(input_text=article,model=model)

In [41]:
# Show the source article followed by the generated summary
print("Original Text:", "---------------", article, sep="\n")
print("\n\nGenerated Summary:", "---------------", summary, sep="\n")

Original Text:
---------------
Threat actors linked to North Korea have accounted for one-third of all the phishing activity targeting Brazil since 2020, as the country's emergence as an influential power has drawn the attention of cyber espionage groups.

"North Korean government-backed actors have targeted the Brazilian government and Brazil's aerospace, technology, and financial services sectors," Google's Mandiant and Threat Analysis Group (TAG) divisions said in a joint report published this week.

"Similar to their targeting interests in other regions, cryptocurrency and financial technology firms have been a particular focus, and at least three North Korean groups have targeted Brazilian cryptocurrency and fintech companies."

Prominent among those groups is a threat actor tracked as UNC4899 (aka Jade Sleet, PUKCHONG, and TraderTraitor), which has targeted cryptocurrency professionals with a malware-laced trojanized Python app.

The attack chains involve reaching out to potentia

## Evaluate