In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta

In [None]:
def get_guardian_news(api_key, start_date, page_size=50, max_articles=5000):
    """Fetch Guardian articles walking backwards in time and save them to Excel.

    Each request asks the Guardian content search endpoint for the newest
    ``page_size`` articles published up to ``to_date``; the next request moves
    ``to_date`` to one day before the oldest article just received. Collection
    stops when ``max_articles`` have been gathered, when a request returns no
    results or fewer than ``page_size`` results, or when the API answers with a
    non-200 status.

    Args:
        api_key: Guardian API key sent as the ``api-key`` query parameter.
        start_date: ``datetime`` used as the first ``to-date`` bound.
        page_size: Number of results requested per call.
        max_articles: Upper bound on the number of articles kept.

    Returns:
        None. When at least one article was collected, the rows (headline,
        content, category, URL, Published Date) are sorted newest first and
        written to ``uncategorized_<today>.xlsx`` in the working directory.
    """
    url = 'https://content.guardianapis.com/search'
    all_articles = []
    to_date = start_date

    while len(all_articles) < max_articles:
        params = {
            'order-by': 'newest',
            'page-size': page_size,
            'api-key': api_key,
            'to-date': to_date.strftime('%Y-%m-%d'),
            'show-fields': 'headline,bodyText'
        }

        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            try:
                print(response.json())
            except ValueError:
                # Error bodies are not guaranteed to be JSON; show the raw text.
                print(response.text)
            break

        data = response.json()
        articles = data['response']['results']
        if not articles:
            print(f"No more articles found up to date {to_date.strftime('%Y-%m-%d')}")
            break
        all_articles.extend(articles)
        if len(all_articles) >= max_articles:
            break
        if len(articles) < page_size:
            print(f"Less articles ({len(articles)}) returned than page size ({page_size}) up to date {to_date.strftime('%Y-%m-%d')}")
            break
        # NOTE(review): only the date part of to_date is sent and the bound then
        # jumps back a full day from the oldest article, so this looks like it
        # collects one page per step rather than every article -- confirm that
        # this sampling is intended.
        oldest_article_date = datetime.strptime(articles[-1]['webPublicationDate'], '%Y-%m-%dT%H:%M:%SZ')
        to_date = oldest_article_date - timedelta(days=1)

    rows = []
    for article in all_articles[:max_articles]:
        # 'fields' can be missing from a result; fall back to empty text
        # instead of raising KeyError and losing the whole download.
        fields = article.get('fields') or {}
        rows.append({
            'headline': fields.get('headline', ''),
            'content': fields.get('bodyText', ''),
            'category': article['sectionName'],
            'URL': article['webUrl'],
            'Published Date': article['webPublicationDate']
        })

    if rows:
        df = pd.DataFrame(rows)
        # Excel cannot store timezone-aware datetimes, so drop the tz info.
        df['Published Date'] = pd.to_datetime(df['Published Date']).dt.tz_localize(None)
        df = df.sort_values(by='Published Date', ascending=False).reset_index(drop=True)
        today = datetime.today().strftime('%Y-%m-%d')
        filename = f"uncategorized_{today}.xlsx"
        df.to_excel(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No articles found for the given query parameters.")

if __name__ == '__main__':
    import os

    # Credentials must not be stored in the notebook: read the Guardian API key
    # from the environment. Any key that was previously committed in this cell
    # should be treated as exposed and rotated.
    api_key = os.environ.get('GUARDIAN_API_KEY')
    if not api_key:
        raise RuntimeError("Set the GUARDIAN_API_KEY environment variable before running this cell.")
    start_date = datetime(2024, 5, 31, 20, 0, 21)
    get_guardian_news(api_key, start_date)


Data saved to uncategorized_2024-06-02.xlsx


In [None]:
# Headlines table: load it and show the first five rows as a sanity check.
news_df = pd.read_csv('latestnews.csv')
print(news_df.head(5))

# Company reference table: same load-then-preview step.
company_df = pd.read_csv('companies.csv')
print(company_df.head(5))


                                            headline  \
0  Ticketek customer details exposed in cyber sec...   
1  Biden says Trump’s claim of rigged trial is ‘d...   
2  Scientists develop cheap and quick spit test f...   
3  The best theatre to stream this month: David S...   
4                           Weekend crossword No 699   

                                             content    category  \
0  Ticketek has been hit by a “cyber incident” wi...  Technology   
1  It was a day of ongoing reactions to yesterday...     US news   
2  Scientists have developed a spit test that cou...     Society   
3  David Suchet: Poirot and More To mark his 75th...       Stage   
4                                                NaN  Crosswords   

                                                 URL       Published Date  
0  https://www.theguardian.com/technology/article...  2024-05-31 23:50:56  
1  https://www.theguardian.com/us-news/live/2024/...  2024-05-31 23:05:15  
2  https://www.theguardian

In [None]:
# Keep the first occurrence of every headline, then discard rows without one.
news_df = (
    news_df
    .drop_duplicates(subset='headline')
    .dropna(subset=['headline'])
)

# One row per company name; a company needs both a description and an industry.
company_df = (
    company_df
    .drop_duplicates(subset='company_name')
    .dropna(subset=['description', 'industry'])
)


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the tokenizer, stop-word list and lemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared helpers consumed by preprocess_text below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a piece of text for downstream model input.

    The text is lower-cased and tokenized; purely alphabetic tokens that are
    not in ``stop_words`` are lemmatized and joined back with single spaces.

    Args:
        text: The string to clean. Non-string values (for example ``None`` or
            the NaN float pandas produces for an empty CSV cell) are treated
            as empty text.

    Returns:
        The cleaned, space-separated string; ``''`` for non-string input.
    """
    # pandas hands NaN (a float) to .apply for missing cells; calling .lower()
    # on it would raise AttributeError.
    if not isinstance(text, str):
        return ''
    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# Reload the headlines so this cell is self-contained, re-applying the
# duplicate / missing-headline clean-up: a plain re-read would silently bring
# back the rows an earlier cleaning step removed.
news_df = (
    pd.read_csv('latestnews.csv')
    .drop_duplicates(subset='headline')
    .dropna(subset=['headline'])
    .reset_index(drop=True)
)
news_df['cleaned_headline'] = news_df['headline'].apply(preprocess_text)

# Star-rating sentiment model; its labels are '1 star' ... '5 stars'.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

def calculate_risk_percentage(headline, clf=None):
    """Return the probability mass on negative ratings as a percentage.

    The classifier is asked for the scores of all labels, and the scores of
    the '1 star' and '2 stars' labels are summed. Requesting only the default
    top label would yield 0 whenever the most likely rating is 3 stars or
    more, regardless of how much probability the negative labels hold.

    Args:
        headline: Text to score.
        clf: Optional callable with the text-classification pipeline
            interface; defaults to the module-level ``classifier``.

    Returns:
        A float between 0 and 100.
    """
    scorer = classifier if clf is None else clf
    results = scorer(headline, top_k=None)
    # Depending on the pipeline version the per-label scores for a single
    # string come back flat or wrapped in one extra list.
    if results and isinstance(results[0], list):
        results = results[0]
    risk_prob = sum(
        result['score']
        for result in results
        if result['label'] in ('1 star', '2 stars')
    )
    return risk_prob * 100

# Score every cleaned headline with the sentiment-based risk estimate.
news_df['risk_percentage'] = news_df['cleaned_headline'].map(calculate_risk_percentage)

# Persist the scores so later cells can reload them without re-running the model.
risk_export = news_df[['headline', 'cleaned_headline', 'risk_percentage']]
risk_export.to_csv('risk_percentages.csv', index=False)

print(news_df[['cleaned_headline', 'risk_percentage']].head(5))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                    cleaned_headline  risk_percentage
0  ticketek customer detail exposed cyber securit...        53.074753
1  biden say trump claim rigged trial dangerous r...        84.503281
2  scientist develop cheap quick spit test prosta...        33.844233
3  best theatre stream month david suchet poirot ...         0.000000
4                                  weekend crossword         0.000000


In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity


# Company table with a cleaned copy of each description.
company_df = pd.read_csv('companies.csv')
company_df['cleaned_description'] = company_df['description'].map(preprocess_text)

# Scored headlines written by the risk step; the cleaned text is rebuilt here.
news_df = pd.read_csv('risk_percentages.csv')
news_df['cleaned_headline'] = news_df['headline'].map(preprocess_text)

# Plain BERT encoder used to embed headlines and descriptions.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def get_embeddings(text_list, batch_size=64):
    """Embed texts as the mean of their token vectors from ``model``.

    Texts are encoded in batches of ``batch_size`` so a large corpus is not
    pushed through the model in a single forward pass. Padding positions are
    excluded from the mean via the attention mask, so a text gets the same
    embedding whether it is encoded alone or padded inside a batch.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A ``torch.Tensor`` with one row per input text; zero rows when
        ``text_list`` is empty.
    """
    texts = list(text_list)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Zero out padded positions and divide by the number of real tokens.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Embed every cleaned headline once and keep the result as a NumPy array.
headline_texts = news_df['cleaned_headline'].tolist()
news_embeddings = get_embeddings(headline_texts).numpy()

def find_relevant_news(company_description, news_embeddings, news_df, top_k=10):
    """Rank news rows by cosine similarity to a company description.

    Args:
        company_description: Text describing the company; it is embedded with
            ``get_embeddings``.
        news_embeddings: Array with one embedding row per row of ``news_df``.
        news_df: DataFrame holding at least ``headline`` and
            ``risk_percentage`` columns. It is not modified.
        top_k: Number of most similar rows to return.

    Returns:
        A DataFrame with the ``headline``, ``similarity_score`` and
        ``risk_percentage`` columns of the ``top_k`` most similar rows.
    """
    company_embedding = get_embeddings([company_description]).numpy()
    similarity_scores = cosine_similarity(company_embedding, news_embeddings)[0]
    # Attach the scores to a copy so the caller's frame does not silently gain
    # (or have overwritten) a 'similarity_score' column on every call.
    scored_news = news_df.assign(similarity_score=similarity_scores)
    top_news = scored_news.nlargest(top_k, 'similarity_score')
    return top_news[['headline', 'similarity_score', 'risk_percentage']]

def get_top_10_news_for_company(company_name, company_description, company_industry):
    """Return the news rows most similar to a company's description.

    The description is cleaned with ``preprocess_text`` and matched against
    the module-level ``news_embeddings`` / ``news_df`` through
    ``find_relevant_news`` with its default ``top_k``. ``company_name`` and
    ``company_industry`` are accepted but not used.
    """
    return find_relevant_news(
        preprocess_text(company_description),
        news_embeddings,
        news_df,
    )

# Example query: look up the headlines closest to Apple's business description.
company_name = "Apple "
company_industry = "Consumer electronics Software services"
company_description = "Designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services"

top_10_news = get_top_10_news_for_company(
    company_name=company_name,
    company_description=company_description,
    company_industry=company_industry,
)
print(top_10_news)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               headline  similarity_score  \
3478  ‘You start to wonder, is it really a choice?’ ...          0.738611   
2889  Wearable tech: how the human body can help pow...          0.670442   
1796  Ohio company to sell a ‘flamethrower-wielding ...          0.662431   
3103  ‘It’s a worldly thing’: the ancient, multi-str...          0.659155   
4384  Companies portray menopause as ‘medical proble...          0.652491   
2422  The Australian company behind Splendour has a ...          0.651523   
4071  ‘I hate my tummy being constricted’: stylish w...          0.644243   
1502  ‘Unethical’ junk food packaging manipulates ch...          0.642691   
3890  Middle East crisis: Gaza aid ship from Cyprus ...          0.642387   

      risk_percentage  
3478         0.000000  
2889         0.000000  
1796        30.186903  
3103         0.000000  
4384        42.919657  
2422         0.000000  
2873        65.674144  
4071        54.052627  
1502        9

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Company table with a cleaned copy of each description.
company_df = pd.read_csv('companies.csv')
company_df['cleaned_description'] = company_df['description'].map(preprocess_text)

# Scored headlines from the risk step; the cleaned text is rebuilt here.
news_df = pd.read_csv('risk_percentages.csv')
news_df['cleaned_headline'] = news_df['headline'].map(preprocess_text)

# Swap the encoder for RoBERTa; the names below replace the earlier BERT objects.
model = RobertaModel.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def get_embeddings(text_list, batch_size=64):
    """Embed texts as the mean of their token vectors from ``model``.

    Texts are encoded in batches of ``batch_size`` so a large corpus is not
    pushed through the model in a single forward pass. Padding positions are
    excluded from the mean via the attention mask, so a text gets the same
    embedding whether it is encoded alone or padded inside a batch.

    Args:
        text_list: Sequence of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A ``torch.Tensor`` with one row per input text; zero rows when
        ``text_list`` is empty.
    """
    texts = list(text_list)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Zero out padded positions and divide by the number of real tokens.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Re-embed every cleaned headline with the current encoder, as a NumPy array.
headline_texts = news_df['cleaned_headline'].tolist()
news_embeddings = get_embeddings(headline_texts).numpy()

def find_relevant_news(company_description, news_embeddings, news_df, top_k=10):
    """Rank news rows by similarity to a company and explain each match.

    Args:
        company_description: Text describing the company; it is embedded with
            ``get_embeddings``.
        news_embeddings: Array with one embedding row per row of ``news_df``.
        news_df: DataFrame holding at least ``headline`` and
            ``risk_percentage`` columns. It is not modified.
        top_k: Number of most similar rows to return.

    Returns:
        A DataFrame with the ``headline``, ``similarity_score``,
        ``risk_percentage`` and ``explanation`` columns of the ``top_k`` most
        similar rows.
    """
    company_embedding = get_embeddings([company_description]).numpy()
    similarity_scores = cosine_similarity(company_embedding, news_embeddings)[0]
    # Score a copy so the caller's frame does not silently gain (or have
    # overwritten) a 'similarity_score' column on every call.
    top_news = (
        news_df
        .assign(similarity_score=similarity_scores)
        .nlargest(top_k, 'similarity_score')
    )

    def _explain(similarity, risk):
        # Build the sentence describing one matched article.
        explanation = f"This news article is related to the company due to its high similarity score of {similarity:.2f}. "
        if risk > 50:
            explanation += f"It has a high risk percentage of {risk:.2f}%, indicating potential negative impact."
        else:
            explanation += f"It has a lower risk percentage of {risk:.2f}%, indicating a potentially neutral or positive impact."
        return explanation

    explanations = [
        _explain(similarity, risk)
        for similarity, risk in zip(top_news['similarity_score'], top_news['risk_percentage'])
    ]
    top_news = top_news.assign(explanation=explanations)

    return top_news[['headline', 'similarity_score', 'risk_percentage', 'explanation']]

def get_top_10_news_for_company(company_name, company_description, company_industry):
    """Return the news rows most similar to a company's description.

    The description is cleaned with ``preprocess_text`` and matched against
    the module-level ``news_embeddings`` / ``news_df`` through
    ``find_relevant_news`` with its default ``top_k``. ``company_name`` and
    ``company_industry`` are accepted but not used.
    """
    return find_relevant_news(
        preprocess_text(company_description),
        news_embeddings,
        news_df,
    )

# Example query: same Apple look-up, now against the RoBERTa embeddings.
company_name = "Apple"
company_industry = "Consumer electronics, Software services"
company_description = "Designs, manufactures and markets smartphones, personal computers, tablets, wearables and accessories, and sells a variety of related services."

top_10_news = get_top_10_news_for_company(
    company_name=company_name,
    company_description=company_description,
    company_industry=company_industry,
)
print(top_10_news)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                               headline  similarity_score  \
4045  Labor rights? Whatever. TikTok frenzy over Tra...          0.948018   
379   ‘A deranged fringe movement’: what is Maga com...          0.947740   
4268  Affordable apples and brilliant berries: Austr...          0.946118   
1208  Australian supermarket chocolate hazelnut spre...          0.945865   
4646  Food trade bodies consider legal action over p...          0.945653   
715   Supplies arrive in Gaza via new pier but land ...          0.945377   
143   Labour has ‘no plans’ to allow health worker v...          0.945322   
4791  $1bn donation means students at New York medic...          0.945057   
2921  ‘The best time of the year for produce’: pears...          0.944785   
1787   Teaching assistants routinely cover lessons i...          0.944505   

      risk_percentage                                        explanation  
4045         0.000000  This news article is related to the company du...  
37