In [None]:
import pandas as pd

news = pd.read_csv('news_combined.csv')

# Цитаты

In [None]:
import requests as r
from tqdm import tqdm

api_url = "<censored>"
oauth_token = "<censored>"

In [60]:
def split_text(text, max_length=10000):
    if len(text) <= max_length:
        return [text]
    
    segments = []
    start = 0
    
    while start < len(text):
        end = start + max_length
        if end >= len(str(text)):
            segments.append(text[start:])
            break
        
        split_point = text.rfind('.', start, end)
        split_point = max(split_point, text.rfind('!', start, end))
        split_point = max(split_point, text.rfind('?', start, end))
        split_point = max(split_point, text.rfind('"', start, end))
        
        if split_point == -1:
            split_point = end
        else:
            split_point += 1  
        segments.append(text[start:split_point].strip())
        start = split_point
    
    return segments

In [None]:
def build_request(comm: str) -> dict:
    return {
        "model": "deepseek-ai/deepseek-v3",
        "temperature" : 0.1,
        "max_tokens": 5096,
        "messages": [
            {
                "role": "user",
                "content": f"""Extract Relevant Israel-Palestine Conflict Quotes

                        1. They must contain sufficient context (minimum 3 words) 
                        2. Don't include quotes which contain vague statements and don't relate to conflict without article context. For instance, "it was false"

                        3. Extract quotes that are made by political figures, representatives, official or organization. Don't include qoutes made by reporters, journalists or spectators.

                        4. Don't include quotes made by Israel or Palestine / Hamas fiqures. 
                        
                        You may add minimal clarification in [brackets] within quotes to identify ambiguous references (e.g., 'They [Hamas] must stop the attacks')

                        Extract only quote, without prefix like "someone said that..."
                        
                        Return ONLY in this format:
                        'quote' - speaker; 'quote' - speaker
                        
                        If none found, respond only with: not found
                                
                The text: {comm}"""
            }
        ]
    }


def send_requests(df: pd.DataFrame) -> list: 
    answers = []
    
    for _, row in tqdm(df.iterrows()):
        text = row['text']
        try:
            segments = split_text(text) 
        except: 
            continue

        curr_ans = ''
        for segm in segments:
            res = r.post(api_url, json=build_request(segm), headers={'Authorization': f'OAuth {oauth_token}'})
            answer = res.json()['response']['choices'][0]['message']['content']
            curr_ans += answer
        answers.append((row['article_url'], row['date'], curr_ans, text))

    return answers

In [None]:
df_sample = news
quotes = send_requests(df_sample) 

5118it [2:59:30,  2.10s/it]


In [None]:
quotes.to_csv('news_with_quotes.csv')

In [90]:
def split_quotes(row):
    
    quotes = row['quotes'].split(';')
    
    results = []
    for quote in quotes:
        if quote.strip():  
            try:
                quote_text, author = quote.rsplit('-', 1)
                quote_text = quote_text.strip().strip('"')
                quote_text = quote_text.replace('"', '')
                author = author.strip()
                
                results.append({
                    'date': row['date'],
                    'url': row['article_url'],
                    'quote': quote_text,
                    'author': author
                })
            except ValueError:
                continue  
    
    return results

new_rows = []
for _, row in quotes.iterrows():
    new_rows.extend(split_quotes(row))

quotes = pd.DataFrame(new_rows)

In [91]:
quotes

Unnamed: 0,date,url,quote,author
0,2025-04-09,https://www.bbc.com/news/articles/cy5rrnzw0kwo,'Gaza is a killing field and civilians are in ...,António Guterres
1,2025-04-09,https://www.bbc.com/news/articles/cy5rrnzw0kwo,'There is no shortage of humanitarian aid in t...,Oren Marmonstein
2,2025-04-07,https://www.bbc.com/news/articles/c0jzgp947qdo,'We condemn the resumption of Israeli strikes ...,Emmanuel Macron
3,2025-04-07,https://www.bbc.com/news/articles/c0jzgp947qdo,'Hamas must not be part of any future governme...,Emmanuel Macron
4,2025-04-02,https://www.bbc.com/news/articles/cy8qnxwv08do,'Initial reports indicate the facility was she...,Philippe Lazzarini
...,...,...,...,...
4799,2024-04-14,https://www.aljazeera.com/news/2024/4/14/hamas...,'Egypt and Saudi Arabia called for restraint',Egypt and Saudi Arabia representatives
4800,2024-04-14,https://www.aljazeera.com/news/2024/4/14/hamas...,'The UN Secretary-General Antonio Guterres sai...,Antonio Guterres
4801,2023-10-20,https://www.aljazeera.com/news/2023/10/20/hama...,'Qatar hopes dialogue will lead to the “releas...,spokesperson for Qatar’s foreign ministry
4802,2023-10-20,https://www.aljazeera.com/news/2023/10/20/hama...,'The International Committee of the Red Cross ...,International Committee of the Red Cross
