In [1]:
import sqlite3
import pickle
from datetime import datetime
import numpy as np

In [2]:
import sys
# Extend the module search path so that get_chatgpt_criteria can be imported
# from the sibling flask_app directory.
sys.path.append('../flask_app/')
import get_chatgpt_criteria
import importlib
# Re-execute the module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(get_chatgpt_criteria)
# NOTE(review): the star import hides which names used below come from
# get_chatgpt_criteria; consider importing the needed names explicitly.
from get_chatgpt_criteria import *

In [3]:
import re

def parse_chatgpt_field(process_result, field):
    """Parse a numbered-list ChatGPT answer into ``(name, explanation)`` pairs.

    Every line of ``process_result[field]`` that contains a header of the form
    ``<number>.<name>:`` produces one pair. A parenthesised fragment inside the
    name is removed from the name and prepended to the explanation. Lines
    without such a header are ignored.

    Args:
        process_result: mapping from field name to the ChatGPT answer text.
        field: key of the answer to parse.

    Returns:
        A list of ``(method_name, method_explanation)`` tuples in line order.
        The list is empty when the field is missing or its value is not a
        string.
    """
    if field not in process_result:
        return []
    raw_value = process_result[field]
    if not isinstance(raw_value, str):
        # A missing/None answer must not abort parsing of the whole record.
        return []

    parsed_items = []
    for line in raw_value.split('\n'):
        header = re.findall(r'(\d+\.(.*?)[:])', line)
        if not header:
            continue
        header_text, raw_name = header[0]
        method_name = raw_name.strip()

        balanced = re.search(r'\(.*?\)', method_name)
        if balanced:
            method_example = balanced.group(0)
            method_name = re.sub(r'\s+', ' ', method_name.replace(method_example, '')).strip()
        elif '(' in method_name:
            # The header regex stops at the first colon, so a colon inside the
            # parentheses leaves "(" without its ")". Cut the name at "(" and
            # hand the unfinished fragment (with the colon the header match
            # swallowed) over to the explanation instead of raising IndexError.
            method_name, _, unfinished = method_name.partition('(')
            method_name = re.sub(r'\s+', ' ', method_name).strip()
            method_example = '(' + unfinished + ':'
        else:
            method_example = ''

        remainder = line.replace(header_text, '').strip()
        method_explanation = (method_example.strip() + " " + remainder).strip()
        parsed_items.append((method_name, method_explanation))
    return parsed_items

In [4]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open('data/db_texts.p', 'rb') as sample_file:
    sample = pickle.load(sample_file)
with open('text2chatgpt_output_sample_011023_temp0.p', 'rb') as chatgpt_output_file:
    text2chatgpt_output = pickle.load(chatgpt_output_file)

In [5]:
# Keep the first record of the sample at hand for inspection. The processing
# loop further down rebinds this same name on every iteration.
telegram_record = sample[0]

In [6]:
# Number of entries in text2chatgpt_output (it is looked up by text later on).
len(text2chatgpt_output)

930

In [7]:
# Longest 'text' value, counting only records whose 'text' is a string.
max(len(record['text']) for record in sample if isinstance(record['text'], str))

8923

In [8]:
# Median length of 'text', counting only records whose 'text' is a string.
np.median([len(record['text']) for record in sample if isinstance(record['text'], str)])

100.0

In [9]:
# Median length of 'tg_preview_text'.
# NOTE(review): the filter tests x['text'] while the measured field is
# x['tg_preview_text']; this looks like a copy-paste from the cell above.
# If the intent is "all string previews", filter on 'tg_preview_text' instead
# (a non-string preview on a record with a string 'text' would raise here).
np.median([len(x['tg_preview_text']) for x in sample if isinstance(x['text'], str)])

400.0

In [10]:
# Maximum length of 'tg_preview_text'.
# NOTE(review): the filter tests x['text'] while the measured field is
# x['tg_preview_text']; confirm whether the filter should be on
# 'tg_preview_text' instead.
np.max([len(x['tg_preview_text']) for x in sample if isinstance(x['text'], str)])

2006

In [11]:
import time
from tqdm import tqdm

# Parallel lists: position k in each of the all_parsed_* lists belongs to
# db_records[k]. Later cells rely on this alignment, so the three lists are
# only ever appended to together.
all_parsed_manipulation_methods = []
all_parsed_logical_fallacies = []
errors = []
db_records = []
# LENGTH_THRESHOLD = {'MIN': 30, 'MAX': 2048}

# Phrases in the ChatGPT 'hatespeech' answer that are read as "nothing found".
# NOTE(review): answers that negate with other wording (for example ones that
# start with "нет") contain none of these markers and are therefore stored
# with hatespeech_present = 1; consider extending the list.
NO_HATESPEECH_MARKERS = ['не найден', 'не обнаружен', 'отсутствует']

# Values stored when the ChatGPT output of a record cannot be interpreted.
UNPARSED_ANALYSIS = {
    'text_type': 'undefined',
    'source_references': 'uncalculated',
    'source_references_present': 0,
    'logical_fallacies_present': 0,
    'hatespeech_present': 0,
    'hatespeech_indicators': 'uncalculated',
}

for i, telegram_record in enumerate(tqdm(sample)):
    # Only the preview text is used as the lookup key into the ChatGPT output.
    preview_text = telegram_record['tg_preview_text']
    text_to_check = preview_text.strip() if isinstance(preview_text, str) else ''
    if text_to_check and text_to_check in text2chatgpt_output:
        process_result = text2chatgpt_output[text_to_check]
    else:
        process_result = {}
    if not process_result:
        # Records without ChatGPT output are not stored.
        continue

    db_record = {}
    try:
        db_record['text'] = telegram_record['tg_preview_text']
        db_record['source'] = telegram_record['channel']
        db_record['date'] = telegram_record['created_at'].strftime('%Y-%m-%d %H:%M:%S')
        db_record['url'] = telegram_record['message_url']
        db_record['platform'] = 'telegram'
        db_record['full_text'] = telegram_record['text'] if isinstance(telegram_record['text'], str) else ''
        db_record['title'] = telegram_record.get('title', '')
    except Exception as e:
        # Best effort: keep the record, but mark its metadata as unusable.
        errors.append((i, e, 'failed to process telegram record'))
        print((i, e, 'failed to process telegram record'))
        db_record['text'] = 'unprocessed'
        db_record['source'] = 'unprocessed'
        db_record['date'] = 'unprocessed'
        db_record['url'] = 'unprocessed'
        db_record['platform'] = 'telegram'
        db_record['full_text'] = 'unprocessed'
        db_record['title'] = 'unprocessed'

    try:
        # Build everything in locals first so that a failure half-way through
        # cannot leave a partially filled record or misaligned lists behind.
        analysis = {
            'text_type': process_result['text_type'].lower(),
            'source_references': process_result.get('references', 'uncalculated'),
            'source_references_present': process_result.get('references_present', 0),
            'logical_fallacies_present': process_result.get('logical_fallacies_present', 0),
            'hatespeech_present': 0 if any(
                marker in process_result.get('hatespeech', 'не найден')
                for marker in NO_HATESPEECH_MARKERS
            ) else 1,
            'hatespeech_indicators': process_result.get('hatespeech', 'uncalculated'),
        }
        parsed_manipulation_methods = parse_chatgpt_field(process_result, 'manipulation_methods')
        parsed_logical_fallacies = parse_chatgpt_field(process_result, 'logical_fallacies')
    except Exception as e:
        errors.append((i, e, 'failed to parse chatgpt output'))
        print((i, e, 'failed to parse chatgpt output'))
        # Fall back to neutral values so the record can still be inserted and
        # every stored record keeps an entry in both parsed lists.
        analysis = dict(UNPARSED_ANALYSIS)
        parsed_manipulation_methods = []
        parsed_logical_fallacies = []

    db_record.update(analysis)
    db_records.append(db_record)
    all_parsed_manipulation_methods.append(parsed_manipulation_methods)
    all_parsed_logical_fallacies.append(parsed_logical_fallacies)

100%|████████████████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:00<00:00, 23420.00it/s]


In [12]:
def simplify_text_type(db_record_text_type, known_types=None):
    """Map a free-form text type description onto one canonical label.

    The first entry of ``known_types`` that occurs as a substring of
    ``db_record_text_type`` wins. If none occurs, a few word stems are tried,
    and ``'другое'`` is returned as the last resort.

    Args:
        db_record_text_type: free-form (lower-cased) text type description.
        known_types: iterable of canonical labels checked in order. Defaults
            to the notebook-level ``text_types``, which must be defined before
            the function is called without this argument.

    Returns:
        The canonical label as a string.
    """
    if known_types is None:
        # Resolved at call time: text_types is defined in the database cell.
        known_types = text_types
    for text_type in known_types:
        if text_type in db_record_text_type:
            return text_type
    # Stem-based fallbacks for inflected or abbreviated descriptions.
    if 'коммент' in db_record_text_type:
        return 'публичный комментарий'
    if 'стать' in db_record_text_type:
        return 'статья'
    if 'новост' in db_record_text_type:
        return 'новость'
    return 'другое'

In [13]:
import sqlite3
from datetime import datetime

text_types = ('личное сообщение', 'статья', 'новость', 'запись в личном блоге', 'публичный комментарий', 'другое', 'undefined')
# Render the allowed values as SQL string literals explicitly instead of
# relying on the repr of a Python tuple (which switches quote style as soon as
# a value contains an apostrophe).
text_types_sql = ', '.join("'" + text_type.replace("'", "''") + "'" for text_type in text_types)

# Connect to the SQLite database (or create a new one if it doesn't exist)
conn = sqlite3.connect('sample_database.db')
try:
    cursor = conn.cursor()

    # Create the 'texts' table
    cursor.execute(f'''
        CREATE TABLE IF NOT EXISTS texts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            text TEXT,
            source TEXT,
            text_type TEXT CHECK(text_type IN ({text_types_sql})),
            date TIMESTAMP,
            url TEXT,
            platform TEXT,
            full_text TEXT,
            title TEXT,
            source_references TEXT,
            source_references_present BOOLEAN,
            logical_fallacies_present BOOLEAN,
            hatespeech_present BOOLEAN,
            hatespeech_indicators TEXT)
    ''')

    # NOTE(review): running this cell again inserts every record a second
    # time; later cells look texts up by their content, so duplicates make
    # that lookup ambiguous. Start from a fresh database file when re-running.
    insert_rows = (
        (
            db_record['text'], db_record['source'], simplify_text_type(db_record['text_type']),
            db_record['date'], db_record['url'], db_record['platform'],
            db_record['full_text'], db_record['title'], db_record['source_references'],
            db_record['source_references_present'], db_record['logical_fallacies_present'],
            db_record['hatespeech_present'], db_record['hatespeech_indicators']
        )
        for db_record in db_records
    )
    cursor.executemany('''
        INSERT INTO texts (
            text, source, text_type, date, url, platform, full_text, title, source_references,
            source_references_present, logical_fallacies_present,
            hatespeech_present, hatespeech_indicators
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', insert_rows)

    # One commit for the whole batch: either every record is stored or, if an
    # insert fails, none of them is.
    conn.commit()
finally:
    # Close the connection even when table creation or an insert fails.
    conn.close()

In [14]:
conn = sqlite3.connect('sample_database.db')
try:
    cursor = conn.cursor()
    # Create the 'manipulation_methods' table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS manipulation_methods (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            manipulation_method_name TEXT)
    ''')

    # Create the 'logical_fallacies' table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS logical_fallacies (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            logical_fallacy_name TEXT)
    ''')

    # Create the 'text_2_manipulation_methods' junction table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS text_2_manipulation_methods (
            text_id INTEGER,
            manipulation_method_id INTEGER,
            manipulation_method_explanation TEXT,
            FOREIGN KEY (text_id) REFERENCES texts (id),
            FOREIGN KEY (manipulation_method_id) REFERENCES manipulation_methods (id),
            PRIMARY KEY (text_id, manipulation_method_id)
        )
    ''')

    # Create the 'text_2_logical_fallacies' junction table.
    # logical_fallacy_id points at logical_fallacies (it used to reference
    # manipulation_methods by mistake). Because of IF NOT EXISTS the corrected
    # definition only takes effect when the table is created anew.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS text_2_logical_fallacies (
            text_id INTEGER,
            logical_fallacy_id INTEGER,
            logical_fallacy_explanation TEXT,
            FOREIGN KEY (text_id) REFERENCES texts (id),
            FOREIGN KEY (logical_fallacy_id) REFERENCES logical_fallacies (id),
            PRIMARY KEY (text_id, logical_fallacy_id)
        )
    ''')

    # Insert manipulation methods into the 'manipulation_methods' table,
    # skipping names that are already stored so a re-run adds no duplicates.
    manipulation_methods = sorted(set([y[0] for x in all_parsed_manipulation_methods for y in x]))
    stored_methods = {
        row[0] for row in cursor.execute('SELECT manipulation_method_name FROM manipulation_methods')
    }
    cursor.executemany(
        'INSERT INTO manipulation_methods (manipulation_method_name) VALUES (?)',
        [(method,) for method in manipulation_methods if method not in stored_methods],
    )

    # Insert logical fallacies into the 'logical_fallacies' table, again
    # skipping names that are already stored.
    logical_fallacies = sorted(set([y[0] for x in all_parsed_logical_fallacies for y in x]))
    stored_fallacies = {
        row[0] for row in cursor.execute('SELECT logical_fallacy_name FROM logical_fallacies')
    }
    cursor.executemany(
        'INSERT INTO logical_fallacies (logical_fallacy_name) VALUES (?)',
        [(logical_fallacy,) for logical_fallacy in logical_fallacies if logical_fallacy not in stored_fallacies],
    )

    # Commit changes
    conn.commit()
finally:
    # Close the connection even when one of the statements fails.
    conn.close()

In [15]:
# Sanity check: print the stored id of every distinct parsed method name.
conn = sqlite3.connect('sample_database.db')
distinct_method_names = {pair[0] for parsed in all_parsed_manipulation_methods for pair in parsed}
for method_name in distinct_method_names:
    cursor = conn.cursor()
    id_row = cursor.execute('''
                SELECT id FROM manipulation_methods
                WHERE manipulation_method_name = ?
            ''', (method_name,)).fetchone()
    print(id_row[0], end = ' ')
conn.close()

37 34 91 9 88 68 72 1 65 51 45 49 14 97 42 48 33 23 66 61 59 98 2 31 12 57 35 58 85 46 47 28 70 94 67 41 55 15 83 24 25 53 30 40 56 99 93 26 38 22 79 20 62 64 84 3 43 18 8 39 17 52 90 60 77 11 63 86 29 32 92 73 27 75 13 50 21 80 5 82 89 76 54 96 7 81 95 87 16 71 19 6 74 44 10 36 78 69 4 

In [16]:
# db_records[k] and all_parsed_manipulation_methods[k] must describe the same
# text; if the lists differ in length the associations below would be wrong.
if len(db_records) != len(all_parsed_manipulation_methods):
    raise ValueError(
        'db_records and all_parsed_manipulation_methods are misaligned: '
        f'{len(db_records)} records vs {len(all_parsed_manipulation_methods)} parsed lists'
    )

conn = sqlite3.connect('sample_database.db')
# Messages of database errors hit while linking (e.g. UNIQUE violations when
# the same method is listed twice for one text).
errors = []
try:
    cursor = conn.cursor()
    for i, record in enumerate(db_records):
        current_manips = all_parsed_manipulation_methods[i]
        cursor.execute('''
                SELECT id FROM texts
                WHERE text = ?
            ''', (record['text'],))
        text_row = cursor.fetchone()
        if text_row is None:
            # Without the row there is nothing to link to; fail with a clear
            # message instead of a TypeError on None.
            raise LookupError(f'record {i} has no matching row in the texts table')
        text_id = text_row[0]

        # Associate manipulation methods with the text in the
        # 'text_2_manipulation_methods' table
        for method in current_manips:
            try:
                cursor.execute('''
                    INSERT INTO text_2_manipulation_methods (text_id, manipulation_method_id, manipulation_method_explanation)
                    VALUES (?, (SELECT id FROM manipulation_methods WHERE manipulation_method_name = ?), ?)
                ''', (text_id, method[0], method[1]))
            except sqlite3.Error as e:
                # Only database errors are collected; programming errors
                # (bad indexes, wrong types) are allowed to surface.
                errors.append(str(e))
    print(len(errors), len(set(errors)))
    # Commit changes
    conn.commit()
finally:
    # Close the connection even when linking fails.
    conn.close()

291 1


In [17]:
# Show the distinct error messages collected in the errors list.
set(errors)

{'UNIQUE constraint failed: text_2_manipulation_methods.text_id, text_2_manipulation_methods.manipulation_method_id'}

In [19]:
# Load every row of the junction table and report how many there are.
conn = sqlite3.connect('sample_database.db')
cursor = conn.cursor()
records = cursor.execute('''
    SELECT * FROM text_2_manipulation_methods
''').fetchall()
print(len(records))
conn.close()

4327


In [20]:
# First fetched row of text_2_manipulation_methods; its columns are
# (text_id, manipulation_method_id, manipulation_method_explanation).
records[0]

(2,
 31,
 'В тексте упоминается только одна сторона - контролируемая Республиканской партией палата представителей США. Это создает впечатление, что других мнений или точек зрения нет.')

In [21]:
# Fetch the full 'texts' row for one id and display it.
text_id = 2
conn = sqlite3.connect('sample_database.db')
cursor = conn.cursor()
text_details = cursor.execute('''
        SELECT * FROM texts
        WHERE id = ?
    ''', (text_id,)).fetchone()
conn.close()
text_details

(2,
 'Контролируемая Республиканской партией палата представителей США представила собственный проект финансирования правительства на 45 дней, не включающего помощь Украине, сообщил конгрессмен Коул. Голосование по нему может состояться уже в ближайшее время.',
 'rian_ru',
 'новость',
 '2023-09-30 18:42:35',
 'https://t.me/rian_ru/216889',
 'telegram',
 '',
 '',
 'Источники: не обнаружены.',
 0,
 0,
 1,
 'В данном тексте нет дискриминирующей, стигматизирующей или провоцирующей ненависть лексики, высказываний или идей.')

In [22]:
# Print the names of all manipulation methods linked to text_details.
conn = sqlite3.connect('sample_database.db')
try:
    cursor = conn.cursor()
    cursor.execute('''
        SELECT * FROM text_2_manipulation_methods 
        WHERE text_id = ?
    ''', (text_details[0],))

    # Junction rows for this text: (text_id, manipulation_method_id, explanation)
    records = cursor.fetchall()
    manip_ids = [r[1] for r in records]
    manip_labels = []
    for manip_id in manip_ids:
        cursor.execute('''
            SELECT * FROM manipulation_methods 
            WHERE id = ?
        ''', (manip_id,))

        # Use a separate name so the junction rows in `records` are not
        # overwritten by the lookup result.
        method_rows = cursor.fetchall()
        manip_labels.extend([r[1] for r in method_rows])
    print(', '.join(sorted(set(manip_labels))))
finally:
    # Close the connection even when a query fails.
    conn.close()

Влияние на эмоции и ценности, Использование выборочной информации, Манипуляция через акцентирование внимания на последствиях для личности/группы, Некорректное обобщение, Отсутствие контекста, Отсутствие противоположной точки зрения, Подмена понятий
