In [None]:
import time
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import spacy
from bs4 import BeautifulSoup
from wordcloud import WordCloud

In [None]:
def process_url(url, timeout=10):
    """Collect the article links found on one listing page.

    Args:
        url: Address of the listing (tag) page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute article URLs, one per ``<article>`` element that
        contains a link. An empty list is returned when the page cannot be
        downloaded or does not answer with HTTP 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a network failure like a bad status code: report it and keep
        # the crawl going instead of aborting on one unreachable page.
        print(f"Failed to retrieve page {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    article_urls = []
    for article in soup.find_all("article"):
        # href=True skips anchors without a target; an <article> with no
        # usable link is ignored rather than raising TypeError/KeyError.
        link = article.find("a", href=True)
        href = link.get("href") if link is not None else None
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and resolves site-relative
        # and protocol-relative links against the site root.
        article_urls.append(urljoin('https://www.iefimerida.gr', href))

    return article_urls

# Listing pages of the tag are numbered; both ends of the range are inclusive.
base_url = 'https://www.iefimerida.gr/tag/ekloges-toyrkia?page='
start_page = 1
end_page = 11

# Visit every listing page in order and pool the article links it yields.
all_article_urls = []
for page_num in range(start_page, end_page + 1):
    print(f"Processing page: {page_num}")
    url = f"{base_url}{page_num}"
    article_urls = process_url(url)
    all_article_urls += article_urls

print(f"Total articles found: {len(all_article_urls)}")

# Show the first few links as a quick visual check of the crawl.
print("Example article URLs:")
for idx, article_url in enumerate(all_article_urls[:5], 1):
    print(f"{idx}. {article_url}")

In [None]:
# Sanity check: how many article URLs the crawl above collected.
len(all_article_urls)

In [None]:
# One-column frame of URLs; no column names are given, so the column is labelled 0.
iefimerida_turkel_urls = pd.DataFrame(all_article_urls)

In [None]:
# The trailing semicolon suppresses the notebook's display of the frame.
iefimerida_turkel_urls;

In [None]:
# NOTE(review): absolute, machine-specific Windows path - the notebook will not
# run elsewhere without editing it; consider a configurable data directory.
iefimerida_turkel_urls.to_csv('C:\\Users\\stath\\iefimerida_turkel_urls.csv', index=False)


In [None]:
# Download every collected article and extract title, category, publication
# time and body text into one record per URL.
full_articles_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Computed once: the progress message below needs it on every iteration.
total_urls = len(iefimerida_turkel_urls[0])

for index, processed_url in enumerate(iefimerida_turkel_urls[0]):
    full_article_dict = {}
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status() routes HTTP error pages to the 'Error' record
        # instead of parsing them as if they were articles.
        response = requests.get(processed_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        full_article_dict['site'] = "iefimerida.gr"
        full_article_dict['url'] = processed_url

        title_tag = soup.find("h1", {"class": "f-big w-bold"})
        title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'
        full_article_dict['title'] = title

        category_tag = soup.find("div", {"class": "details f-details-alt c-red"})
        category = category_tag.get_text(strip=True) if category_tag else 'No Category Found'
        full_article_dict['category'] = category

        datetime_div = soup.find("div", {"class": "f-details c-black w-regular details-below"})
        if datetime_div:
            publication_time_tag = datetime_div.find("time")
            publication_time = publication_time_tag.get_text(strip=True) if publication_time_tag else 'No Publication Time Found'
        else:
            publication_time = 'No Publication Time Found'
        full_article_dict['publication_time'] = publication_time

        # Only direct <p> and <h2> children of the body container are kept.
        main_content = soup.find("div", {"class": "field--name-body"})
        content = ''
        if main_content:
            for child in main_content.children:
                # The ' ' separator matters: with strip=True and no separator,
                # text on either side of inline tags (<a>, <strong>, ...) is
                # glued together into a single pseudo-word.
                if child.name == 'p':
                    content += child.get_text(' ', strip=True) + '\n'
                elif child.name == 'h2':
                    content += '\n' + child.get_text(' ', strip=True) + '\n'
        full_article_dict['content'] = content.strip()

    except Exception as e:
        # Best effort: keep a placeholder row so one failing article does not
        # abort the run and the output still has one row per URL.
        print(f"Error processing {processed_url}: {e}")
        full_article_dict = {
            'site': "iefimerida.gr",
            'url': processed_url,
            'title': 'Error',
            'category': 'Error',
            'publication_time': 'Error',
            'content': 'Error'
        }

    print(f"Processed {index + 1}/{total_urls}: {processed_url}")

    full_articles_list.append(full_article_dict)

    # Pause between requests to avoid hammering the site.
    time.sleep(1.5)

# Built once after the loop (it used to be rebuilt on every iteration). The
# explicit column list keeps the schema stable even when no row was scraped.
iefimerida_turkel = pd.DataFrame(
    full_articles_list,
    columns=['site', 'url', 'title', 'category', 'publication_time', 'content'],
)

In [None]:
iefimerida_turkel.rename(columns={
    'content': 'text',
    'publication_time': 'datetime'
}, inplace=True)

In [None]:
iefimerida_turkel.to_csv('C:\\Users\\stath\\iefimerida_turkish_elections.csv', index=False)

In [None]:
iefimerida_turkel = pd.read_csv('C:\\Users\\stath\\iefimerida_turkish_elections.csv')

In [None]:
iefimerida_turkel.shape

In [None]:
pd.set_option('display.max_rows', 200)

In [None]:
short_df = iefimerida_turkel[["title", "datetime"]]

In [None]:
pd.set_option('display.max_colwidth', None)

In [None]:
short_df

In [None]:
# Normalise whitespace in the article bodies.
# - Line breaks and non-breaking spaces become a plain space; deleting them
#   (as before) glued the words on either side together and corrupted tokens
#   and n-grams.
# - Articles with no body come back from the CSV as NaN; they are turned into
#   empty strings so the vectorizers below do not reject them.
iefimerida_turkel["text"] = (
    iefimerida_turkel["text"]
    .fillna("")
    .str.replace("\n", " ", regex=False)
    .str.replace("\xa0", " ", regex=False)
    .str.replace(r"\s+", " ", regex=True)  # collapse the runs created above
    .str.strip()
)

In [None]:
nlp = spacy.load('el_core_news_md')

In [None]:
iefimerida_turkel_full_text = iefimerida_turkel['text'].str.cat(sep = ' ')

In [None]:
iefimerida_turkel_full_doc = nlp(iefimerida_turkel_full_text)

In [None]:
lemmatized_text = ' '.join(token.lemma_ for token in iefimerida_turkel_full_doc)

In [None]:
# `stopwords` is the pipeline's own stop-word set, not a copy, so the additions
# below are visible wherever nlp.Defaults.stop_words is used later on.
stopwords = nlp.Defaults.stop_words
stopwords.update([
    "ς",
    "μπορώ",
    "αναφέρω",
    "υπάρχω",
    "γίνομαι",
    "ή",
    "κάνω",
    "θέλω",
    "κ",
    "λέγω",
])

In [None]:
# Repair one lemma artefact in the candidate's name before counting words.
lemmatized_text = lemmatized_text.replace("Κεμάλς Κιλιτσντάρογλου", "Κεμάλ Κιλιτσντάρογλου")

# Word cloud over the lemmatised corpus, with the extended stop-word set removed.
wordcloud_settings = dict(
    stopwords=nlp.Defaults.stop_words,
    width=2000,
    height=1000,
    background_color='black',
)
wordcloud_iefimerida_turkel = WordCloud(**wordcloud_settings).generate(lemmatized_text)

# Large black canvas with no axes so the cloud fills the whole figure.
fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
plt.imshow(wordcloud_iefimerida_turkel, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(stop_words= list(nlp.Defaults.stop_words), min_df=0.01, max_df=0.95)

In [None]:
count_vector = cv.fit_transform(iefimerida_turkel["text"])

In [None]:
cv = CountVectorizer(stop_words= list(nlp.Defaults.stop_words), max_features=40, ngram_range=(2,8))
count_vector = cv.fit_transform(iefimerida_turkel["text"])
iefimerida_turkel_bigrams = pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names_out()) 

In [None]:
iefimerida_turkel_bigrams.sum(axis =0).sort_values(ascending = False)

In [None]:
# Phrase -> number of mentions, entered by hand for the bar chart below.
# NOTE(review): presumably transcribed from the n-gram totals computed above
# with the labels re-capitalised manually - confirm before reusing.
data = {
    "Tαγίπ Eρντογάν": 116,
    "Κεμάλ Κιλιτσντάρογλου": 92,
    "Ρετζέπ Ταγίπ Ερντογάν": 92,
    "δεύτερο γύρο": 57,
    "προεδρικές εκλογές": 37,
    "Τούρκος Πρόεδρος": 35,
    "14ης μαΐου": 32,
    "εκλογές Τουρκία": 30,
    "προεδρικών εκλογών": 22,
    "14 μαΐου": 22,
    "βουλευτικές εκλογές": 22,
    "τουρκικές εκλογές": 20,
    "28 μαΐου": 19,
    "εξωτερική πολιτική": 18,
    "σινάν ογάν": 16,
    "μουχαρέμ ιντζέ": 16,
    "εκλογές 14ης": 16,
    "δικαιοσύνης ανάπτυξης": 16,
    "ρεπουμπλικανικού λαϊκού κόμματος": 15,
    "ΑΠΕ ΜΠΕ": 15,
    "Ερντογάν Κιλιτσντάρογλου": 15,
    "εκλογές 14ης μαΐου": 15,
    "γύρο προεδρικών": 15,
    "μεγάλο βαθμό": 14,
    "πηγή ΑΠΕ ΜΠΕ": 14,
    "γύρο προεδρικών εκλογών": 14,
    "σινάν ογκάν": 14,
    "συμμαχίας αντιπολίτευσης": 13,
    "Τούρκου Προέδρου": 13,
    "γύρο εκλογών": 13,
    "κόμματος CHP": 13,
    "κόμμα δικαιοσύνης ανάπτυξης": 13,
}

# Horizontal bar chart of the phrase counts, most frequent phrase on top.
df = pd.DataFrame(list(data.items()), columns=['Διγράμματα / Πολυγράμματα', 'Αναφορές'])

df = df.sort_values(by='Αναφορές', ascending=False)

plt.figure(figsize=(10, 8))
# Draw the bars once. An identical plt.barh call used to precede this one, so
# every bar was painted twice on the same axes.
bars = plt.barh(df['Διγράμματα / Πολυγράμματα'], df['Αναφορές'], color='black', edgecolor= 'black')
for bar in bars:
    # Put the count just past the end of each bar, vertically centred on it.
    plt.text(
        bar.get_width() + 2, bar.get_y() + bar.get_height() / 2,
        str(bar.get_width()),
        va='center',
        ha='left',
        color='black',
        fontsize=10
    )

plt.xlabel('Αναφορές')
plt.ylabel('Διγράμματα / Πολυγράμματα')
# Leave room to the right of the longest bar (116) for its label.
plt.xlim(0,130)
plt.title('Διγράμματα/Πολυγράμματα Τουρκικών Εκλογών 2023 - Iefimerida')
# barh plots the first row at the bottom; invert so the largest count is on top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
filepath = "https://raw.githubusercontent.com/datajour-gr/DataJournalism/main/Bachelor%20Lessons%202023/Lesson%2010/NRC_GREEK_Translated_6_2020.csv"

In [None]:
emolex_df = pd.read_csv(filepath)

In [None]:
emolex_df = emolex_df.drop_duplicates(subset=['word'])
emolex_df = emolex_df.dropna()
emolex_df.reset_index(inplace = True, drop=True)

In [None]:
vec = CountVectorizer(analyzer = 'word', vocabulary = emolex_df.word,
                      lowercase=False, 
                      strip_accents = 'unicode',  
                      stop_words= list(nlp.Defaults.stop_words),
                      ngram_range=(1, 2))

In [None]:
matrix = vec.fit_transform(iefimerida_turkel["text"])
vocab = vec.get_feature_names_out()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)

In [None]:
positive_words = emolex_df[emolex_df.Positive == 1]['word']

In [None]:
negative_words = emolex_df[emolex_df.Negative == 1]['word']

In [None]:
iefimerida_turkel['positive_text'] = wordcount_df[positive_words].sum(axis=1)

In [None]:
iefimerida_turkel['negative_text'] = wordcount_df[negative_words].sum(axis=1)

In [None]:
iefimerida_turkel['pos/neg_text'] = iefimerida_turkel['positive_text'] - iefimerida_turkel['negative_text']

In [None]:
iefimerida_turkel['pos/neg_text'].mean()

In [None]:
joy_words = emolex_df[emolex_df.Joy == 1]['word']

In [None]:
anger_words = emolex_df[emolex_df.Anger == 1]['word']

In [None]:
iefimerida_turkel['joy_text'] = wordcount_df[joy_words].sum(axis=1)

In [None]:
iefimerida_turkel['anger_text'] =  wordcount_df[anger_words].sum(axis=1)

In [None]:
iefimerida_turkel['joy/anger_text'] = iefimerida_turkel['joy_text'] - iefimerida_turkel['anger_text']

In [None]:
iefimerida_turkel['joy/anger_text'].mean()

In [None]:
fear_words = emolex_df[emolex_df.Fear == 1]['word']

In [None]:
trust_words = emolex_df[emolex_df.Trust == 1]['word']

In [None]:
iefimerida_turkel['fear_text'] = wordcount_df[fear_words].sum(axis=1)

In [None]:
iefimerida_turkel['trust_text'] = wordcount_df[trust_words].sum(axis=1)

In [None]:
iefimerida_turkel['trust/fear_text'] = iefimerida_turkel['trust_text'] - iefimerida_turkel['fear_text']

In [None]:
iefimerida_turkel['trust/fear_text'].mean()

In [None]:
matrix = vec.fit_transform(iefimerida_turkel["title"])
vocab = vec.get_feature_names_out()
wordcount_df = pd.DataFrame(matrix.toarray(), columns=vocab)

In [None]:
iefimerida_turkel['positive_title'] = wordcount_df[positive_words].sum(axis=1)

In [None]:
iefimerida_turkel['negative_title'] = wordcount_df[negative_words].sum(axis=1)

In [None]:
iefimerida_turkel['pos/neg_title'] = iefimerida_turkel['positive_title'] - iefimerida_turkel['negative_title']

In [None]:
iefimerida_turkel['pos/neg_title'].mean()

In [None]:
iefimerida_turkel['joy_title'] = wordcount_df[joy_words].sum(axis=1)

In [None]:
iefimerida_turkel['anger_title'] =  wordcount_df[anger_words].sum(axis=1)

In [None]:
iefimerida_turkel['joy/anger_title'] = iefimerida_turkel['joy_title'] - iefimerida_turkel['anger_title']

In [None]:
iefimerida_turkel['joy/anger_title'].mean()

In [None]:
iefimerida_turkel['fear_title'] = wordcount_df[fear_words].sum(axis=1)

In [None]:
iefimerida_turkel['trust_title'] = wordcount_df[trust_words].sum(axis=1)

In [None]:
iefimerida_turkel['trust/fear_title'] = iefimerida_turkel['trust_title'] - iefimerida_turkel['fear_title']

In [None]:
iefimerida_turkel['trust/fear_title'].mean()

In [None]:
# Row label -> stem of the score columns ('<stem>_text' / '<stem>_title').
polarity_stems = {
    "Θετικότητα/Αρνητικότητα": 'pos/neg',
    'Εμπιστοσύνη/Φόβος': 'trust/fear',
    'Χαρά/Θυμός': 'joy/anger',
}

# Mean net score per measure, for bodies and headlines, rounded to two decimals.
means = {}
for label, stem in polarity_stems.items():
    means[label] = {
        'Πολικότητα Κειμένων': iefimerida_turkel[stem + '_text'].mean().round(2),
        'Πολικότητα Τίτλων': iefimerida_turkel[stem + '_title'].mean().round(2),
    }

# Transpose so each measure is a row and text/title are the columns.
means_df = pd.DataFrame(means).T

# Background colour for each measure's row.
color_map = {
    "Θετικότητα/Αρνητικότητα": 'green',
    'Εμπιστοσύνη/Φόβος': 'lightblue',
    'Χαρά/Θυμός': 'crimson'
}

new_col_titles = ['Πολικότητα Κειμένων', 'Πολικότητα Τίτλων']

# The figure only hosts the table, so axes and frame are hidden.
fig, ax = plt.subplots(figsize=(14, 8))
ax.xaxis.set_visible(False)
ax.yaxis.set_visible(False)
ax.set_frame_on(False)

table = ax.table(
    cellText=means_df.values,
    rowLabels=means_df.index,
    colLabels=new_col_titles,
    colColours=['lightgrey'] * len(new_col_titles),
    rowColours=['white'] * len(means_df.index),
    cellLoc='center',
    loc='center',
)

# Table row 0 is the header, so data rows start at 1.
for row_idx, label in enumerate(means_df.index, start=1):
    fill = color_map.get(label, 'white')  # white when the label has no colour
    for col_idx, _ in enumerate(new_col_titles):
        table[(row_idx, col_idx)].set_facecolor(fill)

# Fixed font size and enlarged cells for readability.
table.auto_set_font_size(False)
table.set_fontsize(14)
table.scale(1.5, 3)

plt.show()

In [None]:
titles_erdogan = iefimerida_turkel[iefimerida_turkel['title'].str.contains('Ερντογάν', case=False, na=False)]

In [None]:
titles_erdogan

In [None]:
iefimerida_turkel.shape

In [None]:
titles_Κιλιτσντάρογλου= iefimerida_turkel[iefimerida_turkel['title'].str.contains('Κιλιτσντάρογλου', case=False, na=False)]

In [None]:
titles_Κιλιτσντάρογλου.shape

In [None]:
titles_with_both = iefimerida_turkel[
    iefimerida_turkel['title'].str.contains(r'Ερντογάν.*Κιλιτσντάρογλου|Κιλιτσντάρογλου.*Ερντογάν', case=False, na=False)
]

titles_with_both.shape


In [None]:
pip install matplotlib-venn


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

# Data for the Venn diagram, taken from the filtered frames built above
# instead of hand-copied numbers, so the figure follows the data on re-runs.
count_both = len(titles_with_both)
count_erdogan_only = len(titles_erdogan) - count_both
count_kilicdaroglu_only = len(titles_Κιλιτσντάρογλου) - count_both
total_titles = len(iefimerida_turkel)

# Titles that mention neither candidate (computed for reference; not plotted)
count_neither = total_titles - (count_erdogan_only + count_kilicdaroglu_only + count_both)

# Create a Venn diagram
plt.figure(figsize=(10, 8))

# venn2 takes (only-A, only-B, both), in that order
venn = venn2(subsets=(count_erdogan_only,
                      count_kilicdaroglu_only,
                      count_both),
             set_labels=('Ερντογάν', 'Κιλιτσντάρογλου'))

# Add title
plt.title('Δημοσιεύματα για Ερντογάν-Κιλιτσντάρογλου - Iefimerida')

# Add text for the total number of titles
plt.text(0.5, -0.3, f'Σύνολο Άρθρων: {total_titles}', ha='center', va='center', fontsize=8, color='black')

# Display the Venn diagram
plt.show()


In [None]:
count_vector = cv.fit_transform(titles_erdogan["title"])

In [None]:
cv = CountVectorizer(stop_words= list(nlp.Defaults.stop_words), max_features=100, ngram_range=(2,2))
count_vector = cv.fit_transform(titles_erdogan["title"])
titles_erdogan_bigrams = pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names_out())

In [None]:
titles_erdogan_bigrams.sum(axis =0).sort_values(ascending = False)

In [None]:
count_vector = cv.fit_transform(titles_Κιλιτσντάρογλου["title"])

In [None]:
cv = CountVectorizer(stop_words= list(nlp.Defaults.stop_words), max_features=100, ngram_range=(2,2))
count_vector = cv.fit_transform(titles_Κιλιτσντάρογλου["title"])
titles_Κιλιτσντάρογλου_bigrams = pd.DataFrame(count_vector.toarray(), columns=cv.get_feature_names_out())

In [None]:
titles_Κιλιτσντάρογλου_bigrams.sum(axis =0).sort_values(ascending = False)

In [None]:
import geopandas as gpd
from matplotlib.patches import Patch

# NOTE(review): absolute, machine-specific path to the Natural Earth 110m
# countries shapefile - the cell will not run elsewhere without editing it.
shapefile_path = r"C:\Users\stath\OneDrive\Υπολογιστής\ne_110m_admin_0_countries\ne_110m_admin_0_countries.shp"

world = gpd.read_file(shapefile_path)

# Keep only the two countries being compared.
greece_turkey = world[world["SOVEREIGNT"].isin(['Greece', 'Turkey'])]

fig, ax = plt.subplots(figsize=(20, 10))  # Increased size

greece_turkey[greece_turkey["SOVEREIGNT"] == "Greece"].plot(ax=ax, color='blue', edgecolor='black')
greece_turkey[greece_turkey["SOVEREIGNT"] == "Turkey"].plot(ax=ax, color='red', edgecolor='black')

ax.set_title('Τίτλοι δημοσιευμάτων για Μητσοτάκη-Τσίπρα, Ερντογάν-Κιλιτσντάρογλου - Iefimerida', fontsize=17)

ax.grid(True, linestyle='--', alpha=0.7)

# Anchor points (longitude, latitude) for the text placed over each country.
greece_coords = (20.0, 38.5)
turkey_coords = (33.0, 37.5)

# Phrases for Greece
greece_phrases = [
    "Μητσοτάκης: (αυτούσια δήλωση)",

    "Τσίπρας: (αυτούσια δήλωση)"
]

# Phrases for Turkey
turkey_phrases = [
    "μονοκρατορία Eρντογάν",
    "oικονομική κατάρρευση",
    "πελατειακό κράτος",
    "Ερντογάν μοιράζει δώρα",
    "λάθη Κιλιτσντάρογλου/Ερντογάν"
]

# Stack the phrases upwards from the anchor point, 0.7 degrees apart.
for i, phrase in enumerate(greece_phrases):
    ax.text(greece_coords[0], greece_coords[1] + i*0.7, phrase, fontsize=14, color='black', ha='left', weight='bold')

for i, phrase in enumerate(turkey_phrases):
    ax.text(turkey_coords[0], turkey_coords[1] + i*0.7, phrase, fontsize=14, color='black', ha='center', weight='bold')

# geopandas draws polygons as patch collections, for which ax.legend() cannot
# build entries on its own (a label= on .plot() produced an empty legend).
# Explicit proxy patches give the legend its two entries.
ax.legend(handles=[
    Patch(facecolor='blue', edgecolor='black', label='Greece'),
    Patch(facecolor='red', edgecolor='black', label='Turkey'),
])

plt.show()

In [None]:
import re

# Keyword list for the "conflict" news frame. A title containing any of these
# terms is flagged as conflict-framed.
frames = {
    'Σύγκρουση': [
        'κόντρα', 'VS', 'μάχη', 'επίθεση', 'αντιπαράθεση', '«σφάχτηκαν»', '«επιτίθεται»', "ή", "δίλημμα",
        '«πυρά»', '«Πυρά»','πυρά', 'Πυρά', 'μπηχτή', 'μεταξύ', 'σύγκρουση', '«σκοτωμός»', 'απαντά', '«σπόντα»',
        '«μάχης»', 'μια πλευρά', 'Μητσοτάκη-Τσίπρα', 'απάντηση', 'καυγά', 'αίμα', 'απαντάμε',
        'διαφορά', 'έναντι', 'σύγκριση', 'κατά', 'εναντίον','προβάδισμα', 'Μητσοτάκης-Τσίπρας',
        # These were fused into single keywords before: a missing comma
        # concatenated the two party pairs, and the two candidate pairs sat in
        # one string separated by a comma, so none of them could ever match.
        'ΝΔ-ΣΥΡΙΖΑ', 'ΝΔ - ΣΥΡΙΖΑ',
        "Ερντογάν - Κιλιτσντάρογλου", "Ερντογάν-Κιλιτσντάρογλου",
        "ελληνοτουρκικά"
    ]
}

def detect_conflict_frames(title, frames):
    """Tell whether a title contains any conflict-frame keyword.

    Args:
        title: The headline to inspect. Anything that is not a non-empty
            string (for example NaN from a missing CSV value) counts as
            "no match" instead of raising.
        frames: Mapping that holds the keyword list under the key 'Σύγκρουση'.

    Returns:
        True when at least one keyword occurs in the title as a whole term
        (case-sensitive), False otherwise.
    """
    if not isinstance(title, str) or not title:
        return False

    conflict_keywords = [keyword for keyword in frames['Σύγκρουση'] if keyword]
    if not conflict_keywords:
        # An empty alternation would match at every position.
        return False

    # Lookarounds instead of \b: \b needs a word character on one side, so a
    # keyword wrapped in «...» could never match at the start of a title or
    # after a space. For keywords that start and end with a letter the two
    # forms behave the same.
    alternation = '|'.join(map(re.escape, conflict_keywords))
    pattern = r'(?<!\w)(?:' + alternation + r')(?!\w)'

    return bool(re.search(pattern, title))


# 1 when the headline matches any conflict keyword, 0 otherwise.
iefimerida_turkel['Conflict'] = iefimerida_turkel['title'].map(
    lambda title: int(detect_conflict_frames(title, frames))
)

In [None]:
# Displays the whole frame, now including the Conflict flag.
iefimerida_turkel 

In [None]:
# How many titles were flagged (1) versus not flagged (0).
iefimerida_turkel['Conflict'].value_counts()

In [None]:
iefimerida_turkel.shape

In [None]:
# NOTE(review): bare literal with no computation behind it - presumably the
# percentage of conflict-framed titles worked out by hand; confirm, or derive
# it from the Conflict column.
12.72