In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import matplotlib.pyplot as plt
import spacy
from wordcloud import WordCloud
import seaborn as sns

In [None]:
# Crawl the "international" category listing pages and collect the links of
# articles whose URL slug contains one of the Turkey/election keywords.
tpp_turkel_urls = []
base_url = "https://thepressproject.gr/category/international/page/"

# Transliterated (Greeklish) slug fragments as they appear in article URLs.
keywords = ["erntogan", "ntemirtas", "tourkia", "tourkikes", "tourkoi", "tourkous", "kilitsntaroglou"]

for page_num in range(200, 240):
    url = base_url + str(page_num)
    try:
        # A timeout prevents a stalled connection from hanging the cell forever.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as exc:
        # One unreachable page should not abort the whole crawl.
        print(f"Failed to retrieve the page {page_num}. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page {page_num}. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('div', {"class": "col-md-8 archive-item"})

    print(f"Page {page_num}: Found {len(articles)} articles")

    for article in articles:
        # Skip teaser blocks without a usable link instead of raising
        # TypeError/KeyError on a missing <a> tag or href attribute.
        link = article.find('a', href=True)
        if link is None:
            continue
        article_url = link['href']
        if any(keyword in article_url for keyword in keywords):
            tpp_turkel_urls.append(article_url)

print("Collected URLs:", tpp_turkel_urls)
print("Total URLs collected:", len(tpp_turkel_urls))


In [None]:
# Wrap the collected URLs in a one-column DataFrame. Naming the column
# explicitly guarantees that column 0 exists even when no URL was collected,
# so the later `tpp_turkel_urls[0]` lookup does not raise KeyError.
tpp_turkel_urls = pd.DataFrame(tpp_turkel_urls, columns=[0])

In [None]:
# Download every collected article and extract title, date, lead and body
# text into one record per URL. Failed downloads produce an 'Error' record so
# the output keeps one row per input URL.
full_articles_list = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

total_urls = len(tpp_turkel_urls[0])

for index, processed_url in enumerate(tpp_turkel_urls[0]):
    full_article_dict = {}
    try:
        # Timeout: never hang on a stalled connection.
        response = requests.get(processed_url, headers=headers, timeout=30)
        # Treat HTTP error pages (404, 500, ...) as failures instead of
        # parsing them as if they were articles.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        full_article_dict['site'] = "thepressproject.gr"
        full_article_dict['url'] = processed_url

        title_tag = soup.find("h1", {"class": "entry-title"})
        title = title_tag.get_text(strip=True) if title_tag else 'No Title Found'
        full_article_dict['title'] = title

        # The date text is kept exactly as before because the cleaning cells
        # further down depend on its format.
        datetime_tag = soup.find("div", {"class": "article-date"})
        published_text = datetime_tag.get_text(strip=True) if datetime_tag else 'No Datetime Found'
        full_article_dict['datetime'] = published_text

        # get_text(strip=True) strips every text fragment and joins them with
        # an empty separator, which glues words around inline tags together.
        # A single-space separator keeps words apart.
        article_lead_tag = soup.find("div", {"class": "subtitle article-summary"})
        article_lead = article_lead_tag.get_text(" ", strip=True) if article_lead_tag else 'No article lead Found'
        full_article_dict['article_lead'] = article_lead

        text = soup.find("div", {"class": "main-content article-content"})
        if text:
            parts = []
            # Only direct <p> and <h2> children contribute to the body text.
            for child in text.children:
                if child.name == 'p':
                    parts.append(child.get_text(" ", strip=True) + '\n')
                elif child.name == 'h2':
                    parts.append('\n' + child.get_text(" ", strip=True) + '\n')
            full_article_dict['text'] = ''.join(parts).strip()
        else:
            full_article_dict['text'] = 'No text found'

    except Exception as e:
        print(f"Error processing {processed_url}: {e}")
        full_article_dict = {
            'site': "thepressproject.gr",
            'url': processed_url,
            'title': 'Error',
            'datetime': 'Error',
            'article_lead': 'Error',
            'text': 'Error'
        }

    print(f"Processed {index + 1}/{total_urls}: {processed_url}")

    full_articles_list.append(full_article_dict)

    # Pause between requests.
    time.sleep(1)

tpp_turkel = pd.DataFrame(full_articles_list)

In [None]:
# Save the scraped articles to CSV without the index column.
# NOTE(review): absolute, machine-specific Windows path.
tpp_turkel.to_csv('C:\\Users\\stath\\tpp_turkish_elections.csv', index=False)


In [None]:
# Reload the articles from the same CSV path, replacing the in-memory frame.
tpp_turkel = pd.read_csv('C:\\Users\\stath\\tpp_turkish_elections.csv')

In [None]:
# Display the reloaded articles table.
tpp_turkel

In [None]:
# Strip the "Αναρτήθηκε:" label and the Greek weekday names from the raw date
# text so that only day, month, year and time remain.
# Removing the label everywhere also covers the case where it directly follows
# the year ("2023Αναρτήθηκε:"), so no separate replacement is needed for it.
datetime_noise = [
    "Αναρτήθηκε:",
    "Δευτέρα",
    "Τρίτη",
    "Τετάρτη",
    "Πέμπτη",
    "Παρασκευή",
    "Σάββατο",
    "Κυριακή",
]

for noise in datetime_noise:
    # regex=False: these are literal substrings; being explicit keeps the
    # behavior the same across pandas versions with different defaults.
    tpp_turkel["datetime"] = tpp_turkel["datetime"].str.replace(noise, "", regex=False)

In [None]:
# Flatten each article body to a single line. Newlines are replaced by a space
# (not an empty string) so the last word of one paragraph or heading is not
# fused with the first word of the next one.
tpp_turkel["text"] = tpp_turkel["text"].str.replace("\n", " ", regex=False)

In [None]:
def replace_month_name(date_str):
    """Replace a Greek (genitive) month name in a date string with its number.

    Parameters
    ----------
    date_str : str
        Date text such as "15 Μαΐου 2023 10:30:00".

    Returns
    -------
    str
        The text with the first matching month name replaced by its
        zero-padded two-digit number ("15 05 2023 10:30:00"). Text that
        contains none of the known month names is returned unchanged.
        Non-string input (e.g. None or NaN for a missing value) is also
        returned unchanged instead of raising TypeError.
    """
    # Missing values arrive as float NaN / None; `in` would raise on them.
    if not isinstance(date_str, str):
        return date_str

    months = {
        "Ιανουαρίου": "01",
        "Φεβρουαρίου": "02",
        "Μαρτίου": "03",
        "Απριλίου": "04",
        "Μαΐου": "05",
        "Ιουνίου": "06",
        "Ιουλίου": "07",
        "Αυγούστου": "08",
        "Σεπτεμβρίου": "09",
        "Οκτωβρίου": "10",
        "Νοεμβρίου": "11",
        "Δεκεμβρίου": "12"
    }
    # Only the first month name found (in dictionary order) is replaced.
    for greek_month, month_num in months.items():
        if greek_month in date_str:
            return date_str.replace(greek_month, month_num)
    return date_str


# Swap Greek month names for numbers and trim surrounding whitespace.
# Non-string values (NaN for a missing date, or Timestamps when this cell is
# re-run after the conversion below) are passed through untouched instead of
# raising inside the lambda.
tpp_turkel['datetime'] = tpp_turkel['datetime'].apply(
    lambda x: replace_month_name(x).strip() if isinstance(x, str) else x
)
print("After replacement:")
print(tpp_turkel['datetime'])

# Parse "day month year hour:minute:second"; values that do not match the
# format become NaT (errors='coerce') and are listed below for inspection.
tpp_turkel['datetime'] = pd.to_datetime(tpp_turkel['datetime'], format='%d %m %Y %H:%M:%S', errors='coerce')

print("After conversion to datetime:")
print(tpp_turkel['datetime'])


failed_conversions = tpp_turkel[tpp_turkel['datetime'].isna()]
print("Failed conversions:")
print(failed_conversions)


In [None]:
# Placeholder strings written by the scraping cell when a field was missing or
# the download failed. They are not article content and must not enter the
# corpus used for the word cloud and n-gram counts.
placeholder_values = ['No article lead Found', 'No text found', 'Error']

tpp_turkel['article_lead'] = tpp_turkel['article_lead'].fillna('')
tpp_turkel['text'] = tpp_turkel['text'].fillna('')

# Blank out placeholders only for the combined column; the original
# 'article_lead' and 'text' columns keep their values.
lead_for_corpus = tpp_turkel['article_lead'].mask(tpp_turkel['article_lead'].isin(placeholder_values), '')
text_for_corpus = tpp_turkel['text'].mask(tpp_turkel['text'].isin(placeholder_values), '')

tpp_turkel['full_content'] = lead_for_corpus + ' ' + text_for_corpus




In [None]:
# Display the table again, now including the 'full_content' column.
tpp_turkel

In [None]:
# Title of the article stored under index label 3.
tpp_turkel.loc[3, "title"]

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
import spacy
# Show the installed spaCy version.
print(spacy.__version__)

In [None]:
import seaborn as sns

In [None]:
# Load the spaCy pipeline 'el_core_news_md' (Greek); the model package must be
# installed in the kernel's environment.
nlp = spacy.load('el_core_news_md')

In [None]:
# Join the 'full_content' of all articles into one string, separated by single spaces.
tpp_turkel_full_text = tpp_turkel['full_content'].str.cat(sep = ' ')

In [None]:
# Character count of the combined corpus.
print(len(tpp_turkel_full_text))

In [None]:
# Raise spaCy's input-length limit so the whole corpus can be processed as one
# document. The limit is never set below the previous fixed value of 2,000,000
# characters, and grows with the corpus if the text is longer than that.
nlp.max_length = max(2000000, len(tpp_turkel_full_text) + 1)

In [None]:
# Run the spaCy pipeline over the whole corpus as a single document.
tpp_turkel_full_doc = nlp(tpp_turkel_full_text)

In [None]:
from joblib import dump, load

In [None]:
# Persist the processed document with joblib to a file in the working directory.
# NOTE(review): `load` is imported but no visible cell reads this file back.
dump(tpp_turkel_full_doc, 'tpp_turkel_full_doc.joblib')

In [None]:
# Space-separated lemmas of every token in the processed corpus.
lemmatized_text = ' '.join(map(lambda tok: tok.lemma_, tpp_turkel_full_doc))

In [None]:
# Correct the form "Κεμάλς Κιλιτσντάρογλου" (presumably a lemmatizer artefact)
# before building the cloud.
lemmatized_text = lemmatized_text.replace("Κεμάλς Κιλιτσντάρογλου", "Κεμάλ Κιλιτσντάρογλου")

# Configure the cloud first, then feed it the lemmatized corpus;
# generate() fills the same WordCloud object in place.
wordcloud_tpp_turkel = WordCloud(
    background_color='black',
    width=2000,
    height=1000,
    stopwords=nlp.Defaults.stop_words,
)
wordcloud_tpp_turkel.generate(lemmatized_text)

# Large black canvas with the cloud drawn edge to edge and no axes.
fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')
fig.gca().imshow(wordcloud_tpp_turkel, interpolation='bilinear')
fig.gca().axis('off')
fig.tight_layout(pad=0)
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Vectorizer with spaCy's stop words and document-frequency bounds (min_df=0.01, max_df=0.95).
# NOTE(review): `cv` is reassigned in the next cell before this instance is ever fitted.
cv = CountVectorizer(stop_words= list(nlp.Defaults.stop_words), min_df=0.01, max_df=0.95)

In [None]:
# Count 2- to 5-word n-grams per article, keeping a vocabulary of at most 50
# n-grams and ignoring spaCy's stop words.
cv = CountVectorizer(
    ngram_range=(2, 5),
    max_features=50,
    stop_words=list(nlp.Defaults.stop_words),
)
count_vector = cv.fit_transform(tpp_turkel["full_content"])

# One row per article, one column per n-gram.
tpp_turkel_bigrams = pd.DataFrame(
    data=count_vector.toarray(),
    columns=cv.get_feature_names_out(),
)

In [None]:
# Total occurrences of each n-gram across all articles, most frequent first.
tpp_turkel_bigrams.sum().sort_values(ascending=False)

In [None]:
# N-gram counts entered by hand (presumably transcribed from the totals
# computed earlier in the notebook — TODO confirm).
data = {
    "Ταγίπ Ερντογάν": 19,
    "Κεμάλ Κιλιτσντάρογλου": 16,
    "ρετζέπ ταγίπ ερντογάν": 14,
    "δεύτερος γύρος/δεύτερο γύρο": 13,
    "απεργία πείνας": 9,
    "Κωνσταντίνα Καρτσιώτη": 7,
    "Σινάν Ογκάν": 5,
    "προεδρικές εκλογές": 5,
    "εκατομμύρια άνθρωποι έχασαν ζωή": 5,
    "προεδρικών εκλογών": 5,
    "Πρόεδρος Τουρκίας": 5,
}

# Two-column frame (n-gram, mentions), ordered from most to least mentioned.
df = pd.DataFrame({
    'Διγράμματα / Πολυγράμματα': list(data.keys()),
    'Αναφορές': list(data.values()),
})
df = df.sort_values(by='Αναφορές', ascending=False)

plt.figure(figsize=(10, 8))
bars = plt.barh(
    df['Διγράμματα / Πολυγράμματα'],
    df['Αναφορές'],
    color='lightblue',
    edgecolor='black',
)

# Write each bar's value just to the right of the bar, vertically centred.
for bar in bars:
    plt.text(
        x=bar.get_width() + 2,
        y=bar.get_y() + bar.get_height() / 2,
        s=str(bar.get_width()),
        ha='left',
        va='center',
        fontsize=10,
        color='black',
    )

plt.title('Διγράμματα/Πολυγράμματα Τουρκικών Εκλογών 2023 - The Press Project')
plt.xlabel('Αναφορές')
plt.ylabel('Διγράμματα / Πολυγράμματα')
plt.xlim(0, 25)
# Flip the y-axis so the most frequent n-gram is at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
import geopandas as gpd
from matplotlib.patches import Patch

# Draw Greece and Turkey from the Natural Earth country shapefile and overlay
# the recurring headline themes for each country as text.
# NOTE(review): absolute, machine-specific path to the shapefile.
shapefile_path = r"C:\Users\stath\OneDrive\Υπολογιστής\ne_110m_admin_0_countries\ne_110m_admin_0_countries.shp"

world = gpd.read_file(shapefile_path)

greece_turkey = world[world["SOVEREIGNT"].isin(['Greece', 'Turkey'])]

fig, ax = plt.subplots(figsize=(20, 10))  # Increased size

# Fill colour per country; reused below to build the legend entries.
country_colors = {'Greece': 'blue', 'Turkey': 'red'}
for country, fill_color in country_colors.items():
    greece_turkey[greece_turkey["SOVEREIGNT"] == country].plot(ax=ax, color=fill_color, edgecolor='black')

ax.set_title('Τίτλοι δημοσιευμάτων για Ελλάδα(Μητσοτάκη-ΝΔ, Τσίπρα-ΣΥΡΙΖΑ), Τουρκία(Ερντογάν-Κιλιτσντάρογλου) - The Press Project', fontsize=14)

ax.grid(True, linestyle='--', alpha=0.7)

# Anchor points (longitude, latitude) for the first text line of each country.
greece_coords = (19.0, 37.5)
turkey_coords = (33.0, 37.5)

# Phrases for Greece
greece_phrases = [
    "σκάνδαλο υποκλοπών",
    "Τσίπρας: (απόδοση δηλώσεων)",
    "Μητσοτάκης: (απόδοση δηλώσεων)",
    "δημοσκοπήσεις: Μεθοδευμένες ερωτήσεις, αποκλίσεις αποτελεσμάτων"
]

# Phrases for Turkey
turkey_phrases = [
    "απεργία πείνας κρατουμένων",
    "προεκλογικό δώρο ΗΠΑ",
    "ακραία πόλωση εκλογικού σώματος"
]

# Stack the phrases upwards from the anchor point, 0.7 degrees apart.
for i, phrase in enumerate(greece_phrases):
    ax.text(greece_coords[0], greece_coords[1] + i*0.7, phrase, fontsize=11, color='black', ha='left', weight='bold')

for i, phrase in enumerate(turkey_phrases):
    ax.text(turkey_coords[0], turkey_coords[1] + i*0.7, phrase, fontsize=14, color='black', ha='center', weight='bold')

# Polygon layers drawn by GeoDataFrame.plot are patch collections, for which
# matplotlib has no automatic legend handler, so a `label=` on those calls
# yields an empty legend. Explicit proxy patches give one entry per country.
legend_handles = [
    Patch(facecolor=fill_color, edgecolor='black', label=country)
    for country, fill_color in country_colors.items()
]
ax.legend(handles=legend_handles)

plt.show()