In [4]:
import requests
from bs4 import BeautifulSoup

# Main function to process multiple pages
def main():
    base_url = "https://www.almenrausch.at/touren/suchergebnisse/"
    total_pages = 107
    all_tour_info = []

    for page_num in range(1, total_pages + 1):
        page_url = base_url + str(page_num) + "/"
        print(f"Processing page: {page_num}")
        links = extract_links_from_page(page_url)
        for link in links:
            tour_info = extract_info_from_subpage(link)
            if tour_info:
                all_tour_info.append(tour_info)

    # Save the extracted information into a CSV file
    save_to_csv(all_tour_info)

# Function to save tour information to a CSV file
def save_to_csv(all_tour_info):
    with open('tour_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Tour Name', 'Description', 'Character', 'Tour Info', 'Housing Recommendations']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for tour_info in all_tour_info:
            writer.writerow({
                'Tour Name': tour_info[0],
                'Description': tour_info[1],
                'Character': tour_info[2],
                'Tour Info': tour_info[3],
                'Housing Recommendations': tour_info[4]
            })

if __name__ == "__main__":
    main()

# Function to extract links from a single page
def extract_links_from_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        drei_spalten_elements = soup.find_all(class_='dreiSpalten')
        links = []
        for element in drei_spalten_elements:
            a_tag = element.find('a')
            if a_tag and 'href' in a_tag.attrs:
                link = a_tag['href']
                full_url = "https://www.almenrausch.at" + link
                links.append(full_url)
        print(links)
        return links
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return []
    
#Function to extract information from a subpage
def extract_info_from_subpage(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the section containing the tour name
        tour_name_section = soup.find('div', id='detail-top')
        # Extract tour name
        tour_name = tour_name_section.find('h1').text.strip()
        
        # Find the main row element containing tour information
        row_element = soup.find(class_='singel-infowrap')
        
        # Extract description from the main row element
        description_paragraphs = row_element.find_all('p')
        description_paragraphs
        description = "\n".join([p.text.strip() for p in description_paragraphs if p.text.strip() != ""])

        charakter_element = soup.find(class_='charakter')
        character = charakter_element.text.strip()

        # Find the tripInfoWide section
        trip_info_wide = soup.find(class_='tripInfoWide')

        # Extract information from tripInfoWide
        tour_info = {}
        for tr in trip_info_wide.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) == 2:
                key = tds[0].text.strip().rstrip(':')
                value = tds[1].text.strip()
                tour_info[key] = value

        # Find the tripHousing section
        trip_housing = soup.find(class_='tripHousing')

        # Extract housing recommendations from tripHousing
        housing_recommendations = []
        overview_boxes = trip_housing.find_all(class_='overviewBox')
        for box in overview_boxes:
            housing_info = {}
            housing_info['Title'] = box.find('span', itemprop='headline').text.strip()
            housing_info['Description'] = box.find('div', itemprop='description').text.strip()
            housing_info['Link'] = "https://www.almenrausch.at" + box.find('a')['href']
            housing_recommendations.append(housing_info)

        return tour_name, description, character, tour_info, housing_recommendations
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None, None, None, {}, []

Processing page: 1
['https://www.almenrausch.at/touren/detail/weg-der-sinne-am-hochpillberg-in-tirol/', 'https://www.almenrausch.at/touren/detail/1-tag-luesens-westfalenhaus/', 'https://www.almenrausch.at/touren/detail/1-tag-mutterbergalm-hinterer-daunkopf-amberger-huette/', 'https://www.almenrausch.at/touren/detail/1tag-gries-winnebachseehuette-winnebacher-weisskogel-westfalenhaus/', 'https://www.almenrausch.at/touren/detail/1tag-luesens-hoher-seeblaskogel-winnebachseehuette/', 'https://www.almenrausch.at/touren/detail/1tag-mutterbergalm-hinterer-daunkopf-amberger-huette/', 'https://www.almenrausch.at/touren/detail/2-tag-amberger-huette-schrankarkopf-amberger-huette/', 'https://www.almenrausch.at/touren/detail/2-tag-westfalenhaus-winnebacher-weisskogel-gleirscher-fernerkogel-pforzheimerhuette/', 'https://www.almenrausch.at/touren/detail/2-almen-rundwanderung-von-rum/', 'https://www.almenrausch.at/touren/detail/2-gipfel-rundtour-gritzer-hoerndl-merschenhoehe-speikboden/', 'https://www.

In [44]:
import pandas as pd

df = pd.read_csv('cleaned_file.csv')
df.to_excel('cleaned_file.xlsx', index=False)

In [17]:
import pandas as pd
import ast

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('../data/tour_info.csv')

# Define a function to parse the "Description" column
def parse_description(description):
    try:
        # Convert the string representation of the dictionary to an actual dictionary
        description_dict = ast.literal_eval(description)
    except (ValueError, SyntaxError):
        description_dict = {}
    return description_dict

# Define a function to parse the "Housing Recommendation" column
def parse_housing_recommendation(housing_recommendation):
    try:
        # Convert the string representation of the list of dictionaries to an actual list of dictionaries
        housing_list = ast.literal_eval(housing_recommendation)
        if housing_list and isinstance(housing_list, list):
            housing_dict = housing_list[0]  # Assuming we only care about the first dictionary in the list
        else:
            housing_dict = {}
    except (ValueError, SyntaxError):
        housing_dict = {}
    
    # Prefix housing keys with "Housing_" to make it clear
    prefixed_housing_dict = {f"Housing_{key}": value for key, value in housing_dict.items()}
    return prefixed_housing_dict

# Apply the function to the "Description" column and create a DataFrame from the resulting dictionaries
description_df = df['Tour Info'].apply(parse_description).apply(pd.Series)

# Apply the function to the "Housing Recommendation" column and create a DataFrame from the resulting dictionaries
housing_df = df['Housing Recommendations'].apply(parse_housing_recommendation).apply(pd.Series)

# Concatenate the original DataFrame with the new DataFrames containing the separated columns
df_cleaned = pd.concat([df, description_df, housing_df], axis=1)

# Drop the original "Description" and "Housing Recommendation" columns if no longer needed
df_cleaned.drop(columns=['Tour Info', 'Housing Recommendations'], inplace=True)

# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('../data/cleaned_full.csv', index=False)
df_cleaned.to_excel('../data/cleaned_full.xlsx', index=False)

# Display the cleaned DataFrame
print(df_cleaned.head())


                                           Tour Name  \
0           "Weg der Sinne" am Hochpillberg in Tirol   
1                        1 Tag: Lüsens-Westfalenhaus   
2  1 Tag: Mutterbergalm-Hinterer Daunkopf-Amberge...   
3  1.Tag: Gries-Winnebachseehütte-Winnebacher Wei...   
4  1.Tag: Lüsens-Hoher Seeblaskogel-Winnebachseeh...   

                                         Description  \
0  Gemütliche Wanderung in den Tuxer Alpen mit sc...   
1  Skitour (Hüttenzustieg) von Lüsens durch das L...   
2  Skitour von der Mutterbergalm auf den Hinteren...   
3  Skitour von Gries im Sulztal zur Winnebachseeh...   
4  Skitour von Lüsens durch das Längental auf den...   

                                           Character Tourenkategorie  \
0  Charakter: Auf dieser gemütliche Wanderung gib...       Wanderung   
1  Charakter: Der erste Tag beginnt gemütliche mi...         Skitour   
2  Charakter: Wunderschöner Aufstieg durch die Gl...         Skitour   
3  Charakter: Der erste Tag dieser 2 T

In [31]:

# Convert EPUB files to text
epub_files = ['../trainsdata/marco_polo.csv', '../trainsdata/messner.csv', "../trainsdata/alpenverein.csv"]  # Replace with your EPUB file paths
output_texts = []

#delete "" "" in every csv file of the list
for file in epub_files:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
        text = text.replace('"', '')
        output_texts.append(text)

# Save the extracted text to a text file
output_file = 'extracted_text.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n\n'.join(output_texts))

#delete empty lines
with open('extracted_text.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    lines = [line.strip() for line in lines if line.strip()]

#save as csv
import csv

with open('extracted_text.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Text'])
    for line in lines:
        writer.writerow([line])