In [3]:
# %load Ads_extraction.py
import pandas as pd
from datetime import datetime
import re
import Utils as utils
import os

# Path to the CSV file containing the ad URLs to scrape.
csv_file_path = 'kijiji_rental_ads_url.csv'

# Load the URL list. The file is expected to have a 'URL' column; a missing
# column raises KeyError here, before any scraping starts.
df = pd.read_csv(csv_file_path)

# Blank cells are read by pandas as NaN (a float), which is not a usable URL,
# so they are dropped instead of being handed to the scraper.
url_list = df['URL'].dropna().tolist()

# Accumulators filled by the scraping loop below.
data = []          # one dict of extracted fields per successfully parsed ad
counter = 0        # number of ads appended to `data`
start_time = datetime.now()  # used for the timing report at the end of the run

# Accepted layouts for the <time datetime="..."> attribute: the layout the
# script originally required, plus the same layout without fractional seconds
# so that such values no longer raise ValueError.
DATE_POSTED_FORMATS = ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ")

# (section label, first tag, second tag) triples passed to
# utils.get_section_text, in output-column order.
# NOTE(review): presumably the tags are the heading element and the element
# holding the value -- confirm against Utils.get_section_text.
SECTION_FIELDS = (
    ('Wi-Fi and More', 'h4', 'ul'),
    ('Parking Included', 'dt', 'dd'),
    ('Agreement Type', 'dt', 'dd'),
    ('Move-In Date', 'dt', 'dd'),
    ('Pet Friendly', 'dt', 'dd'),
    ('Size (sqft)', 'dt', 'dd'),
    ('Furnished', 'dt', 'dd'),
    ('Air Conditioning', 'dt', 'dd'),
    ('Personal Outdoor Space', 'h4', 'ul'),
    ('Smoking Permitted', 'dt', 'dd'),
)

# Loop through the list of URLs and extract one record per ad page.
for url in url_list:

    soup = utils.get_soup(url)
    if not soup:
        # Nothing was returned for this URL; skip it.
        continue

    # A page without the title element is skipped entirely.
    title_el = soup.find('h1', class_='title-4206718449')
    if title_el is None:
        continue

    # Extracted fields for this URL.
    ad_data = {}
    ad_data['Title'] = title_el.text.strip()

    # Price: try the primary wrapper, then the backup element. When neither
    # exists the text stays '' so the regex below never receives None.
    price_elem = soup.find(class_='priceWrapper-3915768379')
    if price_elem is None:
        price_elem = soup.find(class_='currentPrice-231544276')
    price_text = price_elem.text.strip() if price_elem else ''

    # Keep only the digits of the price text.
    # NOTE(review): every digit run is concatenated, so a value containing
    # cents (e.g. "1,850.00") would become "185000" -- confirm the page format.
    ad_data['Price($)'] = ''.join(re.findall(r'\d+', price_text))

    # Address
    address_elem = soup.find(itemprop='address')
    ad_data['Address'] = address_elem.text.strip() if address_elem else None

    # Posting date: read the datetime attribute of the <time> tag inside the
    # "datePosted" div. Stays '' when the div, tag or attribute is absent.
    posted_datetime = ''
    date_posted_div = soup.find('div', class_='datePosted-1776470403')
    if date_posted_div:
        time_tag = date_posted_div.find('time')
        if time_tag and 'datetime' in time_tag.attrs:
            datetime_str = time_tag['datetime']
            # Fall back to the raw attribute value if no layout matches, so
            # an unexpected format neither aborts the run nor loses the data.
            posted_datetime = datetime_str
            for date_format in DATE_POSTED_FORMATS:
                try:
                    posted_datetime = datetime.strptime(datetime_str, date_format)
                except ValueError:
                    continue
                break
    ad_data['Date Posted'] = posted_datetime

    # Bedrooms, bathrooms and building type. All three keys are created up
    # front so every record has the same keys in the same order, whichever
    # attributes the page actually lists.
    ad_data['Bedrooms'] = None
    ad_data['Bathrooms'] = None
    ad_data['Building Type'] = None
    title_attributes = soup.find(class_='titleAttributes-183069789')
    if title_attributes:
        for attr in title_attributes.find_all('li', class_='noLabelAttribute-262950866'):
            if attr.span is None:
                # List item without a <span>: nothing to read from it.
                continue
            attr_text = attr.span.text
            if 'Bedrooms' in attr_text:
                ad_data['Bedrooms'] = attr_text.split(': ')[-1]
            elif 'Bathrooms' in attr_text:
                ad_data['Bathrooms'] = attr_text.split(': ')[-1]
            else:
                # Any other attribute is treated as the building type.
                ad_data['Building Type'] = attr_text

    # "Utilities Included": each list item becomes "<name>_Yes", "<name>_No"
    # or just "<name>", depending on the aria-label of its <svg> icon; the
    # items are joined with commas.
    utilities_text = ''
    utilities_heading = soup.find('h4', string='Utilities Included')
    if utilities_heading:
        utilities_list = utilities_heading.find_next('ul')
        if utilities_list:
            utility_parts = []
            for item in utilities_list.find_all('li'):
                utility_name = item.get_text(strip=True)
                icon = item.find('svg')
                # An item without an icon is emitted without a suffix.
                aria_label = icon.get('aria-label', '') if icon is not None else ''
                if 'Yes' in aria_label:
                    utility_parts.append(f"{utility_name}_Yes")
                elif 'No' in aria_label:
                    utility_parts.append(f"{utility_name}_No")
                else:
                    utility_parts.append(utility_name)
            utilities_text = ','.join(utility_parts)
    ad_data['Utilities'] = utilities_text

    # Single-section fields, looked up by their label.
    for section_label, first_tag, second_tag in SECTION_FIELDS:
        ad_data[section_label] = utils.get_section_text(soup, section_label, first_tag, second_tag)

    ad_data['Appliances'] = utils.get_multiple_section_text(soup, 'Appliances')
    ad_data['Amenities'] = utils.get_multiple_section_text(soup, 'Amenities')

    # Description: first <p> inside the description container.
    description_elm = soup.select_one('.descriptionContainer-2067035870 p')
    ad_data['Description'] = description_elm.text.strip() if description_elm else None

    # Visit counter
    visit_counter_elem = soup.select_one('.visitCounter-204515568 span')
    ad_data['Visit Counter'] = visit_counter_elem.text.strip() if visit_counter_elem else None

    ad_data['url'] = url

    # Store the record and report progress.
    data.append(ad_data)
    counter += 1
    print("Number ", counter, ' and URL ', url)

# Create a DataFrame from the collected data
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file named after the number of scraped ads
df.to_csv(f'kijiji_rental_ads_{counter}.csv', index=False)

# Delete the URL list only after at least one ad was scraped and saved.
# Removing it after an empty run would discard the input and leave nothing
# to retry with.
if data:
    os.remove(csv_file_path)
    print(f"Delete csv {csv_file_path}")
else:
    print(f"No ads were scraped; keeping {csv_file_path}")

# Report the elapsed time for the whole run
utils.print_time_info(start_time, datetime.now())


Number  1  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/renovated-bachelor-king-and-jameson-id-2541/1683888315
Number  2  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/renovated-two-bedroom-queen-and-victoria-park-id-3192/1686025060
Number  3  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/fully-renovated-2-bedroom-in-etobicoke/1639869260
Number  4  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/831-kennedy-road-831-kennedy-apartment-for-rent/1686827384
Number  5  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/renovated-two-bedroom-don-mills-and-lawrence-id-3244/1687384233
Number  6  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/private-basement-apartment-located-at-kennedy-lawrence-east/1685515558
Number  7  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/105-west-lodge-1-bedroom-apartment-in-the-heart-of-parkdale-ap/1682857424
Number  8  and U

Number  60  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/book-a-reserved-coworking-spot-or-hot-desk-in-queen-bay/1523240220
Number  61  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/book-a-reserved-coworking-spot-or-hot-desk-in-yonge-and-sheppard/1523134540
Number  62  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/private-office-space-for-3-persons-in-don-mills/1649069518
Number  63  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/access-professional-coworking-space-in-yonge-and-richmond-centre/1520807555
Number  64  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/fully-serviced-private-office-space-for-you-and-your-team/1523134625
Number  65  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/access-professional-coworking-space-in-eaton-centre/1520767771
Number  66  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-tor

Number  118  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2bed-den-2-bath-new-condo-at-bloor-islington-toronto/1688033864
Number  119  and URL  https://www.kijiji.ca/v-room-rental-roommate/city-of-toronto/furnished-room-for-rent/1688033647
Number  120  and URL  https://www.kijiji.ca/v-room-rental-roommate/city-of-toronto/large-bedroom-available-by-fairview-mall-female-only/1688033330
Number  121  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/one-bedroom-suite-lakeshore-for-rent-17-bathurst-street/1688033316
Number  122  and URL  https://www.kijiji.ca/v-room-rental-roommate/city-of-toronto/private-room-in-crockamhil-scarborough-for-females-only/1688033290
Number  123  and URL  https://www.kijiji.ca/v-room-rental-roommate/city-of-toronto/2-bedroom-basement-for-rent/1688033092
Number  124  and URL  https://www.kijiji.ca/v-commercial-office-space/city-of-toronto/therapeutic-office-rooms-for-rent-by-hour-downtown-toronto/1680531178
Number  125  and

Number  177  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/beautiful-and-modern-1-bedroom-unit-steps-from-glencairn-subway/1688028637
Number  178  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/waterfront-living-1-bed-condo-38-dan-leckie-way/1688029067
Number  179  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2bd-at-45-la-rose-avenue-richview-developments/1688028938
Number  180  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/stunning-modern-new-build-2-bedroom-2-bathroom-with-a-back-patio/1688029049
Number  181  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/recently-renovated-executive-rental-south-annex-5800-utilit/1688029069
Number  182  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2-bed-3-bath-condo-townhouse-in-south-etobicoke/1688028624
Number  183  and URL  https://www.kijiji.ca/v-apartments-condos/city-of-toronto/2-bed-1-bath-home-in-the-annex/1688028590
