In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Define input and output paths
input_path = "C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW"
output_path = "C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/TRANSFORMED"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# List CSV files efficiently
files = [os.path.join(input_path, f) for f in os.listdir(input_path) if f.endswith(".csv")]
print(f"Found {len(files)} CSV files: {files}")

# Define transformation function
def transform_csv(df, new_column_order):
    """Transform a DataFrame by selecting and reordering specified columns."""
    # Keep only columns that exist in the DataFrame from the desired order
    existing_columns = [col for col in new_column_order if col in df.columns]
    df = df[existing_columns]
    return df
# Name,Address,Street,Municipality,Categories,Phone,Phones,Claimed,Review Count,Average Rating
def get_phone(url):
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        return soup.find('button', {'data-item-id': 'phone'}).get_text()
    except:
        return "Not found"

df = pd.read_csv('your_file.xlsx')
df['Phone'] = df['Maps_url'].apply(get_phone)
df.to_excel('output.xlsx', index=False)
# Desired columns
new_columns = ['Name', 'Address', 'Category', 'Phone', 'Avg_review', 'Reviews', 'Website', 'Maps_url']

# Process each CSV file individually
for file in files:
    try:
        # Read the CSV
        df = pd.read_csv(file)
        print(f"Processing: {os.path.basename(file)} - Shape: {df.shape}")

        # Apply transformations (select and reorder columns) 
        df_transformed = transform_csv(df, new_columns)

        # Handle missing values and reset index
        df_cleaned = df_transformed.dropna(how='all').reset_index(drop=False)
        print(f"Cleaned shape: {df_cleaned.shape}")

        # Define output file path
        output_file = os.path.join(output_path, f"{os.path.basename(file)}_transformed.csv")

        # Save to CSV
        df_cleaned.to_csv(output_file, index=False)
        print(f"Saved: {output_file}")

    except Exception as e:
        print(f"Error processing {os.path.basename(file)}: {e}")

print("All files processed successfully!")

Found 10 CSV files: ['C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\hospital.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\hotels(lodge).csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\ladies_emp.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\malls.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\pooja stores.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\salons.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\salons_1.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\schools.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\silk_sarees.csv', 'C:/Users/ADMIN/Desktop/Python programming tutorial/Data-Scrap/TELUNGANA/RAW\\tex&handlooms.csv']
Processing: hospital.csv - Shape: (1

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

# Read the CSV file
df = pd.read_csv('salons_1.csv')

# Function to extract phone number from Google Maps URL
def get_phone_number(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Try different methods to find phone number
        phone = None
        
        # Method 1: Look for phone button
        phone_button = soup.find('button', {'data-item-id': 'phone'})
        if phone_button:
            phone = phone_button.get_text(strip=True)
        
        # Method 2: Search for phone pattern in the page
        if not phone:
            phone_pattern = re.compile(r'(\+?\d[\d\- ]{7,}\d)')
            matches = phone_pattern.search(response.text)
            if matches:
                phone = matches.group(1)
        
        return phone if phone else "Not found"
    
    except Exception as e:
        return f"Error: {str(e)}"

# Apply the function to each URL
df['phone_extracted'] = df['Maps_url'].apply(get_phone_number)

# Save to new CSV
df.to_csv('salons_with_phones.csv', index=False)
print("Phone numbers extracted and saved to salons_with_phones.csv")

Phone numbers extracted and saved to salons_with_phones.csv


In [None]:
import requests
import re
import csv
from urllib.parse import urlparse, parse_qs

def extract_phone_from_maps_url(maps_url):
    """
    Extracts a phone number from a Google Maps URL.

    Args:
        maps_url: The Google Maps URL.

    Returns:
        The phone number as a string, or None if not found.
    """
    try:
        response = requests.get(maps_url, allow_redirects=True) # important allow redirects.
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

        # Look for phone number in the HTML content
        phone_match = re.search(r'tel:\+?(\d[\d\s\-\(\)]+)', response.text) # more robust regex
        if phone_match:
            phone_number = phone_match.group(1).strip()
            # Clean up phone number: remove spaces, dashes, parentheses
            phone_number = re.sub(r'[\s\-\(\)]', '', phone_number)
            return phone_number

        #alternative method to get phone from url parameters, less reliable.
        parsed_url = urlparse(response.url)
        query_params = parse_qs(parsed_url.query)
        if "q" in query_params:
            q_values = query_params["q"]
            for value in q_values:
                phone_match_q = re.search(r'tel:\+?(\d[\d\s\-\(\)]+)', value)
                if phone_match_q:
                    phone_number = phone_match_q.group(1).strip()
                    phone_number = re.sub(r'[\s\-\(\)]', '', phone_number)
                    return phone_number
        return None

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occured: {e}")
        return None


def process_csv(input_file, output_file):
    """
    Processes a CSV file, extracting phone numbers from Maps URLs and writing the results to a new CSV.

    Args:
        input_file: The path to the input CSV file.
        output_file: The path to the output CSV file.
    """
    try:
        with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
             open(output_file, 'w', newline='', encoding='utf-8') as outfile:

            reader = csv.DictReader(infile)
            fieldnames = reader.fieldnames + ['phone_number']
            writer = csv.DictWriter(outfile, fieldnames=fieldnames)

            writer.writeheader()

            for row in reader:
                maps_url = row.get('Maps_url')
                phone_number = extract_phone_from_maps_url(maps_url)
                row['phone_number'] = phone_number
                writer.writerow(row)

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage:
input_csv_file = 'salons_1.csv' # replace with your input file name.
output_csv_file = 'output_with_phones.csv' # replace with your desired output file name.

process_csv(input_csv_file, output_csv_file)

print(f"Processing complete. Phone numbers written to {output_csv_file}")

Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?
Error fetching URL: Invalid URL 'None': No scheme supplied. Perhaps you meant http://None?