In [1]:
pip install requests beautifulsoup4



In [2]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd
from google.colab import files

In [None]:
# Listing endpoint of the site; individual listing pages are addressed by
# appending a ``?page=N`` query string to this URL.
BASE_URL = "https://www.vidhikarya.com/free-legal-advice"

# HTTP headers sent with every request made by the scraper.
HEADERS = {"User-Agent": "Mozilla/5.0"}

def get_question_data(link, timeout=30):
    """Fetch a question page and extract its question, answers and location.

    Args:
        link: URL of the question page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A 3-tuple ``(question, answers, location)``:

        * ``question`` - text of the first ``<p>`` inside the ``content``
          element of the ``qstn`` block, or ``"N/A"`` when it is missing.
        * ``answers`` - list with the non-empty text of the ``content``
          element of every ``ans`` block (possibly empty).
        * ``location`` - text of the first ``<div>`` inside the question's
          ``content`` element, or ``"N/A"`` when it is missing.

        The same 3-tuple shape, ``("N/A", [], "N/A")``, is returned when the
        page cannot be fetched or parsed, so callers can always unpack three
        values.
    """
    try:
        res = requests.get(link, headers=HEADERS, timeout=timeout)
        if res.status_code != 200:
            print(f"Failed to fetch question page: {link}")
            return "N/A", [], "N/A"

        soup = BeautifulSoup(res.text, 'html.parser')

        # Resolve the question container step by step; any level may be
        # absent on an unusual page, and each missing piece degrades to "N/A"
        # instead of raising.
        question = "N/A"
        location = "N/A"
        question_block = soup.find(class_="qstn")
        question_content = (
            question_block.find(class_="content") if question_block is not None else None
        )
        if question_content is not None:
            if question_content.p is not None:
                question = question_content.p.get_text(strip=True)
            if question_content.div is not None:
                location = question_content.div.get_text(strip=True)

        # Collect every answer that has a non-empty content element.
        all_answers = []
        for ans in soup.find_all(class_="ans"):
            content = ans.find(class_="content")
            if content is None:
                continue
            answer_text = content.get_text(strip=True)
            if answer_text:
                all_answers.append(answer_text)

        return question, all_answers, location
    except Exception as e:
        # Best-effort scraping: report the problem and hand back placeholders
        # with the same shape as a successful result.
        print(f"Error fetching question data from {link}: {e}")
        return "N/A", [], "N/A"

def scrape_vidhikarya(pages, start_page=2101):
    """Scrape listing pages and append their questions to the CSV file.

    For every listing page from ``start_page`` to ``pages`` (both inclusive)
    the function downloads the page, visits each question card's link via
    ``get_question_data`` and passes the rows collected for that page to
    ``save_to_csv``.

    Args:
        pages: Number of the last listing page to scrape (inclusive). This is
            a page number, not a count of pages.
        start_page: Number of the first listing page to scrape. Defaults to
            2101, the value that was previously hard-coded.

    Returns:
        None. Results are written by ``save_to_csv``; failures are printed
        and skipped.
    """
    # Local import so the block does not depend on the notebook's import cell.
    from urllib.parse import urljoin

    for page in range(start_page, pages + 1):
        data = []
        print(f"Scraping page {page}")
        url = f"{BASE_URL}?page={page}"
        try:
            # The timeout keeps one stalled connection from hanging the run.
            res = requests.get(url, headers=HEADERS, timeout=30)
            if res.status_code != 200:
                print(f"Failed to load page {page} — status {res.status_code}")
                continue

            soup = BeautifulSoup(res.text, 'html.parser')
            cards = soup.find_all(class_="adv_box")
        except Exception as e:
            print(f"Error scraping page {page}: {e}")
            continue

        for card in cards:
            # Each card is handled on its own so that one malformed card does
            # not discard the rows already collected for this page.
            try:
                category = card.find(class_="tag_box").get_text(strip=True)
                subcategory = card.find(class_="content").a.get_text(strip=True)
                # urljoin leaves absolute hrefs untouched and resolves
                # relative ones against the listing page URL.
                full_link = urljoin(url, card.a["href"])
                result = get_question_data(full_link)
            except Exception as e:
                print(f"Error scraping a question on page {page}: {e}")
                continue

            # Tolerate a 2-tuple (question, answers) result by defaulting
            # the location.
            question_text, answers = result[0], result[1]
            location = result[2] if len(result) > 2 else "N/A"
            time.sleep(0.5)  # Be polite

            data.append({
                "Category": category,
                "Sub Category": subcategory,
                "location": location,
                "question": question_text,
                "answers": " ||| ".join(answers),
                "Number of Ans": len(answers),
                "Link": full_link
            })

        time.sleep(1)
        try:
            save_to_csv(data)
        except Exception as e:
            print(f"Error scraping page {page}: {e}")

    return

def save_to_csv(data, filename="vidhikarya_data.csv"):
    """Append scraped rows to a CSV file, writing the header once.

    Args:
        data: List of dicts, one per row. The keys of the first dict define
            the column order. An empty list is reported and nothing is
            written.
        filename: Path of the CSV file to append to.

    Returns:
        None. I/O errors are printed rather than raised, so a failed save
        does not stop a long scraping run.
    """
    # Local import so the block does not depend on the notebook's import cell.
    import os

    if not data:
        print("❌ No data to save.")
        return

    try:
        # The header is only needed when the file is new or empty; writing it
        # on every append would scatter header rows through the data.
        needs_header = (not os.path.exists(filename)) or os.path.getsize(filename) == 0
        with open(filename, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=data[0].keys())
            if needs_header:
                writer.writeheader()
            writer.writerows(data)
        print("✅ Data saved")
    except Exception as e:
        print(f"Error saving data to CSV: {e}")

# Run the scraper when this code is executed directly.
# `pages` is the number of the LAST listing page to fetch, not a count:
# scrape_vidhikarya iterates up to and including this page number.
if __name__ == "__main__":
    scrape_vidhikarya(pages=2500)  # Adjust the last page number here


Scraping page 2101
✅ Data saved
Scraping page 2102
✅ Data saved
Scraping page 2103
✅ Data saved
Scraping page 2104
✅ Data saved
Scraping page 2105
✅ Data saved
Scraping page 2106
✅ Data saved
Scraping page 2107
✅ Data saved
Scraping page 2108
✅ Data saved
Scraping page 2109
✅ Data saved
Scraping page 2110
✅ Data saved
Scraping page 2111
✅ Data saved
Scraping page 2112
✅ Data saved
Scraping page 2113
✅ Data saved
Scraping page 2114
✅ Data saved
Scraping page 2115
✅ Data saved
Scraping page 2116
✅ Data saved
Scraping page 2117
✅ Data saved
Scraping page 2118
✅ Data saved
Scraping page 2119
✅ Data saved
Scraping page 2120
✅ Data saved
Scraping page 2121
✅ Data saved
Scraping page 2122
✅ Data saved
Scraping page 2123
✅ Data saved
Scraping page 2124
✅ Data saved
Scraping page 2125
✅ Data saved
Scraping page 2126
✅ Data saved
Scraping page 2127
✅ Data saved
Scraping page 2128
✅ Data saved
Scraping page 2129


In [None]:
# Trigger a download of the scraped CSV (the default output file of
# save_to_csv). `files` is imported from google.colab, so this cell
# presumably only works inside a Colab runtime.
files.download("vidhikarya_data.csv")

In [19]:
def show_all_duplicate_rows(filename="vidhikarya_data.csv"):
    """Return every row of a CSV file that is an exact duplicate of another.

    Args:
        filename: Path of the CSV file to inspect.

    Returns:
        A DataFrame holding all rows that occur more than once (every
        occurrence is included, not just the repeats). An empty DataFrame is
        returned when the file cannot be read; the error is printed.
    """
    try:
        frame = pd.read_csv(filename)

        # keep=False flags every member of a duplicate group, including the
        # first occurrence.
        is_duplicated = frame.duplicated(keep=False)
        duplicates_all = frame[is_duplicated]

        if duplicates_all.empty:
            print("✅ No duplicate rows found.")
        else:
            print(f"❗ Found {len(duplicates_all)} rows involved in duplicates:\n")

        return duplicates_all

    except Exception as e:
        print(f"Error reading file: {e}")
        return pd.DataFrame()

# Check the default CSV; as the last expression of the cell, the returned
# DataFrame is rendered as the cell output.
show_all_duplicate_rows()

❗ Found 14 rows involved in duplicates:



Unnamed: 0,Category,Sub Category,location,question,answers,Number of Ans,Link
1009,Commercial,FRANCHISE FRAUD,"( West Delhi, Delhi )","Hi Sir, \n\nI am writing this to acquire knowl...","Dear Client,Since you have taken up a franchis...",1,https://www.vidhikarya.com/FreeLegalAdvice/637...
1010,Commercial,FRANCHISE FRAUD,"( West Delhi, Delhi )","Hi Sir, \n\nI am writing this to acquire knowl...","Dear Client,Since you have taken up a franchis...",1,https://www.vidhikarya.com/FreeLegalAdvice/637...
11994,Administrative Law,MACP,"( Delhi, Delhi )",Sir if department not notified RRs for post. ...,"Dear Client,If the concerned deptt did not not...",1,https://www.vidhikarya.com/FreeLegalAdvice/434...
11995,Contracts and Agreements,Bond Settlement from previous employer.,"( West Delhi, Delhi )",I have been a part of company during 11 Jan 20...,"Dear Client,The validity of Employment bonds c...",2,https://www.vidhikarya.com/FreeLegalAdvice/434...
11996,Cyber Crime,Data entry fraud,"( Solapur, Maharashtra )",They said company will sent legal court notic...,"Dear client,Just ignore it. If they made any d...",1,https://www.vidhikarya.com/FreeLegalAdvice/434...
11997,Domestic Violence,Domestic violence - stree dhan - all jewellery,"( Mumbai, Maharashtra )",My stree dhan(all jewellery) is with in laws b...,"Dear Client,A Hindu married woman is the absol...",2,https://www.vidhikarya.com/FreeLegalAdvice/434...
11998,Consumer Protection,Filed a consumer case against real estate builder,"( Mumbai, Maharashtra )",I have filed a consumer case against real esta...,"Dear Client,Nomination is not allowed or permi...",2,https://www.vidhikarya.com/FreeLegalAdvice/434...
11999,Contracts and Agreements,Contract employee,"( Chandigarh, Chandigarh )",Can a contractual employee get paid leave for ...,"Dear Client,In case you are a contractual work...",1,https://www.vidhikarya.com/FreeLegalAdvice/434...
12000,Administrative Law,MACP,"( Delhi, Delhi )",Sir if department not notified RRs for post. ...,"Dear Client,If the concerned deptt did not not...",1,https://www.vidhikarya.com/FreeLegalAdvice/434...
12001,Contracts and Agreements,Bond Settlement from previous employer.,"( West Delhi, Delhi )",I have been a part of company during 11 Jan 20...,"Dear Client,The validity of Employment bonds c...",2,https://www.vidhikarya.com/FreeLegalAdvice/434...


In [20]:
# Reload the scraped CSV and report how many data rows it contains.
df = pd.read_csv("vidhikarya_data.csv")
# Uncomment to preview the first rows:
#df.head(10)
print(len(df))

21000
