In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import io
from PIL import Image
import hashlib
import random
import re

In [None]:
# Install required packages
!pip install selenium
!pip install webdriver-manager
!pip install beautifulsoup4
!pip install nltk
!pip install requests
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install pillow
!pip install pydub
!pip install ffmpeg-python
!pip install papaparse

In [2]:
# The 20 image-search categories; each entry is used verbatim as a search
# query and (with spaces replaced) as a folder name.
categories = [
    "Indian Street Food",
    "Historic Monuments",
    "Traditional Dance",
    "Modern Architecture",
    "Wildlife Conservation",
    "Rural Landscapes",
    "Urban Markets",
    "Festival Celebrations",
    "Classical Music Instruments",
    "Sustainable Farming",
    "Textile Patterns",
    "Ancient Artifacts",
    "Mountain Ranges",
    "Coastal Cities",
    "Religious Ceremonies",
    "Adventure Sports",
    "Handicrafts",
    "Spice Markets",
    "Temple Architecture",
    "River Ecosystems",
]

In [7]:
## API keys
# Credentials must not be committed to the notebook: the Pexels key is read
# from the PEXELS_API_KEY environment variable. The key that was previously
# hardcoded in this cell has been exposed and should be rotated.
PEXELS_API_KEY = os.environ.get("PEXELS_API_KEY", "")
if not PEXELS_API_KEY:
    print("Warning: PEXELS_API_KEY is not set; Pexels API requests will likely fail with an authorization error.")
# UNSPLASH_API_KEY = ""

In [8]:
def download_images_pexels(query, folder_path, num_images=50):
    """Download up to ``num_images`` photos matching ``query`` via the Pexels search API.

    Every saved image is written to ``folder_path`` under the MD5 hex digest of
    its URL plus a ``.jpg`` suffix, and one metadata record is collected for it.
    Failures are printed instead of raised, so the result may contain fewer
    records than requested (or none).

    Args:
        query: Search term sent to Pexels; also stored as each record's "category".
        folder_path: Directory the images are written to (created if missing).
        num_images: Maximum number of images to download.

    Returns:
        list[dict]: One dict per saved image with the keys "category", "url",
        "filename", "resolution", "width", "height", "photographer", "source".
    """
    # Create folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    metadata = []

    # Nothing to fetch: skip the API entirely.
    if num_images <= 0:
        print(f"Downloaded 0 images for category '{query}'")
        return metadata

    search_url = "https://api.pexels.com/v1/search"
    headers = {"Authorization": PEXELS_API_KEY}

    # Page size requested from the API (15 results per request).
    per_page = 15
    # Seconds to wait on any single HTTP request so a stalled connection
    # cannot hang the notebook indefinitely.
    request_timeout = 30

    num_pages = (num_images + per_page - 1) // per_page
    count = 0

    try:
        for page in range(1, num_pages + 1):
            # Passing the query through `params` lets requests URL-encode it,
            # so spaces, '&' or '#' in a category name cannot corrupt the URL.
            response = requests.get(
                search_url,
                headers=headers,
                params={"query": query, "per_page": per_page, "page": page},
                timeout=request_timeout,
            )

            if response.status_code != 200:
                print(f"API request failed with status code {response.status_code}: {response.text}")
                break

            data = response.json()

            for photo in data.get("photos", []):
                if count >= num_images:
                    break

                # Field extraction lives inside the per-image try so that one
                # malformed record skips that photo instead of the whole category.
                try:
                    img_url = photo["src"]["original"]
                    img_width = photo["width"]
                    img_height = photo["height"]
                    photographer = photo["photographer"]

                    img_response = requests.get(img_url, timeout=request_timeout)
                    if img_response.status_code != 200:
                        print(f"Skipping {img_url}: status code {img_response.status_code}")
                        continue

                    # NOTE(review): the ".jpg" suffix is applied regardless of
                    # the actual image format served -- confirm this is intended.
                    filename = hashlib.md5(img_url.encode()).hexdigest() + ".jpg"
                    file_path = os.path.join(folder_path, filename)

                    with open(file_path, "wb") as f:
                        f.write(img_response.content)

                    metadata.append({
                        "category": query,
                        "url": img_url,
                        "filename": filename,
                        "resolution": f"{img_width}x{img_height}",
                        "width": img_width,
                        "height": img_height,
                        "photographer": photographer,
                        "source": "Pexels API"
                    })

                    count += 1
                    print(f"Downloaded {count}/{num_images} images for {query}")

                    # Be gentle to the API
                    time.sleep(0.5)

                except Exception as e:
                    print(f"Error downloading image: {e}")

            # Stop once enough images are saved or the API reports no further page.
            if count >= num_images or data.get("next_page") is None:
                break

            # Be nice to the API
            time.sleep(1)

        print(f"Downloaded {count} images for category '{query}'")

    except Exception as e:
        # Best effort: report the failure and return whatever was collected.
        print(f"Error processing category {query}: {e}")

    return metadata

In [None]:
# Image-search categories (20 in total), laid out four per row.
categories = [
    "Indian Street Food", "Historic Monuments", "Traditional Dance", "Modern Architecture",
    "Wildlife Conservation", "Rural Landscapes", "Urban Markets", "Festival Celebrations",
    "Classical Music Instruments", "Sustainable Farming", "Textile Patterns", "Ancient Artifacts",
    "Mountain Ranges", "Coastal Cities", "Religious Ceremonies", "Adventure Sports",
    "Handicrafts", "Spice Markets", "Temple Architecture", "River Ecosystems",
]

In [14]:
# Categories already processed; the collection loop skips anything listed here.
completed_categories = list()

In [None]:
base_folder = "CulturalVisualCorpus"
os.makedirs(base_folder, exist_ok=True)

all_metadata = []

for category in categories:
    category_slug = category.replace(" ", "_")
    folder_path = os.path.join(base_folder, category_slug)
    category_csv = os.path.join(folder_path, f"{category_slug}_metadata.csv")

    if category in completed_categories:
        # Resumed run: reload the metadata saved earlier so that
        # complete_metadata.csv still covers categories skipped this time
        # instead of being overwritten with only the newly downloaded ones.
        if os.path.exists(category_csv):
            all_metadata.extend(pd.read_csv(category_csv).to_dict("records"))
        continue

    print(f"\nProcessing category: {category}")

    # Download images and get metadata
    category_metadata = download_images_pexels(category, folder_path)
    all_metadata.extend(category_metadata)

    # Save category-specific metadata
    category_df = pd.DataFrame(category_metadata)
    if not category_df.empty:
        category_df.to_csv(category_csv, index=False)
        # Only mark the category as done when something was actually saved,
        # so a category that failed outright is retried on the next run.
        completed_categories.append(category)

# Save all metadata to a CSV file
metadata_df = pd.DataFrame(all_metadata)
metadata_df.to_csv(os.path.join(base_folder, "complete_metadata.csv"), index=False)

print(f"Dataset collection complete. Total images: {len(all_metadata)}")


Processing category: Rural Landscapes
Downloaded 1/50 images for Rural Landscapes
Downloaded 2/50 images for Rural Landscapes
Downloaded 3/50 images for Rural Landscapes
Downloaded 4/50 images for Rural Landscapes
Downloaded 5/50 images for Rural Landscapes
Downloaded 6/50 images for Rural Landscapes
Downloaded 7/50 images for Rural Landscapes
Downloaded 8/50 images for Rural Landscapes
Downloaded 9/50 images for Rural Landscapes
Downloaded 10/50 images for Rural Landscapes
Downloaded 11/50 images for Rural Landscapes
Downloaded 12/50 images for Rural Landscapes
Downloaded 13/50 images for Rural Landscapes
Downloaded 14/50 images for Rural Landscapes
Downloaded 15/50 images for Rural Landscapes
Downloaded 16/50 images for Rural Landscapes
Downloaded 17/50 images for Rural Landscapes
Downloaded 18/50 images for Rural Landscapes
Downloaded 19/50 images for Rural Landscapes
Downloaded 20/50 images for Rural Landscapes
Downloaded 21/50 images for Rural Landscapes
Downloaded 22/50 images f

## Text Extraction

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
import time
import random

In [2]:
# Resources needed by clean_text: tokenizer models for word_tokenize and the
# English stopword list.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' models; without it word_tokenize raises
# LookupError. Fetching both keeps the notebook working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rishita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rishita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Seed pages to crawl for the text corpus: 20 topics, three URLs per topic.
# Only these exact pages are fetched (no link following).
categories_websites = {
    "Indian Politics": ["https://www.ndtv.com/india",
                        "https://indianexpress.com/section/india/politics/",
                        "https://timesofindia.indiatimes.com/politics"],
    "Technology Startups": ["https://yourstory.com/",
                            "https://inc42.com/",
                            "https://techcrunch.com/tag/india/"],
    "Classical Dance": ["https://www.narthaki.com/",
                        "https://www.danceindia.com/",
                        "https://darbar.org/dance"],
    "Indian Cuisine": ["https://www.indianhealthyrecipes.com/",
                       "https://www.vegrecipesofindia.com/",
                       "https://food.ndtv.com/"],
    "Environmental Conservation": ["https://india.mongabay.com/",
                                   "https://www.downtoearth.org.in/",
                                   "https://www.conservationindia.org/"],
    "Education Policy": ["https://www.education.gov.in/",
                         "https://theeducationpost.in/",
                         "https://www.indiatoday.in/education-today"],
    "Space Research": ["https://www.isro.gov.in/",
                       "https://www.space.com/",
                       "https://www.skyatnightmagazine.com/"],
    "Film Industry": ["https://www.filmfare.com/",
                      "https://www.bollywoodhungama.com/",
                      "https://www.koimoi.com/"],
    "Healthcare Innovation": ["https://healthcareindiamagazine.com/",
                              "https://health.economictimes.indiatimes.com/",
                              "https://www.expresshealthcare.in/"],
    "Artificial Intelligence": ["https://analyticsindiamag.com/",
                                "https://www.artificialintelligence-news.com/",
                                "https://www.analyticsvidhya.com/"],
    "Traditional Crafts": ["https://www.craftsvilla.com/blog/",
                           "https://www.craftmark.org/",
                           "https://www.gaatha.com/"],
    "Agricultural Practices": ["https://krishijagran.com/",
                               "https://www.agrifarming.in/",
                               "https://www.agricultureinindia.net/"],
    "Tourism": ["https://www.incredibleindia.org/",
                "https://www.holidify.com/",
                "https://www.thrillophilia.com/"],
    "Renewable Energy": ["https://www.renewableenergyworld.com/",
                         "https://mercomindia.com/",
                         "https://energy.economictimes.indiatimes.com/tag/renewable+energy"],
    "Literature": ["https://www.thehindu.com/books/",
                   "https://indianculturalforum.in/",
                   "https://litnet.co.za/"],
    "Women Empowerment": ["https://www.shethepeople.tv/",
                          "https://feminisminindia.com/",
                          "https://www.womensweb.in/"],
    "Sports": ["https://sportstar.thehindu.com/",
               "https://www.espn.in/",
               "https://www.sportskeeda.com/"],
    "Economic Policy": ["https://www.livemint.com/",
                        "https://economictimes.indiatimes.com/",
                        "https://www.business-standard.com/"],
    "Classical Music": ["https://www.darbar.org/",
                        "https://www.raaga.com/",
                        "https://www.carnatica.net/"],
    "Historical Research": ["https://www.sahapedia.org/",
                            "https://www.livehistoryindia.com/",
                            "https://www.historytoday.com/"],
}

In [17]:
def clean_text(text):
    """Normalize raw page text into a space-separated string of content words.

    Steps, in order: strip HTML tags, strip URLs, drop every character that is
    neither a word character nor whitespace, drop digit runs, lowercase,
    tokenize with NLTK, and discard English stopwords.

    Args:
        text: Raw text to clean.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Each pattern is deleted outright (replaced by the empty string), in this order.
    removal_patterns = (
        r'<.*?>',     # HTML tags
        r'http\S+',   # URLs
        r'[^\w\s]',   # special characters
        r'\d+',       # numbers
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    words = word_tokenize(text.lower())

    english_stopwords = set(stopwords.words('english'))
    return ' '.join(word for word in words if word not in english_stopwords)

In [18]:
# Function to crawl a website
def crawl_website(url, headers):
    """Fetch one page and return its cleaned heading and paragraph text.

    Args:
        url: Page to request.
        headers: HTTP headers sent with the request.

    Returns:
        str: Text of all h1/h2/h3 elements followed by all <p> elements, passed
        through clean_text. An empty string is returned when the response
        status is not 200 or any exception occurs (the problem is printed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve {url}: Status code {response.status_code}")
            return ""

        soup = BeautifulSoup(response.text, 'html.parser')

        def joined_text(elements):
            # Space-join the stripped text of every element, skipping blank ones.
            stripped = (element.text.strip() for element in elements)
            return ' '.join(piece for piece in stripped if piece)

        heading_text = joined_text(soup.find_all(['h1', 'h2', 'h3']))
        paragraph_text = joined_text(soup.find_all('p'))

        return clean_text(heading_text + ' ' + paragraph_text)
    except Exception as e:
        print(f"Error crawling {url}: {e}")
        return ""



In [None]:
# Main execution: crawl every seed page and write one text file per category.
base_folder = "IndianContextualTextCorpus"
os.makedirs(base_folder, exist_ok=True)

# Pool of browser User-Agent strings; one is drawn at random for each request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.59',
]

for category, websites in categories_websites.items():
    print(f"Processing category: {category}")
    category_text = []

    for website in websites:
        print(f"  Crawling: {website}")

        # Fresh header set per request: random User-Agent first, then the fixed fields.
        headers = {'User-Agent': random.choice(user_agents)}
        headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.google.com/',
        })

        text = crawl_website(website, headers)
        if text:
            category_text.append(text)

        # Pause 3-7 seconds between requests to avoid hammering the sites.
        time.sleep(random.uniform(3, 7))

    # One space-joined document per category, written as UTF-8.
    combined_text = ' '.join(category_text)
    file_path = os.path.join(base_folder, f"{category.replace(' ', '_')}.txt")
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(combined_text)

    print(f"  Saved {len(combined_text)} characters for {category}")

print("Text dataset collection complete.")

## Audio Dataset

In [None]:
import os
import requests
import pandas as pd
import datetime
import time
import subprocess
import json
import uuid

In [None]:
# Radio streams to sample, as (station name, stream URL) pairs expanded into
# the {"name": ..., "url": ...} dicts that record_audio reads.
radio_stations = [
    {"name": station_name, "url": stream_url}
    for station_name, stream_url in (
        ("AIR FM Gold", "http://airfmgold-lh.akamaihd.net/i/fmgold_1@507591/master.m3u8"),
        ("Radio Mirchi", "http://peridot.streamguys.com:7150/Mirchi"),
        ("Radio City", "http://prclive1.listenon.in:8888/"),
        ("Radio One", "http://51.15.208.163:8081/radio/mt20live_aac/icecast.audio"),
        ("AIR Vividh Bharati", "http://vividhbharati-lh.akamaihd.net/i/vividhbharati_1@507811/master.m3u8"),
        ("AIR FM Rainbow", "http://fmrainbow-lh.akamaihd.net/i/fmrainbow_1@507812/master.m3u8"),
        ("Radio Indigo", "http://51.15.208.163:8081/radio/radioindigoapp/icecast.audio"),
        ("Big FM", "http://sc-bb.1.fm:8017/"),
        ("Radio Girmit", "http://ample-zeno-20.radiojar.com/q9c92qnc8neuv"),
        ("Fever FM", "http://104.237.4.164:8093/stream"),
        ("Club FM", "http://radio.cloudconnect.in:8000/clubfmtvm"),
        ("Red FM", "http://104.237.4.164:8287/stream"),
        ("Ishq FM", "http://104.237.4.164:8336/stream"),
        ("Radio Umang", "http://stream.radioumang.com:8000/radioumang"),
        ("BollyHits Radio", "http://64.71.79.181:8201/stream"),
        ("Hungama Bollywood Hits", "http://103.16.47.70:7222/"),
        ("NonStop Radio", "http://s5.voscast.com:8216/"),
        ("Radio Sai", "http://stream.radiosai.net:8000/"),
        ("Radio Mango", "http://bcovlive-a.akamaihd.net/19b535b7499a4719a5c19e043063f5d9/ap-southeast-1/6034685947001/playlist.m3u8"),
        ("Gyan Vani", "http://14.139.40.200:8080/stream/4/"),
        ("Aakash Vani", "http://akashvani-lh.akamaihd.net/i/akashvani_1@507878/master.m3u8"),
        ("Radio Delhi", "http://stream.zenolive.com/u8rrx04qwfeuv"),
        ("Radio Udaan", "http://stream.zeno.fm/eyxg23ky4tzuv"),
        ("Radio Sharda", "http://stream.zeno.fm/8wk1s0pt4tzuv"),
        ("Diverse FM", "http://a12.streamgates.net/radios-audio/91fmMum/chunks.m3u8"),
        ("Spice FM", "http://stream.zenolive.com/4mbexzv0sy8uv"),
        ("Radio Madhuban", "http://radiomadhuban.out.airtime.pro:8000/radiomadhuban_a"),
        ("Radio Brahmaputra", "http://stream.zeno.fm/cmwt80nv8tzuv"),
        ("Hindi Heart Radio", "http://184.154.43.106:8148/stream"),
        ("Moksha Radio", "http://stream.zeno.fm/wwu28apzwtzuv"),
    )
]


In [None]:
def record_audio(station_info, duration=60, output_folder="IndianRadioAudioCorpus", timeout_margin=60):
    """Record a clip from a radio stream to an MP3 file using ffmpeg.

    Args:
        station_info: Dict with a "name" (display name) and a "url" (stream
            address passed to ffmpeg as its input).
        duration: Number of seconds ffmpeg is asked to capture (``-t``).
        output_folder: Directory the MP3 is written to (created if missing).
        timeout_margin: Extra seconds, on top of ``duration``, that ffmpeg is
            allowed to run before it is killed. Prevents a stalled stream from
            blocking the notebook forever.

    Returns:
        dict | None: Metadata for the recording (station name, stream URL,
        filename, start timestamp, requested duration, file size, and the
        date/time at which recording finished), or None when ffmpeg fails,
        times out, or cannot be started. "duration_seconds" is the requested
        duration, not a measured one.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Unique filename: sanitized station name + start timestamp + short random id
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    unique_id = uuid.uuid4().hex[:8]
    station_name_safe = station_info["name"].replace(" ", "_").replace("/", "_")
    output_file = os.path.join(output_folder, f"{station_name_safe}_{timestamp}_{unique_id}.mp3")

    try:
        print(f"Recording {station_info['name']} for {duration} seconds...")

        # Streams whose URL ends in ".mp3" are copied as-is; everything else is
        # re-encoded to MP3 with libmp3lame.
        command = [
            "ffmpeg",
            "-i", station_info["url"],
            "-t", str(duration),
            "-c", "copy" if station_info["url"].endswith(".mp3") else "libmp3lame",
            "-q:a", "2",
            output_file
        ]

        # subprocess.run kills the child and raises TimeoutExpired if the
        # deadline passes; stdin is detached so ffmpeg never waits for input.
        completed = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=duration + timeout_margin,
        )

        if completed.returncode != 0:
            # errors="replace" keeps odd bytes in ffmpeg's log from raising.
            print(f"Error recording {station_info['name']}: {completed.stderr.decode(errors='replace')}")
            # Do not leave a truncated recording behind.
            if os.path.exists(output_file):
                os.remove(output_file)
            return None

        file_size = os.path.getsize(output_file)

        # Single clock reading so the date and time fields cannot disagree
        # (e.g. across midnight).
        finished_at = datetime.datetime.now()

        metadata = {
            "station_name": station_info["name"],
            "stream_url": station_info["url"],
            "filename": os.path.basename(output_file),
            "timestamp": timestamp,
            "duration_seconds": duration,
            "file_size_bytes": file_size,
            "recording_date": finished_at.strftime("%Y-%m-%d"),
            "recording_time": finished_at.strftime("%H:%M:%S")
        }

        return metadata

    except Exception as e:
        # Covers a missing ffmpeg binary, timeouts and filesystem errors.
        print(f"Failed to record {station_info['name']}: {e}")
        # Remove partial file if it exists
        if os.path.exists(output_file):
            os.remove(output_file)
        return None

In [None]:
# Main execution: record one clip per station and write a metadata CSV.
output_folder = "IndianRadioAudioCorpus"
# Create the folder up front so the metadata CSV can be written even when no
# station is processed.
os.makedirs(output_folder, exist_ok=True)
metadata_list = []

# Upper bound on both the stations tried and the recordings kept.
max_recordings = 30
stations_to_record = radio_stations[:max_recordings]

# Record from each station
for i, station in enumerate(stations_to_record):
    print(f"Processing station {i+1}/{len(stations_to_record)}: {station['name']}")

    duration = 60  # Fixed duration (seconds) for consistency across stations
    metadata = record_audio(station, duration, output_folder)

    if metadata:
        metadata_list.append(metadata)
        print(f"Successfully recorded {station['name']}")

    # Stop once enough successful recordings have been collected
    if len(metadata_list) >= max_recordings:
        break

    # Wait between recordings to avoid overloading; no need after the last one
    if i < len(stations_to_record) - 1:
        time.sleep(5)

# Save metadata to CSV
metadata_df = pd.DataFrame(metadata_list)
metadata_df.to_csv(os.path.join(output_folder, "radio_recordings_metadata.csv"), index=False)

print(f"Audio dataset collection complete. Total recordings: {len(metadata_list)}")