<a href="https://colab.research.google.com/github/sanjay7178/amul-mascot-girl-flux-t2i/blob/main/amul_mascot_girl_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4 tqdm



In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from tqdm import tqdm

def scrape_amul_hits(years, max_index):
    base_url = "https://amul.com/m/amul-hits"
    results = []

    total_iterations = len(years) * (max_index + 1)

    with tqdm(total=total_iterations, desc="Scraping progress", unit="request") as pbar:
        for year in years:
            prior_year = year - 1  # Calculate the prior year
            for i in range(max_index + 1):
                params = {
                    's': year,
                    'l': i
                }
                response = requests.get(base_url, params=params)

                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')
                    items = soup.select('div.brandslist.amulhits ul li a')

                    for item in items:
                        href = item.get('href')
                        title = item.get('title')
                        img_alt = item.find('img').get('alt') if item.find('img') else None

                        results.append({
                            'year': year,
                            'prior_year': prior_year,
                            'index': i,
                            'href': "https://amul.com/"+href,
                            'title': title,
                            'alt': img_alt
                        })
                pbar.update(1)

    return results

def save_to_csv(data, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['year', 'prior_year', 'index', 'href', 'title', 'alt'])
        writer.writeheader()
        for row in data:
            writer.writerow(row)

# Example usage
# years = [2024, 2023, 2022]  # Add more years as needed
years = list(range(2024, 1989, -1)) + [1989,1987,1986,1983,1982,1981,1979,1976]
max_index = 30  # Set this to the maximum number of 'l' values you want to check

data = scrape_amul_hits(years, max_index)

# Save the results to a CSV file
save_to_csv(data, 'amul_hits.csv')

print(f"Data saved to 'amul_hits.csv'")


Scraping progress: 100%|██████████| 1333/1333 [33:57<00:00,  1.53s/request]

Data saved to 'amul_hits.csv'





In [5]:
import os
import csv
import requests
from urllib.parse import urljoin
from tqdm import tqdm
import zipfile
from concurrent.futures import ThreadPoolExecutor, as_completed

def download_image(url, folder_path, image_name):
    try:
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return True
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return False

def create_folder_structure_and_download(csv_file, dest_folder, max_workers=5):
    base_url = "https://amul.com"
    download_tasks = []

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    with open(csv_file, mode='r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for row in tqdm(reader, desc="Submitting download tasks"):
                year = row['year']
                href = row['href'][18:]
                title = row['title'].replace('/', '-')  # Replace slash to avoid issues in filenames
                alt = row['alt'].replace('/', '-') if row['alt'] else ''  # Replace slash to avoid issues in filenames

                # Create folder for the year if it doesn't exist
                year_folder_path = os.path.join(dest_folder, year)
                if not os.path.exists(year_folder_path):
                    os.makedirs(year_folder_path)

                # Generate image name
                image_name = f"{title} {alt}.jpg"

                # Full image URL
                image_url = urljoin(base_url, href)

                # Submit the download task
                task = executor.submit(download_image, image_url, year_folder_path, image_name)
                download_tasks.append(task)

            # Wait for all download tasks to complete
            for task in tqdm(as_completed(download_tasks), total=len(download_tasks), desc="Downloading images"):
                task.result()

def zip_folder(folder_name):
    zip_filename = f"{folder_name}.zip"
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_name):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, folder_name))
    print(f"Folder zipped to {zip_filename}")

# Example usage
csv_file = 'amul_hits.csv'  # Replace with your CSV file name
destination_folder = 'amul-girl-images'  # Replace with your desired destination folder
create_folder_structure_and_download(csv_file, '/content/dataset')
zip_folder(destination_folder)


Submitting download tasks: 3511it [00:00, 10188.68it/s]
Downloading images:  69%|██████▉   | 2433/3511 [13:59<04:59,  3.60it/s]

Failed to download https://amul.com/files/hits/amul-hits-210.jpg: [Errno 36] File name too long: "/content/dataset/2008/Michael Phelps, 23 year-old American swimmer won unprecedented eight gold medals in  Beijing Olympics 2008. He, thus bettered compatriot Mark Spitz's record of seven gold medals at Munich Olympics 1972 in Germany thirty six years ago - August '08. Butter fry! Bread stroke!.jpg"


Downloading images:  69%|██████▉   | 2440/3511 [14:01<04:18,  4.15it/s]

Failed to download https://amul.com/files/hits/amul-hits-204.jpg: [Errno 36] File name too long: "/content/dataset/2008/Hollywood movie 'The Dark Knight' actor Heath Ledger as The Joker with mesmerizing performance, a fascinating maniacal character and super-villain who likes to carve smiles on people's face after posing the query : Why so serious? - July '08. AMUL IS SERIOUSLY GOOD!.jpg"


Downloading images:  70%|██████▉   | 2443/3511 [14:02<06:21,  2.80it/s]

Failed to download https://amul.com/files/hits/amul-hits-199.jpg: [Errno 36] File name too long: "/content/dataset/2008/One of India's greatest soldier - inspiring Army Chief, militarily very astute and first Field Marshal  Sam Manekshaw affectionately called 'Sam Bahadur' passed away on Friday June 27, 2008 at the age of 94 years (1914 - 2008) - July '08. Jai Jawaan, Jai Sam!.jpg"


Downloading images:  70%|██████▉   | 2447/3511 [14:03<05:14,  3.38it/s]

Failed to download https://amul.com/files/hits/amul-hits-196.jpg: [Errno 36] File name too long: "/content/dataset/2008/Big budget Indian movie 'Dasavataram' stands for 10 avatars with popular actor Kamal Hasan performing ten different roles - orthodox Brahmin, a karate fighter, a dwarf, a scientist, a spy, an out cast person, a dark man, a robber, a philosopher and a DUS BUTTERUM.jpg"


Downloading images:  70%|██████▉   | 2453/3511 [14:05<04:51,  3.63it/s]

Failed to download https://amul.com/files/hits/amul-hits-191.jpg: [Errno 36] File name too long: '/content/dataset/2008/Indian Premier League (IPL) Twenty20 cricketers in race for top positions - Delhi Daredevils batsman Gautam Gambhir top run getter, Rajasthan Royals captian - spin bowler Shane Warne leading  wicket-taker & Chennai Super Kings captian - wicket-keeper MOST VALUABLE LAYER!.jpg'


Downloading images:  70%|███████   | 2463/3511 [14:08<04:43,  3.70it/s]

Failed to download https://amul.com/files/hits/amul-hits-294.jpg: [Errno 36] File name too long: '/content/dataset/2008/Indian Premier League (IPL) franchisees ranging from Indian business tycoons to Bollywood stars bid for top Indian & International Cricketers during an amazing price auction at Hilton Towers in Mumbai for Twenty20 tournment to be held from April 18 t Twenty Plenty ?.jpg'


Downloading images:  70%|███████   | 2473/3511 [14:11<05:46,  3.00it/s]

Failed to download https://amul.com/files/hits/amul-hits-283.jpg: [Errno 36] File name too long: "/content/dataset/2007/Item girl Rakhi Sawant’s outburst following her & partner Abhishek Awasthi defeat in reality dance show 'Nach Baliye 3' alleging Star TV channel in bogus Short Messaging Service (SMS) votes in favour of her opponent and blocking her SMSes - Decembe Ungli pe nachana!.jpg"


Downloading images:  71%|███████   | 2484/3511 [14:14<04:09,  4.12it/s]

Failed to download https://amul.com/files/hits/amul-hits-273.jpg: [Errno 36] File name too long: '/content/dataset/2007/Times of India’s `Lead India Contest’ across eight Indian Cities – Ahmedabad, Bangalore, Delhi, Hyderabad, Kolkata, Lucknow, Mumbai & Pune showcasing brightest young Indians (25 – 45 years) vision on various issues of literacy, poverty, equal FEED INDIA.jpg'


Downloading images:  71%|███████   | 2493/3511 [14:18<05:07,  3.31it/s]

Failed to download https://amul.com/files/hits/amul-hits-263.jpg: [Errno 36] File name too long: '/content/dataset/2007/Bollywood superstar Sanjay Dutt popularly known as Sanju Baba given six-year jail term by TADA Court (Terrorism and Disruptive Activities (Prevention)) being guilty under Arms Act for illegally possessing prohibited arms & ammunition in the 1993 Bomb Bole toh, apun saath hai, Baba!.jpg'


Downloading images:  71%|███████   | 2500/3511 [14:19<03:14,  5.18it/s]

Failed to download https://amul.com/files/hits/amul-hits-258.jpg: [Errno 36] File name too long: "/content/dataset/2007/India's Taj Mahal a masterpiece of architecture of 17th Century in the worldwide race for inclusion in the new Seven Wonders of the World. It was built by Mughal Emperor Shah Jahan as a symbol of enduring love for his wife Mumtaz on the outskirt of A Taaja Maal!.jpg"


Downloading images:  72%|███████▏  | 2529/3511 [14:28<03:44,  4.38it/s]

Failed to download https://amul.com/files/hits/amul-hits-338.jpg: [Errno 36] File name too long: "/content/dataset/2006/Australian master leg-spinner Shane Warne became the first cricketer to achieve unimaginable milestone of claiming 700 Test wickets during the fourth Ashes Test against England on his home ground at Melbourne Cricket Ground, Australia - December '06 700.jpg"


Downloading images:  72%|███████▏  | 2531/3511 [14:29<05:05,  3.21it/s]

Failed to download https://amul.com/files/hits/amul-hits-334.jpg: [Errno 36] File name too long: "/content/dataset/2006/On Hollywood actress Angelina Jolie's three British bodyguards arrested for threatening, insult to religion and verbal abuse of children's parents at Anjuman-e-Islam High School in Mumbai during the shooting of the movie 'A Mighty Heart'. - November Jolie ke aage kya hai?.jpg"


Downloading images:  73%|███████▎  | 2548/3511 [14:33<03:58,  4.04it/s]

Failed to download https://amul.com/files/hits/amul-hits-320.jpg: [Errno 36] File name too long: '/content/dataset/2006/Police action against Cable Operators in Mumbai & Suburbs and large parts of Maharashtra on Bombay High Court notification banning beaming of films-programmes with adult content resulting in Cable Operators resorting to blackout of cable services - A CERTIFIED (A).jpg'


Downloading images:  75%|███████▌  | 2635/3511 [14:59<03:00,  4.86it/s]

Failed to download https://amul.com/files/hits/amul-hits-401.jpg: [Errno 36] File name too long: "/content/dataset/2005/Bollywood actor Mr. Amrish Puri one of the best screen villian popularly know as Mogambo passed away on January 12, 2005. He mouthed one of the most memorable line 'Mogambo kush hua' which has left a mark on the film audience - January'05 Mogambo ... dukh hua.jpg"


Downloading images:  82%|████████▏ | 2893/3511 [16:08<02:01,  5.08it/s]

Failed to download https://amul.com/files/hits/amul-hits-654.jpg: [Errno 36] File name too long: "/content/dataset/2000/Amul Hits of 2000 - 2001   On the former Indian Prime-minister being found guilty by the court - November 2000   image On the C.B.I. report finding certain Indian cricketer's guilty of match-fixing - November 2000   image On the new TV quiz show bei No Mukti from this Morcha!.jpg"


Downloading images: 100%|██████████| 3511/3511 [18:43<00:00,  3.12it/s]

Folder zipped to amul-girl-images.zip





In [7]:
!du -sh  /content/dataset

368M	/content/dataset


In [8]:
!zip -r amul_dataset.zip /content/dataset


  adding: content/dataset/ (stored 0%)
  adding: content/dataset/1976/ (stored 0%)
  adding: content/dataset/1976/Maintenance of Internal Security Act introduced during emergency. Maintain Internal Security.jpg (deflated 0%)
  adding: content/dataset/1976/On compulsory sterilisation introduced during Indira Gandhi regime. We've always practised Compulsory Sterilisation.jpg (deflated 1%)
  adding: content/dataset/1976/Your good fortune! Your good fortune!.jpg (deflated 0%)
  adding: content/dataset/1976/When helmets were made compulsory in Bombay Use your head!.jpg (deflated 1%)
  adding: content/dataset/1976/The Big Payoff The Big Payoff.jpg (deflated 1%)
  adding: content/dataset/1976/When the Bruce-Lee film `Enter the Dragon' was released. Enter the Dragon Champ!.jpg (deflated 2%)
  adding: content/dataset/1976/Race through this course Race through this course.jpg (deflated 1%)
  adding: content/dataset/2022/ (stored 0%)
  adding: content/dataset/2022/Wishing everyone a Happy New Yea

In [9]:
!ls

2023  2024  amul_dataset.zip  amul-girl-images.zip  amul_hits.csv  dataset  sample_data


In [10]:
!cp amul_dataset.zip /content/drive/MyDrive/Amul

In [11]:
!cp -r /content/dataset /content/drive/MyDrive/Amul

In [13]:
!ls /content/drive/MyDrive/Amul/dataset

1976  1982  1987  1991	1994  1997  2000  2003	2006  2009  2012  2015	2018  2021  2024
1979  1983  1989  1992	1995  1998  2001  2004	2007  2010  2013  2016	2019  2022
1981  1986  1990  1993	1996  1999  2002  2005	2008  2011  2014  2017	2020  2023


In [16]:
!find /content/dataset -type f -name "*.jpg" -o -name "*.jpeg" -o -name "*.png" -o -name "*.gif" | wc -l

3496


### push to hugging face



In [17]:
!pip install huggingface_hub
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your term

In [19]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [23]:
!pip install pyarrow==9.0.0  # Example version


Collecting pyarrow==9.0.0
  Downloading pyarrow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-9.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 17.0.0
    Uninstalling pyarrow-17.0.0:
      Successfully uninstalled pyarrow-17.0.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 9.0.0 which is incompatible.
datasets 2.21.0 requires pyarrow>=15.0.0, but you have pyarrow 9.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed pyarrow-9.0.0


In [11]:
import os
from datasets import Dataset, Features, Value, Image
from huggingface_hub import HfApi, HfFolder

def prepare_metadata_from_directory(base_dir):
    images_metadata = []
    error_count = 0

    for year_dir in os.listdir(base_dir):
        year_path = os.path.join(base_dir, year_dir)
        if os.path.isdir(year_path):
            for image_name in os.listdir(year_path):
                if image_name.lower().endswith(('.jpg', '.jpeg', '.png')):
                    try:
                        title = image_name.split(' ')[0]  # Example logic
                        alt = ' '.join(image_name.split(' ')[1:]).replace('.jpg', '').replace('.jpeg', '').replace('.png', '')  # Example logic
                        image_path = os.path.join(year_path, image_name)

                        # Debug print to verify the metadata
                        print({
                            'year': year_dir,
                            'image_name': image_name,
                            'image_path': image_path,
                            'title': title,
                            'alt': alt
                        })

                        images_metadata.append({
                            'year': year_dir,
                            'image_name': image_name,
                            'image_path': image_path,
                            'title': title,
                            'alt': alt
                        })
                    except Exception as e:
                        print(f"Error processing image {image_name}: {e}")
                        error_count += 1

    return images_metadata, error_count

def load_image(image_path):
    try:
        with open(image_path, 'rb') as img_file:
            return img_file.read()
    except Exception as e:
        print(f"Failed to load image {image_path}: {e}")
        return None

def load_images(data):
    error_count = 0
    for item in data:
        try:
            # Debug print to verify if image_path is available
            print(f"Loading image for path: {item.get('image_path')}")
            if item.get('image_path'):
                item['image'] = load_image(item['image_path'])
            else:
                item['image'] = None  # Handle the missing image path differently
        except Exception as e:
            print(f"Error loading image {item.get('image_path')}: {e}")
            error_count += 1
    return data, error_count

def push_images_to_huggingface(images_metadata, repo_id):
    features = Features({
        'year': Value('string'),
        'image_name': Value('string'),
        'image': Image(),
        'title': Value('string'),
        'alt': Value('string')
    })

    error_count = 0

    # Ensure that all metadata records have 'image_path'
    for item in images_metadata:
        try:
            if 'image_path' not in item:
                print(f"Missing 'image_path' in metadata: {item}")
                error_count += 1
        except Exception as e:
            print(f"Error in metadata: {e}")
            error_count += 1

    try:
        dataset, data_errors = load_images(images_metadata)
        error_count += data_errors

        dataset = Dataset.from_list(dataset, features=features)

        # Log in to Hugging Face
        api = HfApi()
        api.login(token=HfFolder.get_token())

        # Push the dataset to Hugging Face Hub
        dataset.push_to_hub(repo_id)

        print(f"Dataset pushed to https://huggingface.co/datasets/{repo_id}")
    except Exception as e:
        print(f"Error pushing dataset to Hugging Face: {e}")
        error_count += 1

    return error_count


# Example usage
base_dir = '/content/dataset/'  # Replace with the directory where images are stored
repo_id = "sanjay7178/amul-mascot-girl"  # Replace with your Hugging Face repo name


# Prepare metadata
images_metadata, prep_errors = prepare_metadata_from_directory(base_dir)
print(f"Metadata preparation errors: {prep_errors}")

# Push metadata and images to Hugging Face
total_errors = push_images_to_huggingface(images_metadata, repo_id)
print(f"Total errors: {prep_errors + total_errors}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'year': '1983', 'image_name': "INSAT-1's initial problems regarding opening of its flaps. These flaps open easily..jpg", 'image_path': "/content/dataset/1983/INSAT-1's initial problems regarding opening of its flaps. These flaps open easily..jpg", 'title': "INSAT-1's", 'alt': 'initial problems regarding opening of its flaps. These flaps open easily.'}
{'year': '1983', 'image_name': "Sunil Gavaskar's century against West Indies in Delhi. Don't of a Sunny era..jpg", 'image_path': "/content/dataset/1983/Sunil Gavaskar's century against West Indies in Delhi. Don't of a Sunny era..jpg", 'title': 'Sunil', 'alt': "Gavaskar's century against West Indies in Delhi. Don't of a Sunny era."}
{'year': '2019', 'image_name': 'Priyanka Gandhi joins politics!- (Jan’ 19) Family stree!.jpg', 'image_path': '/content/dataset/2019/Priyanka Gandhi joins politics!- (Jan’ 19) Family stree!.jpg', 'title': 'Priyanka', 'alt': 'Gandhi joins politics!

KeyError: 'image_path'

In [5]:
!apt install tree

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  tree
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 47.9 kB of archives.
After this operation, 116 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tree amd64 2.0.2-1 [47.9 kB]
Fetched 47.9 kB in 1s (95.3 kB/s)
Selecting previously unselected package tree.
(Reading database ... 123595 files and directories currently installed.)
Preparing to unpack .../tree_2.0.2-1_amd64.deb ...
Unpacking tree (2.0.2-1) ...
Setting up tree (2.0.2-1) ...
Processing triggers for man-db (2.10.2-1) ...


In [15]:
from pathlib import Path
from huggingface_hub import HfApi, Repository
import os

# Set your Hugging Face username and dataset name
USERNAME = "sanjay7178"
DATASET_NAME = "amul-mascot-girl"

# Set the local directory containing your dataset
LOCAL_DATASET_DIR = Path("./content/dataset")

# Initialize the Hugging Face API
api = HfApi()

# Check if the dataset already exists
try:
    api.list_datasets(organization=USERNAME)
    if DATASET_NAME in api.list_datasets(organization=USERNAME):
        print(f"Dataset '{DATASET_NAME}' already exists.")
    else:
        print(f"Creating new dataset '{DATASET_NAME}'...")
        api.create_repo(
            repo_id=f"{USERNAME}/{DATASET_NAME}",
            repo_type="dataset",
            exist_ok=True,
        )
except:
    print(f"Creating new dataset '{DATASET_NAME}'...")
    api.create_repo(
        repo_id=f"{USERNAME}/{DATASET_NAME}",
        repo_type="dataset",
        exist_ok=True,
    )

# Initialize the repository
repo = Repository(local_dir=str(LOCAL_DATASET_DIR), name=DATASET_NAME, repo_type="dataset", organization=USERNAME)
repo.git_pull()

# Add the data to the repository
print("Adding data to the repository...")
for year_dir in LOCAL_DATASET_DIR.glob("*"):
    if year_dir.is_dir():
        year = year_dir.name
        for image_file in year_dir.glob("*"):
            image_name = image_file.stem
            repo.add_file(str(image_file), path=f"{year}/{image_name}/{image_file.name}")

# Commit and push the changes
print("Committing and pushing the changes...")
repo.git_add()
repo.git_commit(commit_message="Upload dataset")
repo.git_push()

print("Dataset upload complete!")

Creating new dataset 'amul-mascot-girl'...


TypeError: Repository.__init__() got an unexpected keyword argument 'name'

In [19]:
!pip install -U "huggingface_hub[cli]"

Collecting huggingface_hub[cli]
  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting InquirerPy==0.3.4 (from huggingface_hub[cli])
  Downloading InquirerPy-0.3.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pfzy<0.4.0,>=0.3.1 (from InquirerPy==0.3.4->huggingface_hub[cli])
  Downloading pfzy-0.3.4-py3-none-any.whl.metadata (4.9 kB)
Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m945.6 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.24.6-py3-none-any.whl (417 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m417.5/417.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, InquirerPy, huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.23.5
    Uninstalling huggingface-hub-0.23.5:
      Successfully 

In [4]:
!huggingface-cli dataset push --dataset-name amul-mascot-girl --organization sanjay7178 --local-dir ./dataset

usage: huggingface-cli <command> [<args>]
huggingface-cli: error: argument {download,upload,repo-files,env,login,whoami,logout,repo,lfs-enable-largefiles,lfs-multipart-upload,scan-cache,delete-cache,tag}: invalid choice: 'dataset' (choose from 'download', 'upload', 'repo-files', 'env', 'login', 'whoami', 'logout', 'repo', 'lfs-enable-largefiles', 'lfs-multipart-upload', 'scan-cache', 'delete-cache', 'tag')


In [5]:
!huggingface-cli upload sanjay7178/amul-mascot-girl ./dataset/ /train --repo-type=dataset

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

Super Virat’s single-handed success! – March’16 KOHLI KE PEECHE KAUN HAI?.jpg:   0% 0.00/105k [00:00<?, ?B/s][A




Upload 3496 LFS files:  65% 2259/3496 [01:51<01:07, 18.39it/s][A[A[A[A[A

Superstar Rajinikanth mania sweeps the country!- July'16 He makes onions cry, tomatoes blush & butter fry!.jpg:   0% 0.00/91.5k [00:00<?, ?B/s][A[A


Stolen or gifted, that is the question!-April'16 Heera Pheri?.jpg: 100% 114k/114k [00:00<00:00, 1.18MB/s]
Superstar sings National Anthem at T20!-March’16 Big A from Big B!.jpg: 100% 91.5k/91.5k [00:00<00:00, 956kB/s]
Superstar Rajinikanth mania sweeps the country!- July'16 He makes onions cry, tomatoes blush & butter fry!.jpg: 100% 91.5k/91.5k [00:00<00:00, 611kB/s]





Upload 3496 LFS files:  65% 2264/3496 [01:51<00:56, 21.96it/s][A[A[A[A[A

TV anchor gets special security!- Oct'16 WHAT? WHEN? HOW? Y?.jpg: 100% 82.7k/82.7k [00:00<00:00, 855kB/s]
Super Virat’s single-hande

In [17]:
from pathlib import Path
from huggingface_hub import HfApi, create_repo, Repository
import os

# Set your Hugging Face username and dataset name
USERNAME = "sanjay7178"
DATASET_NAME = "amul-mascot-girl"

# Set the local directory containing your dataset
LOCAL_DATASET_DIR = Path("./content/dataset")

# Initialize the Hugging Face API
api = HfApi()

# Check if the dataset already exists
try:
    api.list_datasets(organization=USERNAME)
    if DATASET_NAME in api.list_datasets(organization=USERNAME):
        print(f"Dataset '{DATASET_NAME}' already exists.")
    else:
        print(f"Creating new dataset '{DATASET_NAME}'...")
        create_repo(
            repo_id=f"{USERNAME}/{DATASET_NAME}",
            repo_type="dataset",
            exist_ok=True,
        )
except:
    print(f"Creating new dataset '{DATASET_NAME}'...")
    create_repo(
        repo_id=f"{USERNAME}/{DATASET_NAME}",
        repo_type="dataset",
        exist_ok=True,
    )

# Initialize the repository
repo = Repository(local_dir=str(LOCAL_DATASET_DIR), clone_from=f"{USERNAME}/{DATASET_NAME}", repo_type="dataset")
repo.git_pull()

# Add the data to the repository
print("Adding data to the repository...")
for year_dir in LOCAL_DATASET_DIR.glob("*"):
    if year_dir.is_dir():
        year = year_dir.name
        for image_file in year_dir.glob("*"):
            image_name = image_file.stem
            repo..add_file(str(image_file), path=f"{year}/{image_name}/{image_file.name}")

# Commit and push the changes
print("Committing and pushing the changes...")
repo.git_add()
repo.git_commit(commit_message="Upload dataset")
repo.git_push()

print("Dataset upload complete!")

Creating new dataset 'amul-mascot-girl'...


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/datasets/sanjay7178/amul-mascot-girl into local empty directory.


Adding data to the repository...


AttributeError: 'Repository' object has no attribute 'add_file'

In [6]:
!tree /content/dataset

[01;34m/content/dataset[0m
├── [01;34m1976[0m
│   ├── [01;35mMaintenance of Internal Security Act introduced during emergency. Maintain Internal Security.jpg[0m
│   ├── [01;35mOn compulsory sterilisation introduced during Indira Gandhi regime. We've always practised Compulsory Sterilisation.jpg[0m
│   ├── [01;35mRace through this course Race through this course.jpg[0m
│   ├── [01;35mThe Big Payoff The Big Payoff.jpg[0m
│   ├── [01;35mWhen helmets were made compulsory in Bombay Use your head!.jpg[0m
│   ├── [01;35mWhen the Bruce-Lee film `Enter the Dragon' was released. Enter the Dragon Champ!.jpg[0m
│   └── [01;35mYour good fortune! Your good fortune!.jpg[0m
├── [01;34m1979[0m
│   ├── [01;35mA comment on Charlie Chaplin's film `Gold Rush'. The Gold Rush is Still on.jpg[0m
│   ├── [01;35mA comment on the film `Pati, Patni aur woh'. PATI PATNI AUR WAH!.jpg[0m
│   ├── [01;35mAmul celebrate the Diwali Festival. We're crackers about Amul!.jpg[0m
│   ├── [01;35mAmu