In [2]:
import os
import requests
import pandas as pd
import mimetypes
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from tqdm import tqdm
import requests
import mimetypes
from googleapiclient.http import MediaIoBaseUpload
import io

Upload the image data to Google Drive (My Drive) using the Google Drive API: the cells below authenticate, connect to my Drive, and upload the images into a project folder there.

In [3]:
# Authenticate and connect to Google Drive
# OAuth scope requested when the user authorizes the app.
# NOTE(review): drive.file is understood to grant access only to files this app
# created or opened - confirm against Google's scope documentation, because the
# folder and file lookups below would then not see items created by other means.
SCOPES = ['https://www.googleapis.com/auth/drive.file']

def authenticate():
    """Run the installed-app OAuth flow and return a Drive v3 service client.

    Reads the OAuth client configuration from 'credentials.json' in the
    working directory and requests the scopes listed in SCOPES.

    Returns:
        The object produced by build('drive', 'v3', ...) for the new credentials.
    """
    oauth_flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
    # port=0: no fixed port is requested for the local redirect server.
    user_credentials = oauth_flow.run_local_server(port=0)
    drive_service = build('drive', 'v3', credentials=user_credentials)
    return drive_service

# Check/create folder
def get_or_create_folder(service, folder_name, parent_id=None):
    """Return the ID of a Drive folder, creating the folder if none is found.

    Args:
        service: Drive v3 service client (as returned by authenticate()).
        folder_name: Display name of the folder to look up or create.
        parent_id: Optional ID of the parent folder. When given, the lookup is
            restricted to that parent and a new folder is created inside it.

    Returns:
        The ID of the first matching, non-trashed folder, or of the newly
        created folder when there is no match.
    """
    # Values in a Drive query are wrapped in single quotes, so backslashes and
    # single quotes inside the name must be backslash-escaped to keep the
    # query well-formed (e.g. "Bob's pics").
    escaped_name = folder_name.replace('\\', '\\\\').replace("'", "\\'")
    query = (
        f"name='{escaped_name}' and mimeType='application/vnd.google-apps.folder'"
        # Ignore folders sitting in the bin; uploading into one would hide the files.
        " and trashed = false"
    )
    if parent_id:
        query += f" and '{parent_id}' in parents"

    results = service.files().list(q=query, fields="files(id, name)").execute()
    folders = results.get('files', [])
    if folders:
        return folders[0]['id']

    # No match: create the folder, using the original (unescaped) name.
    file_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
    }
    if parent_id:
        file_metadata['parents'] = [parent_id]

    folder = service.files().create(body=file_metadata, fields='id').execute()
    return folder['id']
    
# Get list of files already uploaded in the folder
def get_uploaded_filenames(service, folder_id):
    """Collect the names of all non-trashed files directly inside a folder.

    Walks every result page of the file listing, following nextPageToken
    until the API stops returning one.

    Args:
        service: Drive v3 service client.
        folder_id: ID of the folder whose children are listed.

    Returns:
        A set with the 'name' of every listed file.
    """
    names_seen = set()
    next_token = None
    more_pages = True
    while more_pages:
        listing = service.files().list(
            q=f"'{folder_id}' in parents and trashed = false",
            spaces='drive',
            fields='nextPageToken, files(name)',
            pageToken=next_token
        ).execute()
        names_seen.update(entry['name'] for entry in listing.get('files', []))
        # A missing token means this was the last page.
        next_token = listing.get('nextPageToken', None)
        more_pages = next_token is not None
    return names_seen


# Upload image to Drive
def upload_image(service, image_url, filename, folder_id, timeout=30):
    """Download one image over HTTP and store it in a Drive folder.

    Best-effort: every failure is reported on stdout and never raised, so a
    single bad row cannot abort a long batch run.

    Args:
        service: Drive v3 service client.
        image_url: URL the image bytes are fetched from.
        filename: Name given to the file in Drive; its extension is also used
            to guess the MIME type.
        folder_id: ID of the Drive folder that receives the file.
        timeout: Seconds to wait for the HTTP download before giving up, so a
            stalled server cannot hang the whole batch.

    Returns:
        True if the file was created in Drive, False if the download or the
        upload failed.
    """
    try:
        response = requests.get(image_url, timeout=timeout)
        if response.status_code != 200:
            # Report skipped downloads instead of dropping them silently.
            print(f"Failed to download {filename}: HTTP {response.status_code}")
            return False

        # guess_type() yields None for unknown extensions; fall back to a
        # generic binary type so the upload always carries a content type.
        mime_type = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
        file_metadata = {
            'name': filename,
            'parents': [folder_id]
        }

        media_body = MediaIoBaseUpload(io.BytesIO(response.content), mimetype=mime_type, resumable=True)
        service.files().create(body=file_metadata, media_body=media_body, fields='id').execute()
        return True
    except Exception as e:
        print(f"Failed to upload {filename}: {e}")
        return False

# Main function to run the process
def download_and_upload_images(
    dataframe,
    image_url_column='image_url',
    outer_folder_name='Data Mining 476',
    project_folder_name='project_images',
):
    """Upload every image referenced by a DataFrame into a Drive folder.

    Authenticates, ensures the folder `outer_folder_name/project_folder_name`
    exists, then uploads each row's image as 'image_<index>.jpg'. Files whose
    name is already present in the target folder are skipped, so an
    interrupted run can be restarted without re-uploading.

    Args:
        dataframe: Table with one row per image.
        image_url_column: Name of the column holding the image URLs.
        outer_folder_name: Name of the top-level Drive folder.
        project_folder_name: Name of the folder, inside the outer folder,
            that receives the images.
    """
    service = authenticate()

    outer_folder_id = get_or_create_folder(service, outer_folder_name)
    project_folder_id = get_or_create_folder(service, project_folder_name, parent_id=outer_folder_id)

    uploaded_filenames = get_uploaded_filenames(service, project_folder_id)

    missing_url_count = 0
    for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        filename = f"image_{idx}.jpg"
        if filename in uploaded_filenames:
            continue  # Skip if already uploaded
        image_url = row[image_url_column]
        if pd.isna(image_url):
            # No URL for this row: avoid a pointless HTTP request that can only fail.
            missing_url_count += 1
            continue
        upload_image(service, image_url, filename, project_folder_id)

    if missing_url_count:
        print(f"Skipped {missing_url_count} row(s) with no value in '{image_url_column}'")

In [4]:
# Load the table of image URLs; the uploader reads its 'image_url' column by default.
df = pd.read_csv('balanced_data.csv')  # skip this read if the DataFrame is already in memory
# Opens the Google authorization prompt, then uploads every image that is not
# already in the Drive folder. The recorded run below took 11:25:52 for 19584 rows.
download_and_upload_images(df)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=202968894204-muo6t8rq86fgjq8596rnkt4os6n7fksa.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A65098%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.file&state=YVIOVJJNUZvut5H7ElCLjlWlY8UwZO&access_type=offline


100%|██████████| 19584/19584 [11:25:52<00:00,  2.10s/it]  
