In [1]:
import os
import json
import pandas as pd
import requests
import zipfile
import urllib3
from zipfile import ZipFile

# Purpose of this cell: read the Kaggle API credentials from ~/.kaggle/kaggle.json,
# download the dataset archive into ./data, extract it, and load dataset.csv
# into the DataFrame `df`.

# TLS certificate verification stays ON by default because this request carries
# Kaggle credentials. Behind a TLS-intercepting proxy, prefer pointing the
# REQUESTS_CA_BUNDLE environment variable at the proxy's CA bundle (requests
# honours it). As a last resort, set KAGGLE_INSECURE_SSL=1 to skip verification.
verify_tls = os.getenv('KAGGLE_INSECURE_SSL', '0') != '1'
if not verify_tls:
    # Suppress only the single InsecureRequestWarning from urllib3, and only
    # when verification has been explicitly switched off.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Path to the kaggle.json file. expanduser('~') resolves the home directory on
# every platform (on Windows it still goes through USERPROFILE), whereas
# os.getenv('USERPROFILE') is None outside Windows and breaks os.path.join.
kaggle_json_path = os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json')

# Load Kaggle API credentials from the kaggle.json file
with open(kaggle_json_path, 'r') as file:
    kaggle_credentials = json.load(file)
kaggle_username = kaggle_credentials['username']
kaggle_key = kaggle_credentials['key']

# Kaggle API endpoint and headers. The username/key pair from kaggle.json is
# sent as HTTP Basic auth through the `auth=` argument below rather than as a
# hand-built Authorization header.
dataset_url = 'https://www.kaggle.com/api/v1/datasets/download/dinachanthan/cleaned-retail-shop-dataset'
headers = {
    'User-Agent': 'Mozilla/5.0',
}

# Create the data directory if it doesn't exist
data_dir = './data'
os.makedirs(data_dir, exist_ok=True)

zip_path = os.path.join(data_dir, 'archive.zip')

# Download the dataset. The `with` block guarantees the streamed connection is
# released, and the (connect, read) timeout prevents the cell from hanging
# forever on a stalled connection.
with requests.get(
    dataset_url,
    headers=headers,
    auth=(kaggle_username, kaggle_key),
    stream=True,
    timeout=(10, 60),
    verify=verify_tls,
) as response:
    download_ok = response.status_code == 200
    if download_ok:
        with open(zip_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                file.write(chunk)

if download_ok:
    # Verify if the file is a valid zip file
    if zipfile.is_zipfile(zip_path):
        # Extract the dataset
        with ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)

        # Remove the zip file
        os.remove(zip_path)

        # Locate the extracted CSV; list the directory to help diagnose a
        # renamed or missing file before failing.
        data_path = os.path.join(data_dir, 'dataset.csv')
        if not os.path.exists(data_path):
            downloaded_files = os.listdir(data_dir)
            print("Downloaded files:", downloaded_files)
            raise FileNotFoundError(f"Expected file dataset.csv not found in {data_dir}. Check downloaded files.")

        # Use 'pyarrow' for faster CSV reading if available, fallback to default 'c' engine
        try:
            df = pd.read_csv(data_path, engine='pyarrow')
        except ImportError:
            df = pd.read_csv(data_path, engine='c')

        # Display the first few rows of the dataframe
        print(df.head())
        print('\nData loaded successfully.\n')
    else:
        print("The downloaded file is not a valid zip file.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

   Transaction_ID  Customer_ID  Total_Purchases  Amount  Total_Amount  \
0         4340470        10000                5  299.74       1498.69   
1         8180050        10000                7   61.42        429.92   
2         4759669        10000                6  447.18       2683.06   
3         8901617        10000                3  131.97        395.90   
4         6592641        10001                3  122.49        367.46   

  Product_Category Product_Type            Products   Feedback  Ratings  
0         Clothing       Shorts        Khaki shorts       Good        3  
1          Grocery        Water     Sparkling water  Excellent        5  
2      Electronics       Tablet  Amazon Fire Tablet    Average        2  
3       Home Decor      Bedding               Quilt       Good        4  
4          Grocery   Soft Drink          Grape soda        Bad        1  

Data loaded successfully.

