In [None]:
# Install the nbstripout package; it provides the `nbstripout` command that the
# next cell invokes.
!pip install nbstripout

In [None]:
# Run nbstripout's `--install` action from the notebook's working directory.
# NOTE(review): per the nbstripout docs this presumably registers a git filter in
# the enclosing repository so outputs are stripped on commit — confirm that it
# should run every time the notebook is executed.
!nbstripout --install

In [None]:
!pip3 install pandas
import pandas as pd

In [None]:
# Download the customers, orders and suppliers CSVs from the northwind-neo4j
# repository into the working directory under their original file names.
# The other cells visible in this notebook read the same URLs directly rather
# than these local copies.
# NOTE(review): without `--fail`, curl presumably stores an HTTP error body as
# the .csv file instead of reporting a failure — confirm and consider adding it.
!curl -o customers.csv https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/customers.csv
!curl -o orders.csv https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/orders.csv
!curl -o suppliers.csv https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/suppliers.csv


In [None]:
# Read customers.csv straight from the upstream repository with pandas'
# default parser settings and keep the result as `customers`.
customers = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/customers.csv"
)


In [None]:
import pandas as pd
import requests
import csv
from io import StringIO

# URL for customers.csv
url = "https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/customers.csv"

# 1. Try loading with pandas' default settings. A parse failure is the thing
#    being investigated, so it is caught and reported rather than aborting the
#    cell.
try:
    df = pd.read_csv(url)
    print("Loaded successfully with default settings:")
    print(df.head())
except Exception as e:
    print(f"Pandas parsing error: {e}")

# 2. Retry with on_bad_lines='warn', which reports rows that have too many
#    fields and skips them instead of raising.
#    NOTE(review): QUOTE_ALL is primarily a writer-side setting; it is not
#    expected to change how unquoted embedded commas are split on read —
#    confirm against the pandas read_csv documentation.
try:
    df = pd.read_csv(url, quoting=csv.QUOTE_ALL, on_bad_lines='warn')
    print("\nLoaded with QUOTE_ALL and warn on bad lines:")
    print(df.head())
    print(f"Shape: {df.shape}")
except Exception as e:
    print(f"Pandas parsing error with QUOTE_ALL: {e}")

# 3. Inspect the raw content (first 500 characters).
#    Fail loudly on an HTTP error or a hung connection so that an error page
#    is never mistaken for CSV content in the steps below.
response = requests.get(url, timeout=30)
response.raise_for_status()
print("\nRaw content (first 500 characters):")
print(response.text[:500])

# 4. Inspect row-by-row with the csv module. The header is included here, so
#    "Line 1" is the header row.
print("\nRow-by-row inspection (first 10 lines):")
csv_reader = csv.reader(StringIO(response.text))
for i, row in enumerate(csv_reader, 1):
    print(f"Line {i}: {row} ({len(row)} columns)")
    if i >= 10:  # Stop after 10 lines
        break

# 5. Check for hidden characters in line 8
print("\nInspecting line 8 specifically:")
lines = response.text.splitlines()
if len(lines) >= 8:
    line_8 = lines[7]  # Line 8 (0-based index 7)
    print(f"Raw line 8: {line_8}")
    # Printing the characters as a list shows each one via repr(), which makes
    # non-printable characters visible as escape sequences.
    print(f"Characters in line 8: {list(line_8)}")
    # Parse line 8 on its own with the csv module
    csv_reader = csv.reader([line_8])
    row_8 = next(csv_reader)
    print(f"Parsed line 8: {row_8} ({len(row_8)} columns)")

In [None]:
import pandas as pd
import requests
import csv
from io import StringIO

# URL for customers.csv
url = "https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/customers.csv"

# 1. Try loading with pandas' default settings. A parse failure is the thing
#    being investigated, so it is caught and reported rather than aborting the
#    cell.
try:
    df = pd.read_csv(url)
    print("Loaded successfully with default settings:")
    print(df.head())
except Exception as e:
    print(f"Pandas parsing error: {e}")

# 2. Retry with on_bad_lines='warn', which reports rows that have too many
#    fields and skips them instead of raising.
#    NOTE(review): QUOTE_ALL is primarily a writer-side setting; it is not
#    expected to make the reader tolerate unquoted embedded commas — confirm
#    against the pandas read_csv documentation.
try:
    df = pd.read_csv(url, quoting=csv.QUOTE_ALL, on_bad_lines='warn')
    print("\nLoaded with QUOTE_ALL and warn on bad lines:")
    print(df.head())
    print(f"Shape: {df.shape}")
except Exception as e:
    print(f"Pandas parsing error with QUOTE_ALL: {e}")

# 3. Row-by-row inspection with the csv module.
#    Fail loudly on an HTTP error or a hung connection so that an error page
#    is never parsed as if it were the CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
print("\nRow-by-row inspection (first 10 lines):")
csv_reader = csv.reader(StringIO(response.text))
header = next(csv_reader)
print(f"Header: {header} ({len(header)} columns)")
# i counts data rows after the header, so "Line i" printed here is physical
# line i + 1 of the file (the numbering used in step 4 below).
for i, row in enumerate(csv_reader, 1):
    print(f"Line {i}: {row} ({len(row)} columns)")
    if len(row) != len(header):
        print(f"  WARNING: Expected {len(header)} columns, found {len(row)}")
        if row:  # a blank line parses to [], which has no last field to show
            print(f"  Fax column: {row[-1]}")  # Last column is fax
    if i >= 10:  # Stop after 10 lines
        break

# 4. Inspect line 8 specifically (0-based index 7 of the physical lines,
#    header included)
print("\nInspecting line 8 specifically:")
lines = response.text.splitlines()
if len(lines) >= 8:
    line_8 = lines[7]
    print(f"Raw line 8: {line_8}")
    csv_reader = csv.reader([line_8])
    row_8 = next(csv_reader)
    print(f"Parsed line 8: {row_8} ({len(row_8)} columns)")
    if row_8:  # an empty line parses to [], which has no last field
        print(f"Fax column: {row_8[-1]}")

In [None]:
# Repeat the row-by-row column-count check for orders.csv and suppliers.csv.
# Relies on `requests`, `csv` and `StringIO` imported by earlier cells.
for url in [
    "https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/orders.csv",
    "https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/suppliers.csv"
]:
    print(f"\nInspecting {url}:")
    # Fail loudly on an HTTP error or a hung connection so that an error page
    # is never parsed as if it were the CSV.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    csv_reader = csv.reader(StringIO(response.text))
    header = next(csv_reader)
    print(f"Header: {header} ({len(header)} columns)")
    # i counts data rows after the header, so "Line i" is physical line i + 1.
    for i, row in enumerate(csv_reader, 1):
        print(f"Line {i}: {row} ({len(row)} columns)")
        if len(row) != len(header):
            print(f"  WARNING: Expected {len(header)} columns, found {len(row)}")
            # Check relevant columns (e.g., shipAddress, shipName for orders; homePage for suppliers).
            # A mismatched row may also be too short (a blank line parses to
            # []), so only index positions that actually exist.
            if 'orders' in url:
                if len(row) > 9:
                    print(f"  shipAddress: {row[9]}, shipName: {row[8]}")
            elif row:
                print(f"  homePage: {row[-1]}")
        if i >= 10:  # Stop after 10 data rows
            break

In [None]:
import requests
import csv
from io import StringIO
import pandas as pd

def _merge_overflow_fields(row, col_index, expected_cols, replace_char):
    """Fold the surplus fields of an over-long row back into one column.

    Assumes every extra field was produced by unquoted commas inside the
    column at ``col_index``; the fields after it are kept in place.
    """
    surplus = len(row) - expected_cols
    end = col_index + surplus + 1  # one past the last fragment of the column
    merged = replace_char.join(row[col_index:end])
    # A quoted fragment can still carry a comma of its own; normalise it the
    # same way the equal-length branch of clean_csv does.
    merged = merged.replace(',', replace_char)
    return row[:col_index] + [merged] + row[end:]


def clean_csv(url, problem_column, replace_char=' '):
    """
    Cleans a CSV by merging extra fields in the problem column caused by commas.

    Rows with the expected number of fields have commas inside the problem
    column replaced. Rows with too many fields have the surplus fields merged
    back into the problem column, leaving every other column untouched. Rows
    with too few fields are written through unchanged. The result is saved in
    the current working directory as ``cleaned_<last path segment of url>``
    with every field quoted.

    Args:
        url (str): CSV URL.
        problem_column (str): Column with commas (e.g., 'fax').
        replace_char (str): Replace commas with this (default: ' ').

    Returns:
        str: Path to cleaned CSV, or None (after printing a message) when the
        download fails or the column is not present in the header.
    """
    try:
        response = requests.get(url, timeout=30)
        # Without this an HTTP error page would be parsed as if it were CSV.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to download {url}: {exc}")
        return None

    csv_reader = csv.reader(StringIO(response.text))
    header = next(csv_reader, None)  # None when the body is empty

    if header is None or problem_column not in header:
        print(f"Column '{problem_column}' not found in {url}")
        return None
    col_index = header.index(problem_column)

    cleaned_rows = [header]
    expected_cols = len(header)

    for row in csv_reader:
        if len(row) == expected_cols:
            cleaned_row = row.copy()
            cleaned_row[col_index] = row[col_index].replace(',', replace_char)
            cleaned_rows.append(cleaned_row)
        elif len(row) > expected_cols:
            # Merge only the surplus fragments. Joining everything from
            # col_index to the end of the row would swallow the trailing
            # columns whenever the problem column is not the last one.
            cleaned_row = _merge_overflow_fields(row, col_index, expected_cols, replace_char)
            cleaned_rows.append(cleaned_row)
            print(f"Fixed row in {url}: {row} -> {cleaned_row}")
        else:
            # Too few fields: nothing to merge, keep the row as it is.
            cleaned_rows.append(row)

    cleaned_file = f"cleaned_{url.split('/')[-1]}"
    # newline='' lets the csv writer control line endings itself.
    with open(cleaned_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerows(cleaned_rows)

    return cleaned_file

# Test cleaning: each entry is (source URL, column whose values may contain
# stray commas).
csv_configs = [
    ("https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/customers.csv", "fax"),
    ("https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/orders.csv", "shipCountry"),
    ("https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/suppliers.csv", "homePage")
]

for url, col in csv_configs:
    print(f"\nCleaning {url} ({col})")
    cleaned_file = clean_csv(url, col, replace_char=' ')
    if cleaned_file:
        # Read the cleaned copy back to confirm it parses and that the
        # problem column no longer contains commas.
        df = pd.read_csv(cleaned_file)
        print(f"Shape: {df.shape}")
        print(f"Columns: {len(df.columns)}")
        # astype(str) keeps the check working when the column holds no string
        # values at all (e.g. it was read back as all-NaN), a case in which
        # the .str accessor would otherwise raise.
        comma_count = df[col].astype(str).str.contains(',', regex=False).sum()
        print(f"Commas in {col}: {comma_count}")

In [None]:
# Read orders.csv straight from the upstream repository with pandas' default
# parser settings and keep the result as `orders`.
# NOTE(review): this is the raw upstream file, not the cleaned_orders.csv that
# clean_csv writes — confirm that is intended.
orders = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/orders.csv"
)


In [None]:
# Read suppliers.csv straight from the upstream repository with pandas'
# default parser settings and keep the result as `suppliers`.
# NOTE(review): this is the raw upstream file, not the cleaned_suppliers.csv
# that clean_csv writes — confirm that is intended.
suppliers = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/neo4j-contrib/northwind-neo4j/master/data/suppliers.csv"
)
