# # Save Data
# Export scraped job data to various formats

## Load Dependencies


In [1]:
%run config.ipynb
import json

✓ Configuration loaded successfully
  - Default threads: 3
  - Max threads: 15
  - WebDriver timeout: 10s


## Check if Records Exist
# Make sure you've run 4_main_scraper.ipynb first

In [2]:
try:
    %store -r records
    print(f"✓ Loaded {len(records)} records from Jupyter storage")
except:
    print("⚠ No stored records found")
    print("Attempting to check current memory...")
    
    # Option 2: Check if records exists in current memory
    try:
        test = records
        print(f"✓ Found {len(records)} records in current session")
    except NameError:
        print("\n❌ ERROR: No 'records' variable found!")
        print("\nPlease do ONE of the following:")
        print("  1. Run 4_main_scraper.ipynb in this same Jupyter session")
        print("  2. Or add '%store records' at the end of 4_main_scraper.ipynb")
        print("     Then run: %store -r records")
        print("  3. Or run: %run 4_main_scraper.ipynb")
        
        records = []  # Empty list to prevent errors

✓ Loaded 16 records from Jupyter storage


## Save to CSV

In [12]:
def save_to_csv(records, filename="indeed_jobs.csv"):
    """
    Write job records to a CSV file, preceded by a header row.

    When ``records`` is empty a warning is printed and no file is created.

    Args:
        records: List of job tuples; each tuple becomes one CSV row under
            the Title/Company/Location/Salary/URL/Description header.
        filename: Path of the CSV file to write (opened in "w" mode).
    """
    if not records:
        print("⚠ No records to save")
        return

    column_names = ["Title", "Company", "Location", "Salary", "URL", "Description"]

    # newline="" lets the csv module control line endings itself
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        csv.writer(csv_file).writerows([column_names, *records])

    print(f"✓ Data saved to {filename}")
    print(f"  Rows: {len(records)}")


In [13]:
# Export the records to a CSV file whose name carries the current timestamp
csv_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
csv_filename = "indeed_jobs_" + csv_timestamp + ".csv"
save_to_csv(records, csv_filename)

✓ Data saved to indeed_jobs_20251003_201222.csv
  Rows: 16


## Save to Excel

In [16]:
def save_to_excel(records, filename="indeed_jobs.xlsx"):
    """
    Write job records to an Excel workbook with sized columns.

    The records are placed on a sheet named 'Jobs' (no index column) and
    each column is widened to fit its longest value, capped at 50.
    When ``records`` is empty a warning is printed and no file is created.

    Args:
        records: List of job tuples matching the
            Title/Company/Location/Salary/URL/Description columns.
        filename: Path of the Excel file to write.
    """
    if not records:
        print("⚠ No records to save")
        return

    column_names = ["Title", "Company", "Location", "Salary", "URL", "Description"]
    jobs_frame = pd.DataFrame(records, columns=column_names)

    with pd.ExcelWriter(filename, engine='openpyxl') as excel_writer:
        jobs_frame.to_excel(excel_writer, index=False, sheet_name='Jobs')

        sheet = excel_writer.sheets['Jobs']
        for position, header in enumerate(jobs_frame.columns):
            # Widest entry in the column: longest cell text or the header itself
            longest_cell = jobs_frame[header].astype(str).map(len).max()
            widest = max(longest_cell, len(header))
            # Position 0 -> "A", 1 -> "B", ...; valid for the six columns used here
            column_letter = chr(ord("A") + position)
            sheet.column_dimensions[column_letter].width = min(widest + 2, 50)

    print(f"✓ Data saved to {filename}")
    print(f"  Rows: {len(records)}")

In [None]:
# Export the records to an Excel file whose name carries the current timestamp
excel_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
excel_filename = "indeed_jobs_" + excel_timestamp + ".xlsx"
save_to_excel(records, excel_filename)

✓ Data saved to indeed_jobs_20251003_201256.xlsx
  Rows: 16


## Save to JSON


In [18]:
def save_to_json(records, filename="indeed_jobs.json"):
    """
    Write job records to a JSON file as a list of objects.

    Each record is turned into a dict keyed by title, company, location,
    salary, url and description (taken from positions 0-5 of the record).
    The file is written as UTF-8 with a 2-space indent and non-ASCII
    characters left unescaped. When ``records`` is empty a warning is
    printed and no file is created.

    Args:
        records: List of job tuples with at least six positional fields.
        filename: Path of the JSON file to write.
    """
    if not records:
        print("⚠ No records to save")
        return

    field_names = ("title", "company", "location", "salary", "url", "description")

    # Index by position so a record shorter than six fields raises IndexError
    jobs_list = [
        {field: record[position] for position, field in enumerate(field_names)}
        for record in records
    ]

    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(jobs_list, json_file, indent=2, ensure_ascii=False)

    print(f"✓ Data saved to {filename}")
    print(f"  Records: {len(jobs_list)}")

In [19]:
# Export the records to a JSON file whose name carries the current timestamp
json_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
json_filename = "indeed_jobs_" + json_timestamp + ".json"
save_to_json(records, json_filename)

✓ Data saved to indeed_jobs_20251003_201403.json
  Records: 16


## Data Quality Report


In [20]:
# Print a short quality summary of the scraped records (skipped when empty)
if records:
    df = pd.DataFrame(records, columns=["Title", "Company", "Location", "Salary", "URL", "Description"])

    report_rule = "=" * 80
    report_total = len(df)
    # A salary counts as missing when it is the empty string; a description
    # counts as missing when it is the literal text 'None'.
    missing_salaries = df['Salary'].eq('').sum()
    missing_descriptions = df['Description'].eq('None').sum()

    print("\n" + report_rule)
    print("DATA QUALITY REPORT")
    print(report_rule)
    print(f"\nTotal records: {report_total}")
    print(f"\nMissing salaries: {missing_salaries} ({missing_salaries / report_total * 100:.1f}%)")
    print(f"Missing descriptions: {missing_descriptions} ({missing_descriptions / report_total * 100:.1f}%)")

    # Same "Top 5" listing for each of the two categorical columns
    for report_label, report_column in (("companies", "Company"), ("locations", "Location")):
        print(f"\nTop 5 {report_label}:")
        print(df[report_column].value_counts().head())

    print("\n" + report_rule)


DATA QUALITY REPORT

Total records: 16

Missing salaries: 16 (100.0%)
Missing descriptions: 1 (6.2%)

Top 5 companies:
Company
General Motors                                        8
Disney Entertainment and ESPN Product & Technology    2
Advantest                                             1
Infosys                                               1
CuraFi                                                1
Name: count, dtype: int64

Top 5 locations:
Location
Mountain View, CA                                      6
San Francisco, CA 94105 \n(Financial District area)    2
Remote in Mountain View, CA                            1
San Jose, CA 95134 \n(North San Jose area)             1
Cupertino, CA                                          1
Name: count, dtype: int64



In [21]:
# Final summary: list every file produced by the export cells above
print("\n✓ All data export operations complete!")
print("\nFiles created:")
for exported_file in (csv_filename, excel_filename, json_filename):
    print(f"  - {exported_file}")


✓ All data export operations complete!

Files created:
  - indeed_jobs_20251003_201222.csv
  - indeed_jobs_20251003_201256.xlsx
  - indeed_jobs_20251003_201403.json
