In [9]:
import pandas as pd
import numpy as np
import os

# 1. Setup paths
raw_data_path = 'raw data - Sheet1.csv'
processed_data_path = 'data/processed/card_krueger_cleaned.csv'

# Create the project data directories if they do not already exist
# (exist_ok=True makes this a no-op when they are already present).
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)

# 2. Define the full set of column names based on the Card & Krueger (1994) codebook
# These correspond to the 46 variables in the public.dat file
column_names = [
    'sheet', 'chain', 'co_owned', 'state', 'region', 'southj', 'centralj', 'northj', 'pa1', 'pa2',
    'shore', 'empft', 'emppt', 'nmgrs', 'wage_st', 'ince_st', 'firstinc', 'bonus', 'pct_aff', 'meals',
    'open', 'hrsopen', 'psoda', 'pfries', 'pentree', 'nregs', 'nregs11', 'type2', 'status2', 'date2',
    'empft2', 'emppt2', 'nmgrs2', 'wage_st2', 'ince_st2', 'firstinc2', 'special2', 'meals2', 'open24_2',
    'hrsopen2', 'psoda2', 'pfries2', 'pentree2', 'nregs2', 'nregs11_2', 'status_final',
]

# Column separator: a regex matching one or more whitespace characters.
# It must be a raw string -- a plain '\s+' contains an invalid escape
# sequence and triggers a SyntaxWarning on recent Python versions.
column_sep = r'\s+'

# 3. Ingest the Data
# - sep=column_sep: handles the variable number of spaces between columns.
# - header=None / names=column_names: the raw file has no header row, so the
#   codebook names above are applied positionally.
# - na_values='.': a lone '.' in the raw file is read as a missing value (NaN).
try:
    df = pd.read_csv(
        raw_data_path,
        sep=column_sep,
        header=None,
        names=column_names,
        na_values='.',
    )

    print("✓ Data Ingestion Successful")
    print(f"✓ Observations: {df.shape[0]}")
    print(f"✓ Variables: {df.shape[1]}")

    # 4. Preliminary Data Check (Phase 1)
    print("\n--- Raw Data Head ---")
    print(df.head())

    # 5. Export for later use in the project (row index is not written)
    df.to_csv(processed_data_path, index=False)
    print(f"\n✓ Cleaned data exported to: {processed_data_path}")

except FileNotFoundError:
    # Keep the path and the hint as separate sentences so the reported path
    # is not mistaken for one ending in ".check".
    print(f"Error: Could not find the file at {raw_data_path}. Check folder structure.")
except Exception as e:
    # Best-effort notebook cell: report any other failure instead of raising.
    print(f"An error occurred: {e}")

✓ Data Ingestion Successful
✓ Observations: 410
✓ Variables: 46

--- Raw Data Head ---
   sheet  chain  co_owned  state  region  southj  centralj  northj  pa1  pa2  \
0     46      1         0      0       0       0         0       1    0    0   
1     49      2         0      0       0       0         0       1    0    0   
2    506      2         1      0       0       0         0       1    0    0   
3     56      4         1      0       0       0         0       1    0    0   
4     61      4         1      0       0       0         0       1    0    0   

   ...  special2  meals2  open24_2  hrsopen2  psoda2  pfries2  pentree2  \
0  ...      0.08     1.0       2.0       6.5    16.5     1.03       NaN   
1  ...      0.05     0.0       2.0      10.0    13.0     1.01      0.89   
2  ...      0.25     NaN       1.0      11.0    11.0     0.95      0.74   
3  ...      0.15     0.0       2.0      10.0    12.0     0.92      0.79   
4  ...      0.15     0.0       2.0      10.0    12.0     

  df = pd.read_csv(raw_data_path, sep='\s+', header=None, names=column_names, na_values='.')
