In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Read adult_raw.csv into a DataFrame. Every '?' field is parsed as NaN and
# whitespace directly after a delimiter is skipped, so category strings do not
# carry a leading space.
df = pd.read_csv(
    'adult_raw.csv',
    skipinitialspace=True,
    na_values='?',
)
print(f"Initial Dataset Shape: {df.shape}")

Initial Dataset Shape: (32561, 15)


In [33]:
# Column groups used by the preprocessing steps below.
numerical_cols = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
categorical_cols_to_check = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Impute numerical missing values with the column mean (safety check).
# The result is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# operates on a temporary object, which pandas warns about and which does not
# update `df` at all once Copy-on-Write is enabled.
# NOTE(review): the mean is computed on the full dataset, i.e. before the
# train/test split performed later — confirm this is acceptable.
for col in numerical_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

# Only inspect categorical columns that are still present, so that re-running
# this cell after the columns were already dropped does not raise a KeyError.
remaining_categoricals = [col for col in categorical_cols_to_check if col in df.columns]

# Drop columns with missing categorical data and the redundant 'education' column
cols_to_drop = [col for col in remaining_categoricals if df[col].isnull().any()]
if 'education' in remaining_categoricals and 'education' not in cols_to_drop:
    cols_to_drop.append('education')

df = df.drop(columns=cols_to_drop)

# Define the final lists for the preprocessor
categorical_cols = [col for col in remaining_categoricals if col not in cols_to_drop]

print(f"Columns Dropped: {cols_to_drop}")
df.head()

Columns Dropped: ['workclass', 'occupation', 'native.country', 'education']


Unnamed: 0,age,fnlwgt,education.num,marital.status,relationship,race,sex,capital.gain,capital.loss,hours.per.week,income
0,90,77053,9,Widowed,Not-in-family,White,Female,0,4356,40,<=50K
1,82,132870,9,Widowed,Not-in-family,White,Female,0,4356,18,<=50K
2,66,186061,10,Widowed,Unmarried,Black,Female,0,4356,40,<=50K
3,54,140359,4,Divorced,Unmarried,White,Female,0,3900,40,<=50K
4,41,264663,10,Separated,Own-child,White,Female,0,3900,40,<=50K


In [34]:
# Separate features (X) and target (y)
X = df.drop(columns=['income'])

# Encode the target variable (y) to 0s and 1s.
# `.map` is used instead of `.replace`: replacing strings with integers makes
# pandas silently downcast the object column, which raises a FutureWarning.
income_codes = {'<=50K': 0, '>50K': 1}
y = df['income'].map(income_codes)

# `.map` turns any label missing from `income_codes` into NaN; fail loudly with
# the offending labels instead of a generic cast error.
if y.isnull().any():
    unexpected_labels = sorted(df.loc[y.isnull(), 'income'].astype(str).unique())
    raise ValueError(f"Unexpected income labels: {unexpected_labels}")
y = y.astype(int)

# Split data (80% train, 20% test); `stratify=y` keeps the class ratio the same
# in both splits and `random_state` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(f"X_train shape: {X_train.shape}")

X_train shape: (26048, 10)


  y = y.replace({'<=50K': 0, '>50K': 1}).astype(int)


In [35]:
# Per-column-group transformers: standardize the numeric columns and one-hot
# encode the categorical ones (first level of each feature dropped; categories
# not seen during fit do not raise at transform time).
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Route each column group to its transformer; any column not listed in either
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)

# Preprocessing followed by the classifier, fitted in one go on the training
# split (this fits the preprocessor as well as the model).
logreg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000)),
]).fit(X_train, y_train)
print("\n✅ Preprocessing and Logistic Regression Model Training Complete!")



✅ Preprocessing and Logistic Regression Model Training Complete!


In [36]:
# Reuse the preprocessor that was fitted as part of the pipeline, so the
# train and test splits are transformed with the statistics learned from the
# training data.
preprocessor = logreg_pipeline.named_steps['preprocessor']

# X_train / X_test are overwritten with their transformed versions because the
# following cells read them under these names. The transform is only applied
# while they are still DataFrames: the preprocessor selects columns by name, so
# running this cell a second time on the already-transformed arrays would fail.
# NOTE(review): separate names (e.g. X_train_processed) would avoid clobbering
# the raw splits; kept as-is for compatibility with the downstream cells.

# 1. Transform the training data (X_train)
if isinstance(X_train, pd.DataFrame):
    X_train = preprocessor.transform(X_train)

# 2. Transform the testing data (X_test)
if isinstance(X_test, pd.DataFrame):
    X_test = preprocessor.transform(X_test)

print("✅ Data transformation complete!")
print(f"X_train_processed shape: {X_train.shape}")
print(f"X_test_processed shape: {X_test.shape}")

✅ Data transformation complete!
X_train_processed shape: (26048, 22)
X_test_processed shape: (6513, 22)


In [37]:
# Names of the one-hot columns, taken from the fitted encoder registered under
# 'cat' in the preprocessor.
feature_names_cat = (
    preprocessor.named_transformers_['cat']
    .get_feature_names_out(categorical_cols)
    .tolist()
)

# Full column list: numeric names first, then the one-hot names.
final_feature_names = [*numerical_cols, *feature_names_cat]

# Wrap each transformed array in a labelled DataFrame and attach its target as
# an 'income' column. The target index is reset so it lines up with the fresh
# 0..n-1 index of the new frame.
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=final_feature_names).assign(
        income=target.reset_index(drop=True)
    )
    for matrix, target in ((X_train, y_train), (X_test, y_test))
)

print(f"Total features created: {len(final_feature_names)}")

Total features created: 22


In [38]:
# Write the processed training and testing frames to CSV, leaving out the
# DataFrame index.
for csv_name, split_df in (('train.csv', X_train_df), ('test.csv', X_test_df)):
    split_df.to_csv(csv_name, index=False)

# Single print call; the newline separator yields one message per line.
print(
    "\n--- Saving Complete ---",
    "Saved training features and target to: train.csv",
    "Saved testing features and target to: test.csv",
    "You can now load these CSV files directly for future model training.",
    sep="\n",
)


--- Saving Complete ---
Saved training features and target to: train.csv
Saved testing features and target to: test.csv
You can now load these CSV files directly for future model training.
