In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_validate

train_data = pd.read_csv("train_final.csv")
test_data = pd.read_csv("test_final.csv")



In [2]:
# Define features
numeric_features = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
categorical_features = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

# Handle missing values
def handle_missing_values(train_data, test_data):
    train_cleaned = train_data.copy()
    test_cleaned = test_data.copy()
    
    for feature in ['workclass', 'occupation', 'native.country']:
        mode_value = train_data[feature][train_data[feature] != '?'].mode()[0]
        train_cleaned[feature] = train_cleaned[feature].replace('?', mode_value)
        test_cleaned[feature] = test_cleaned[feature].replace('?', mode_value)
    
    return train_cleaned, test_cleaned

# Clean the data
train_data_cleaned, test_data_cleaned = handle_missing_values(train_data, test_data)

# Create preprocessing pipelines
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create SGD Classifier pipeline
sgd_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(
        loss='modified_huber',      # Loss function with probability estimates
        penalty='l2',               # L2 regularization to prevent overfitting
        alpha=0.0001,              # Regularization strength
        max_iter=1000,             # Maximum number of iterations
        tol=1e-3,                  # Tolerance for stopping criterion
        random_state=42,
        learning_rate='adaptive',   # Adaptive learning rate
        eta0=0.1,                  # Initial learning rate
        early_stopping=True,        # Use early stopping
        validation_fraction=0.1,    # Fraction of training data for early stopping
        n_iter_no_change=5,        # Number of iterations with no improvement
        class_weight='balanced'     # Handle class imbalance
    ))
])

# Prepare target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data_cleaned['income>50K'])

# Perform cross-validation with multiple metrics
scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}

cv_results = cross_validate(sgd_model, 
                          train_data_cleaned.drop('income>50K', axis=1), 
                          y_train, 
                          cv=5, 
                          scoring=scoring)

# Print cross-validation results
print("\nSGD Classification Results:")
print("===========================")
print("Model Parameters:")
print("Loss function: modified_huber")
print("Penalty: L2")
print("Learning rate: adaptive")
print("Class weights: balanced")
print("\nCross-validation metrics:")
for metric in scoring.keys():
    scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize()}:")
    print(f"  Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Fit the model on full training data
sgd_model.fit(train_data_cleaned.drop('income>50K', axis=1), y_train)

# Generate predictions for test data
test_ids = test_data_cleaned['ID'].copy()
test_predictions = sgd_model.predict(test_data_cleaned.drop('ID', axis=1))

# Create submission dataframe
submission = pd.DataFrame({
    'ID': test_ids,
    'income>50K': label_encoder.inverse_transform(test_predictions)
})



SGD Classification Results:
Model Parameters:
Loss function: modified_huber
Penalty: L2
Learning rate: adaptive
Class weights: balanced

Cross-validation metrics:
Accuracy:
  Mean: 0.805 (+/- 0.006)
Precision:
  Mean: 0.563 (+/- 0.009)
Recall:
  Mean: 0.850 (+/- 0.021)
F1:
  Mean: 0.677 (+/- 0.012)




In [3]:

# Save predictions
submission.to_csv('sgd_predictions.csv', index=False)

# Print training accuracy
train_predictions = sgd_model.predict(train_data_cleaned.drop('income>50K', axis=1))
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"\nTraining Accuracy: {train_accuracy:.3f}")

# Print detailed classification report
print("\nDetailed Classification Report (Training Set):")
print(classification_report(y_train, train_predictions))


Training Accuracy: 0.807

Detailed Classification Report (Training Set):
              precision    recall  f1-score   support

           0       0.95      0.79      0.86     18984
           1       0.57      0.86      0.68      6016

    accuracy                           0.81     25000
   macro avg       0.76      0.82      0.77     25000
weighted avg       0.85      0.81      0.82     25000

