In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_validate
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read the training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train_final.csv", "test_final.csv")
)

In [2]:
# Show the column index of the training frame, then of the test frame.
for loaded_frame in (train_data, test_data):
    print(loaded_frame.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income>50K'],
      dtype='object')
Index(['ID', 'age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'],
      dtype='object')


In [7]:
# Find the columns that use the '?' placeholder for a missing value,
# separately for the training set and the test set.
def find_placeholder_columns(data, placeholder='?'):
    """Return the names of the columns of ``data`` that contain ``placeholder``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.
    placeholder : str, optional
        Value that marks a missing entry (default ``'?'``).

    Returns
    -------
    list
        Column names, in the frame's column order, holding at least one
        entry equal to ``placeholder``.
    """
    # A vectorised comparison followed by .any() replaces counting the matches
    # with the builtin sum, which walks the column element by element.
    return [column for column in data.columns if (data[column] == placeholder).any()]


missing_features = find_placeholder_columns(train_data)
print(missing_features)

missing_test_features = find_placeholder_columns(test_data)
print(missing_test_features)

['workclass', 'occupation', 'native.country']
['workclass', 'occupation', 'native.country']


In [8]:
# Let's handle missing values in both train and test data
def handle_missing_values(data, features=('workclass', 'occupation', 'native.country'),
                          missing_marker='?', fill_values=None):
    """Return a copy of ``data`` with placeholder entries replaced column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean; it is copied and never modified.
    features : iterable of str, optional
        Columns to impute. Defaults to the three categorical columns this
        function has always handled.
    missing_marker : str, optional
        Value that marks a missing entry (default ``'?'``).
    fill_values : mapping, optional
        Column name -> replacement value. A column that is absent from the
        mapping (or every column when this is ``None``) is filled with its
        own mode, computed with the marker excluded. Supplying the
        training-set modes here lets a test set be imputed with the same
        values as the training set.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If a requested feature is not a column of ``data``.
    ValueError
        If a column contains the marker but has no other value to derive a
        mode from and no entry in ``fill_values``.
    """
    data = data.copy()
    fill_values = {} if fill_values is None else dict(fill_values)

    for feature in features:
        column = data[feature]
        is_missing = column == missing_marker

        # Nothing to impute (this also covers an empty frame), so do not
        # require a mode to exist.
        if not is_missing.any():
            continue

        if feature in fill_values:
            replacement = fill_values[feature]
        else:
            # Mode of the genuinely observed values only.
            observed_modes = column[~is_missing].mode()
            if observed_modes.empty:
                raise ValueError(
                    f"Cannot impute '{feature}': the column has no observed values "
                    f"other than {missing_marker!r} and no fill value was supplied."
                )
            replacement = observed_modes.iloc[0]

        data[feature] = column.replace(missing_marker, replacement)

    return data

# Apply missing value handling
train_data_cleaned = handle_missing_values(train_data)

# Impute the test set with the modes learned from the training set, so test
# rows are preprocessed exactly like the rows the model is trained on (the
# test set's own most frequent category may differ from the training one).
test_data_cleaned = test_data.copy()
for feature in ['workclass', 'occupation', 'native.country']:
    # '?' has already been replaced in the cleaned training column, so its
    # mode is the most frequent genuine category of the training set.
    train_mode = train_data_cleaned[feature].mode()[0]
    test_data_cleaned[feature] = test_data_cleaned[feature].replace('?', train_mode)

In [11]:
# Confirm that no '?' placeholder survived the cleaning step, first in the
# training frame and then in the test frame.
checked_features = ['workclass', 'occupation', 'native.country']
for cleaned_frame in (train_data_cleaned, test_data_cleaned):
    for feature in checked_features:
        q_count = sum(cleaned_frame[feature] == '?')
        print(f"{feature}: {q_count} '?' values remaining")

workclass: 0 '?' values remaining
occupation: 0 '?' values remaining
native.country: 0 '?' values remaining
workclass: 0 '?' values remaining
occupation: 0 '?' values remaining
native.country: 0 '?' values remaining


In [14]:
print("\nChecking all types of missing values:")
for cleaned_frame in (train_data_cleaned, test_data_cleaned):
    # Per-column count of None/NaN entries, followed by the grand total.
    null_counts = cleaned_frame.isnull().sum()
    print(null_counts)
    print("\nTotal missing values:", null_counts.sum())


Checking all types of missing values:
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income>50K        0
dtype: int64

Total missing values: 0
ID                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

Total missing values: 0


In [15]:
# Input columns, grouped by the preprocessing they receive.
numeric_features = [
    'age', 'fnlwgt', 'education.num',
    'capital.gain', 'capital.loss', 'hours.per.week',
]
categorical_features = [
    'workclass', 'education', 'marital.status', 'occupation',
    'relationship', 'race', 'sex', 'native.country',
]

# Numeric columns are standardised.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns are one-hot encoded with the first level of each column
# dropped; the encoder is configured to ignore unknown categories.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
categorical_transformer = Pipeline(steps=[('onehot', onehot_encoder)])

# Route each column group through its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a logistic-regression classifier.
classifier = LogisticRegression(max_iter=1000, random_state=2)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Encode the target column as integer class labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data_cleaned['income>50K'])

# Score the pipeline with 5-fold cross-validation on several metrics at once.
scoring = {metric_name: metric_name for metric_name in ('accuracy', 'precision', 'recall', 'f1')}

cv_features = train_data_cleaned.drop(columns='income>50K')
cv_results = cross_validate(model, cv_features, y_train, cv=5, scoring=scoring)

# Report the mean of each metric with a band of two standard deviations.
print("\nCross-validation results:")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize()}:")
    print(f"  Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Report the number of '?' placeholders per column in the raw training frame
# and again in the cleaned one.
report_stages = (
    ("\nMissing value counts before cleaning:", train_data),
    ("\nMissing value counts after cleaning:", train_data_cleaned),
)
for heading, stage_frame in report_stages:
    print(heading)
    for feature in ['workclass', 'occupation', 'native.country']:
        print(f"{feature}: {sum(stage_frame[feature] == '?')} missing values")


Cross-validation results:
Accuracy:
  Mean: 0.850 (+/- 0.010)
Precision:
  Mean: 0.736 (+/- 0.022)
Recall:
  Mean: 0.590 (+/- 0.030)
F1:
  Mean: 0.655 (+/- 0.026)

Missing value counts before cleaning:
workclass: 1437 missing values
occupation: 1442 missing values
native.country: 427 missing values

Missing value counts after cleaning:
workclass: 0 missing values
occupation: 0 missing values
native.country: 0 missing values


In [16]:

# Train the pipeline on every labelled row.
full_train_features = train_data_cleaned.drop(columns='income>50K')
model.fit(full_train_features, y_train)

# Predict the test rows; the ID column is set aside and not fed to the model.
test_ids = test_data_cleaned['ID'].copy()
test_features = test_data_cleaned.drop(columns='ID')
test_predictions = model.predict(test_features)

# Pair every ID with its prediction, mapped back through the label encoder.
predicted_labels = label_encoder.inverse_transform(test_predictions)
submission = pd.DataFrame({'ID': test_ids, 'income>50K': predicted_labels})



In [17]:
# Feature importance analysis
def get_feature_importance(pipeline, feature_names):
    """Rank the model inputs by the absolute value of their coefficient.

    Parameters
    ----------
    pipeline : fitted pipeline
        Must expose ``named_steps['preprocessor']`` whose
        ``named_transformers_['cat']`` holds an ``'onehot'`` step, and
        ``named_steps['classifier']`` with a ``coef_`` array.
    feature_names : list of str
        Raw input column names handed to the preprocessor. Every name that
        the one-hot encoder was not fitted on is treated as a numeric column.
        NOTE(review): the numeric names are assumed to be listed in the order
        the preprocessor emits them, ahead of the one-hot columns — confirm
        against the ColumnTransformer definition.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance`` (absolute coefficient), sorted
        from most to least important.

    Raises
    ------
    ValueError
        If the number of derived feature names differs from the number of
        coefficients.
    """
    onehot = (pipeline.named_steps['preprocessor']
              .named_transformers_['cat']
              .named_steps['onehot'])

    # Ask the fitted encoder which columns it saw instead of relying on
    # notebook-level globals; requires the encoder to have been fitted on
    # named columns so that feature_names_in_ is set.
    categorical_inputs = list(onehot.feature_names_in_)
    categorical_features_encoded = list(onehot.get_feature_names_out(categorical_inputs))

    categorical_lookup = set(categorical_inputs)
    numeric_inputs = [name for name in feature_names if name not in categorical_lookup]
    all_features = numeric_inputs + categorical_features_encoded

    # Row 0 of coef_ is used, as in a single-row (binary) coefficient matrix.
    coefficients = np.asarray(pipeline.named_steps['classifier'].coef_)[0]

    if len(all_features) != len(coefficients):
        raise ValueError(
            f"Got {len(all_features)} feature names for {len(coefficients)} coefficients; "
            "feature_names does not match the columns the pipeline was fitted on."
        )

    feature_importance = pd.DataFrame({
        'feature': all_features,
        'importance': np.abs(coefficients)
    })

    return feature_importance.sort_values('importance', ascending=False)

# Show the ten inputs with the largest absolute coefficients.
print("\nMost important features:")
raw_feature_names = numeric_features + categorical_features
feature_importance = get_feature_importance(model, raw_feature_names)
top_features = feature_importance.head(10)
print(top_features)


Most important features:
                              feature  importance
3                        capital.gain    2.446761
29  marital.status_Married-civ-spouse    1.954115
28   marital.status_Married-AF-spouse    1.500919
41         occupation_Priv-house-serv    1.440416
89               native.country_South    1.239998
75             native.country_Ireland    1.195318
59            native.country_Columbia    1.147261
51                  relationship_Wife    1.119608
37         occupation_Farming-fishing    1.041549
10         workclass_Self-emp-not-inc    1.022040


In [18]:
# Write the submission table to disk without the index column.
submission_path = 'income_predictions_logistic_regression.csv'
submission.to_csv(submission_path, index=False)