In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # Added this import
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_validate

# Read the training and test CSVs from the current working directory
# (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train_final.csv", "test_final.csv")
)

In [6]:
# Column groups consumed by the preprocessing ColumnTransformer:
# numeric columns are scaled, categorical columns are one-hot encoded.
numeric_features = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Handle missing values
def handle_missing_values(train_data, test_data,
                          features=('workclass', 'occupation', 'native.country'),
                          missing_token='?'):
    """Replace placeholder values with the per-column training mode.

    For every column in ``features`` the most frequent value among the
    training rows that are not ``missing_token`` is computed, and that value
    replaces ``missing_token`` in both the training and the test frame.
    The statistic always comes from the training frame, so nothing from the
    test frame influences the imputation.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame; must contain every column in ``features``.
    test_data : pandas.DataFrame
        Test frame; must contain every column in ``features``.
    features : iterable of str, optional
        Columns to impute. Defaults to the three columns that were
        previously hard-coded.
    missing_token : object, optional
        Value that marks a missing entry. Defaults to ``'?'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_cleaned, test_cleaned)`` -- copies of the inputs; the
        input frames themselves are not modified.

    Raises
    ------
    KeyError
        If a column in ``features`` is absent from either frame.
    """
    train_cleaned = train_data.copy()
    test_cleaned = test_data.copy()

    for feature in features:
        observed = train_data.loc[train_data[feature] != missing_token, feature]
        candidates = observed.mode()
        if candidates.empty:
            # No observed value to impute with (column is entirely missing or
            # empty). Leave the placeholder in place instead of failing on
            # an out-of-range lookup of the first mode.
            continue
        # mode() returns its values sorted, so ties resolve deterministically.
        mode_value = candidates.iloc[0]
        train_cleaned[feature] = train_cleaned[feature].replace(missing_token, mode_value)
        test_cleaned[feature] = test_cleaned[feature].replace(missing_token, mode_value)

    return train_cleaned, test_cleaned


In [9]:
# Impute the '?' placeholders in both splits.
train_data_cleaned, test_data_cleaned = handle_missing_values(train_data, test_data)

# Preprocessing: scale the numeric columns, one-hot encode the categorical ones.
numeric_transformer = StandardScaler()
# NOTE(review): with drop='first' together with handle_unknown='ignore',
# scikit-learn encodes categories unseen at fit time as all zeros, which is
# the same encoding as the dropped first category -- confirm this is intended.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    drop='first',
    sparse_output=False,
)
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Weak learner: a shallow (depth-3) decision tree.
base_estimator = DecisionTreeClassifier(
    random_state=42,
    max_depth=3,
    min_samples_split=30,
    min_samples_leaf=15,
)

# Full pipeline: preprocessing followed by AdaBoost
# (100 weak learners, learning rate 0.1).
adaboost_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(
        estimator=base_estimator,
        learning_rate=0.1,
        n_estimators=100,
        random_state=42,
    )),
])

# Encode the target column and separate it from the predictors.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data_cleaned['income>50K'])
X_train = train_data_cleaned.drop(columns='income>50K')

# 5-fold cross-validation, scored on four metrics at once.
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}
cv_results = cross_validate(adaboost_model, X_train, y_train, scoring=scoring, cv=5)

# Report: header lines first, then mean and two standard deviations per metric.
for header_line in (
    "\nAdaBoost Classification Results:",
    "================================",
    "Base Estimator: Decision Tree",
    "Number of estimators: 100",
    "Learning rate: 0.1",
    "\nCross-validation metrics:",
):
    print(header_line)
for metric in scoring:
    scores = cv_results['test_' + metric]
    print(f"{metric.capitalize()}:")
    print(f"  Mean: {scores.mean():.3f} (+/- {2 * scores.std():.3f})")

# Refit on the complete training split before predicting.
adaboost_model.fit(X_train, y_train)

# Predict for the test split; the ID column is kept aside for the submission.
test_ids = test_data_cleaned['ID'].copy()
X_test = test_data_cleaned.drop(columns='ID')
test_predictions = adaboost_model.predict(X_test)

# Submission frame: IDs plus predictions mapped back to the original labels.
submission = pd.DataFrame({
    'ID': test_ids,
    'income>50K': label_encoder.inverse_transform(test_predictions),
})

# Feature importance analysis
def get_feature_importance(pipeline, feature_names):
    """Rank the model's input features by the classifier's importances.

    Parameters
    ----------
    pipeline : fitted pipeline
        Must expose ``named_steps['preprocessor']`` (with a fitted ``'cat'``
        entry in ``named_transformers_``) and ``named_steps['classifier']``
        (with ``feature_importances_``).
    feature_names : list
        Accepted for interface compatibility; not read by this function.
        The column names come from the module-level ``numeric_features``
        and ``categorical_features`` lists instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance``
        in descending order.
    """
    steps = pipeline.named_steps
    encoder = steps['preprocessor'].named_transformers_['cat']
    encoded_names = encoder.get_feature_names_out(categorical_features)

    # Numeric columns come first, followed by the one-hot encoded columns.
    columns = numeric_features + list(encoded_names)
    scores = steps['classifier'].feature_importances_

    table = pd.DataFrame({'feature': columns, 'importance': scores})
    return table.sort_values(by='importance', ascending=False)

# Show the ten highest-ranked features of the fitted pipeline.
print("\nTop 10 Most Important Features:")
feature_importance = get_feature_importance(
    adaboost_model,
    numeric_features + categorical_features,
)
top_features = feature_importance.head(10)
print(top_features)

# Write the submission frame to disk without the index column.
submission.to_csv(path_or_buf='adaboost_predictions.csv', index=False)





AdaBoost Classification Results:
Base Estimator: Decision Tree
Number of estimators: 100
Learning rate: 0.1

Cross-validation metrics:
Accuracy:
  Mean: 0.871 (+/- 0.006)
Precision:
  Mean: 0.799 (+/- 0.014)
Recall:
  Mean: 0.623 (+/- 0.026)
F1:
  Mean: 0.700 (+/- 0.017)





Top 10 Most Important Features:
                              feature  importance
3                        capital.gain    0.136452
0                                 age    0.135823
2                       education.num    0.099944
29  marital.status_Married-civ-spouse    0.097219
5                      hours.per.week    0.083633
4                        capital.loss    0.063563
1                              fnlwgt    0.048129
56                           sex_Male    0.031065
51                  relationship_Wife    0.024945
40           occupation_Other-service    0.023951


In [10]:
# Weak learner for the second run: depth-3 decision tree, seed 2.
base_estimator = DecisionTreeClassifier(
    random_state=2,
    max_depth=3,
    min_samples_split=30,
    min_samples_leaf=15,
)

# Pipeline for the second run: 100 weak learners, learning rate 0.01.
# `preprocessor` is not built here; it must already exist in the namespace.
adaboost_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(
        estimator=base_estimator,
        learning_rate=0.01,
        n_estimators=100,
        random_state=2,
    )),
])

# Encode the target column and separate it from the predictors.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data_cleaned['income>50K'])
X_train = train_data_cleaned.drop(columns='income>50K')

# 5-fold cross-validation, scored on four metrics at once.
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}
cv_results = cross_validate(adaboost_model, X_train, y_train, scoring=scoring, cv=5)

# Report: header lines first, then mean and two standard deviations per metric.
for header_line in (
    "\nAdaBoost Classification Results:",
    "================================",
    "Base Estimator: Decision Tree",
    "Number of estimators: 100",
    "Learning rate: 0.01",
    "\nCross-validation metrics:",
):
    print(header_line)
for metric in scoring:
    scores = cv_results['test_' + metric]
    print(f"{metric.capitalize()}:")
    print(f"  Mean: {scores.mean():.3f} (+/- {2 * scores.std():.3f})")

# Refit on the complete training split before predicting.
adaboost_model.fit(X_train, y_train)

# Predict for the test split; the ID column is kept aside for the submission.
test_ids = test_data_cleaned['ID'].copy()
X_test = test_data_cleaned.drop(columns='ID')
test_predictions = adaboost_model.predict(X_test)

# Submission frame: IDs plus predictions mapped back to the original labels.
submission = pd.DataFrame({
    'ID': test_ids,
    'income>50K': label_encoder.inverse_transform(test_predictions),
})

# Feature importance analysis
def get_feature_importance(pipeline, feature_names):
    """Build a table of feature importances, highest first.

    Note: this re-declares a helper of the same name that is already defined
    earlier in the notebook; the behavior is the same.

    Parameters
    ----------
    pipeline : fitted pipeline
        Read through ``named_steps['preprocessor']`` (its fitted ``'cat'``
        transformer supplies the encoded column names) and
        ``named_steps['classifier']`` (supplies ``feature_importances_``).
    feature_names : list
        Not used by the body; column names are taken from the module-level
        ``numeric_features`` and ``categorical_features`` lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance``
        in descending order.
    """
    fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
    one_hot_columns = list(fitted_encoder.get_feature_names_out(categorical_features))

    ranking = pd.DataFrame(
        {
            # Numeric columns first, then the one-hot encoded columns.
            'feature': numeric_features + one_hot_columns,
            'importance': pipeline.named_steps['classifier'].feature_importances_,
        }
    )
    return ranking.sort_values('importance', ascending=False)

# Show the ten highest-ranked features of the refitted pipeline.
print("\nTop 10 Most Important Features:")
feature_importance = get_feature_importance(
    adaboost_model,
    numeric_features + categorical_features,
)
top_features = feature_importance.head(10)
print(top_features)





AdaBoost Classification Results:
Base Estimator: Decision Tree
Number of estimators: 100
Learning rate: 0.01

Cross-validation metrics:
Accuracy:
  Mean: 0.847 (+/- 0.010)
Precision:
  Mean: 0.775 (+/- 0.022)
Recall:
  Mean: 0.516 (+/- 0.037)
F1:
  Mean: 0.619 (+/- 0.031)





Top 10 Most Important Features:
                              feature  importance
29  marital.status_Married-civ-spouse    0.446050
2                       education.num    0.218059
3                        capital.gain    0.139500
0                                 age    0.137232
5                      hours.per.week    0.031257
4                        capital.loss    0.027871
14                     education_12th    0.000031
6                 workclass_Local-gov    0.000000
7              workclass_Never-worked    0.000000
8                   workclass_Private    0.000000


In [11]:
# Write the current `submission` frame to a second file, without the index column.
submission.to_csv(path_or_buf='adaboost_predictions2.csv', index=False)