In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score

In [2]:
# Load the dataset
df = pd.read_csv('adult_salary.csv')

In [3]:
print(df.shape)

(48842, 15)


In [4]:
# Print the columns of the DataFrame
print(df.columns)

Index(['Age', 'Emp_type', 'Fnlwgt', 'Education', 'Education_num', 'Marital',
       'Occupation', 'Relationship', 'Race', 'Sex', 'Capital_gain',
       'Capital_loss', 'weekly_hours', 'Country', 'Income'],
      dtype='object')


In [5]:
# Print First 5 rows
print(df.head())

   Age           Emp_type  Fnlwgt   Education  Education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

               Marital          Occupation    Relationship    Race      Sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   Capital_gain  Capital_loss  weekly_hours         Country  Income  
0          2174             0            40   United-States   <=50K  


In [6]:
# Print last 5 rows
print(df.tail())

       Age       Emp_type  Fnlwgt   Education  Education_num  \
48837   39        Private  215419   Bachelors             13   
48838   64              ?  321403     HS-grad              9   
48839   38        Private  374983   Bachelors             13   
48840   44        Private   83891   Bachelors             13   
48841   35   Self-emp-inc  182148   Bachelors             13   

                   Marital        Occupation     Relationship  \
48837             Divorced    Prof-specialty    Not-in-family   
48838              Widowed                 ?   Other-relative   
48839   Married-civ-spouse    Prof-specialty          Husband   
48840             Divorced      Adm-clerical        Own-child   
48841   Married-civ-spouse   Exec-managerial          Husband   

                      Race      Sex  Capital_gain  Capital_loss  weekly_hours  \
48837                White   Female             0             0            36   
48838                Black     Male             0             

In [7]:
# Display basic information about the dataset
print("Dataset Information:")
print(df.info())

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            48842 non-null  int64 
 1   Emp_type       48842 non-null  object
 2   Fnlwgt         48842 non-null  int64 
 3   Education      48842 non-null  object
 4   Education_num  48842 non-null  int64 
 5   Marital        48842 non-null  object
 6   Occupation     48842 non-null  object
 7   Relationship   48842 non-null  object
 8   Race           48842 non-null  object
 9   Sex            48842 non-null  object
 10  Capital_gain   48842 non-null  int64 
 11  Capital_loss   48842 non-null  int64 
 12  weekly_hours   48842 non-null  int64 
 13  Country        48842 non-null  object
 14  Income         48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
None


In [8]:
print(df.describe())

                Age        Fnlwgt  Education_num  Capital_gain  Capital_loss  \
count  48842.000000  4.884200e+04   48842.000000  48842.000000  48842.000000   
mean      38.643585  1.896641e+05      10.078089   1079.067626     87.502314   
std       13.710510  1.056040e+05       2.570973   7452.019058    403.004552   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.175505e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.781445e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.376420e+05      12.000000      0.000000      0.000000   
max       90.000000  1.490400e+06      16.000000  99999.000000   4356.000000   

       weekly_hours  
count  48842.000000  
mean      40.422382  
std       12.391444  
min        1.000000  
25%       40.000000  
50%       40.000000  
75%       45.000000  
max       99.000000  


In [9]:
# Count duplicate rows in the DataFrame
duplicate_count = df.duplicated().sum()

# Print the count of duplicate rows
print("Duplicate Rows:", duplicate_count)

# Drop duplicates
df.drop_duplicates(inplace=True)

new_duplicate_count = df.duplicated().sum()
print("Number of duplicate rows after cleaning :",new_duplicate_count)

Duplicate Rows: 29
Number of duplicate rows after cleaning : 0


In [10]:
# Check for null values in the DataFrame
null_values_count = df.isnull().sum()

# Print the count of null values in each column
print("\nNull Values:")
print(null_values_count)


Null Values:
Age              0
Emp_type         0
Fnlwgt           0
Education        0
Education_num    0
Marital          0
Occupation       0
Relationship     0
Race             0
Sex              0
Capital_gain     0
Capital_loss     0
weekly_hours     0
Country          0
Income           0
dtype: int64


In [11]:
# Preprocess the dataset
df.replace('?', pd.NA, inplace=True)  # Replace '?' with NaN

In [12]:
df.dropna(inplace=True)  # Drop rows with missing values

In [13]:
# Save the cleaned dataset to a new CSV file
df.to_csv('cleaned_adult_salary.csv', index=False)

In [14]:
# Select relevant columns for modeling
# X for features
X = df[['Age', 'Emp_type', 'Education', 'Education_num', 'Marital', 'Occupation', 'Relationship', 'Race', 'Sex',
        'Capital_gain', 'Capital_loss', 'weekly_hours', 'Country']]

In [15]:
# y for target variable
y = df['Income']

In [16]:
# Encode categorical variables
encoders = {}

In [17]:
for col in X.select_dtypes(include=['object']).columns:
    encoders[col] = LabelEncoder()
    X[col] = encoders[col].fit_transform(X[col])

Splitting Datasets into training and testing sets. 70% for training. 30% for testing

In [18]:
#  Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gradient Boosting Classifier

In [20]:
# Select a machine learning model (Gradient Boosting Classifier)
model_gb = GradientBoostingClassifier(random_state=42)

In [21]:
# Build the Gradient Boosting Classifier model
model_gb.fit(X_train_scaled, y_train)

In [22]:
# Train the Gradient Boosting Classifier model
y_pred_train_gb = model_gb.predict(X_train_scaled)

In [23]:
# Evaluate the Gradient Boosting Classifier model on testing data
y_pred_test_gb = model_gb.predict(X_test_scaled)
test_accuracy_gb = accuracy_score(y_test, y_pred_test_gb)

In [24]:
# Print the testing accuracy for Gradient Boosting Classifier
print(f"Testing Accuracy (Gradient Boosting Classifier): {test_accuracy_gb}")

Testing Accuracy (Gradient Boosting Classifier): 0.5770281343895111


In [25]:
# Create two new records with reasonable values
new_predicts = pd.DataFrame({
    'Age': [35, 45],
    'Emp_type': ['Private', 'Government'],
    'Education': ['Bachelors', 'Masters'],
    'Education_num': [9, 13],
    'Marital': ['Never-married', 'Married'],
    'Occupation': ['Adm-clerical', 'Exec-managerial'],
    'Relationship': ['Not-in-family', 'Husband'],
    'Race': ['White', 'Asian-Pac-Islander'],
    'Sex': ['Male', 'Female'],
    'Capital_gain': [5000, 0],
    'Capital_loss': [0, 1000],
    'weekly_hours': [40, 50],
    'Country': ['United-States', 'China']
})

In [26]:
# Encode categorical variables in new records using the same label encoder as for training data
for col in new_predicts.select_dtypes(include=['object']).columns:
    if col in encoders:
        # Check if the label encoder has learned the category in new_predicts[col]
        if set(new_predicts[col]) <= set(encoders[col].classes_):
            new_predicts[col] = encoders[col].transform(new_predicts[col])
        else:
            print(f"Unseen labels in {col} column: {set(new_predicts[col]) - set(encoders[col].classes_)}")

Unseen labels in Emp_type column: {'Government', 'Private'}
Unseen labels in Education column: {'Bachelors', 'Masters'}
Unseen labels in Marital column: {'Married', 'Never-married'}
Unseen labels in Occupation column: {'Adm-clerical', 'Exec-managerial'}
Unseen labels in Relationship column: {'Husband', 'Not-in-family'}
Unseen labels in Race column: {'White', 'Asian-Pac-Islander'}
Unseen labels in Sex column: {'Female', 'Male'}
Unseen labels in Country column: {'China', 'United-States'}


In [27]:
# Define a custom label encoder dictionary
custom_encoders = {}

In [28]:
# Encode categorical variables in new records using a custom label encoder
for col in new_predicts.select_dtypes(include=['object']).columns:
    # Initialize a new label encoder for each column
    custom_encoders[col] = LabelEncoder()
    
    # Ensure all values in the combined data are strings
    combined_data = pd.concat([X[col].astype(str), new_predicts[col].astype(str)], axis=0)
    
    # Fit the label encoder to the combined data
    custom_encoders[col].fit(combined_data)
    
    # Transform the values in new records using the fitted label encoder
    new_predicts[col] = custom_encoders[col].transform(new_predicts[col].astype(str))

In [29]:
# Feature scaling for new records
new_records_scaled = scaler.transform(new_predicts)

In [30]:
# Predict income for new records using the model trained on the combined data
predicted_income = model_gb.predict(new_records_scaled)
print("\nPredicted Income for New Records:")
for income in predicted_income:
    print("Income > 50K" if income == 1 else "Income <= 50K")


Predicted Income for New Records:
Income <= 50K
Income <= 50K


## Interpretations:
Interpretation: Similarly, the model predicts that this individual's annual income is less than or equal to $50,000 based on the provided features.

In [31]:
# Perform 10-fold cross-validation for GradientBoostingClassifier
cv_scores = cross_val_score(model_gb, X_train_scaled, y_train, cv=10, scoring='accuracy')
print("Cross-Validation Scores (Accuracy):")
print(cv_scores)
print(f"Mean Accuracy: {cv_scores.mean()}")

Cross-Validation Scores (Accuracy):
[0.56921276 0.57623646 0.57565116 0.5683348  0.57769974 0.57126134
 0.57974832 0.56804214 0.57330992 0.57318501]
Mean Accuracy: 0.5732681646184288
