In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the dataset
df = pd.read_csv('adult_salary.csv')

In [4]:
# Explore the data
print(df.head())

   Age           Emp_type  Fnlwgt   Education  Education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

               Marital          Occupation    Relationship    Race      Sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   Capital_gain  Capital_loss  weekly_hours         Country  Income  
0          2174             0            40   United-States   <=50K  


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            48842 non-null  int64 
 1   Emp_type       48842 non-null  object
 2   Fnlwgt         48842 non-null  int64 
 3   Education      48842 non-null  object
 4   Education_num  48842 non-null  int64 
 5   Marital        48842 non-null  object
 6   Occupation     48842 non-null  object
 7   Relationship   48842 non-null  object
 8   Race           48842 non-null  object
 9   Sex            48842 non-null  object
 10  Capital_gain   48842 non-null  int64 
 11  Capital_loss   48842 non-null  int64 
 12  weekly_hours   48842 non-null  int64 
 13  Country        48842 non-null  object
 14  Income         48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
None


In [5]:
print(df.describe())

                Age        Fnlwgt  Education_num  Capital_gain  Capital_loss  \
count  48842.000000  4.884200e+04   48842.000000  48842.000000  48842.000000   
mean      38.643585  1.896641e+05      10.078089   1079.067626     87.502314   
std       13.710510  1.056040e+05       2.570973   7452.019058    403.004552   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.175505e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.781445e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.376420e+05      12.000000      0.000000      0.000000   
max       90.000000  1.490400e+06      16.000000  99999.000000   4356.000000   

       weekly_hours  
count  48842.000000  
mean      40.422382  
std       12.391444  
min        1.000000  
25%       40.000000  
50%       40.000000  
75%       45.000000  
max       99.000000  


In [8]:
print(df.tail())

       Age       Emp_type  Fnlwgt   Education  Education_num  \
48837   39        Private  215419   Bachelors             13   
48838   64              ?  321403     HS-grad              9   
48839   38        Private  374983   Bachelors             13   
48840   44        Private   83891   Bachelors             13   
48841   35   Self-emp-inc  182148   Bachelors             13   

                   Marital        Occupation     Relationship  \
48837             Divorced    Prof-specialty    Not-in-family   
48838              Widowed                 ?   Other-relative   
48839   Married-civ-spouse    Prof-specialty          Husband   
48840             Divorced      Adm-clerical        Own-child   
48841   Married-civ-spouse   Exec-managerial          Husband   

                      Race      Sex  Capital_gain  Capital_loss  weekly_hours  \
48837                White   Female             0             0            36   
48838                Black     Male             0             

In [6]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
Age              0
Emp_type         0
Fnlwgt           0
Education        0
Education_num    0
Marital          0
Occupation       0
Relationship     0
Race             0
Sex              0
Capital_gain     0
Capital_loss     0
weekly_hours     0
Country          0
Income           0
dtype: int64


In [7]:
# Preprocess the dataset
df.replace('?', pd.NA, inplace=True)  # Replace '?' with NaN
df.dropna(inplace=True)  # Drop rows with missing values

In [8]:
# Save the cleaned dataset to a new CSV file
df.to_csv('adult_salary_cleaned.csv', index=False)

In [13]:
# Select relevant columns for modeling
X = df[['Age', 'Emp_type', 'Education', 'Education_num', 'Marital', 'Occupation', 'Relationship', 'Race', 'Sex',
        'Capital_gain', 'Capital_loss', 'weekly_hours', 'Country']]
y = df['Income']

In [14]:
# Encode categorical variables
label_encoders = {}

In [15]:
for col in X.select_dtypes(include=['object']).columns:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

In [16]:
#  Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [1]:
# Select a machine learning model (Random Forest Classifier)
model = RandomForestClassifier(n_estimators=100, random_state=42)

NameError: name 'RandomForestClassifier' is not defined

In [19]:
# Build the model
model.fit(X_train_scaled, y_train)

In [20]:
# Train the model
y_pred_train = model.predict(X_train_scaled)

In [21]:
# Evaluate the model on testing data
y_pred_test = model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Testing Accuracy: {test_accuracy}")

Testing Accuracy: 0.513460947896407


In [22]:
# Create two new records with reasonable values
new_records = pd.DataFrame({
    'Age': [35, 45],
    'Emp_type': ['Private', 'Government'],
    'Education': ['Bachelors', 'Masters'],
    'Education_num': [13, 14],
    'Marital': ['Married', 'Single'],
    'Occupation': ['Exec-managerial', 'Tech-support'],
    'Relationship': ['Husband', 'Not-in-family'],
    'Race': ['White', 'Asian-Pac-Islander'],
    'Sex': ['Male', 'Female'],
    'Capital_gain': [5000, 0],
    'Capital_loss': [0, 1000],
    'weekly_hours': [40, 50],
    'Country': ['United-States', 'China']
})

In [23]:
# Encode categorical variables in new records using the same label encoder as for training data
for col in new_records.select_dtypes(include=['object']).columns:
    if col in label_encoders:
        # Check if the label encoder has learned the category in new_records[col]
        if set(new_records[col]) <= set(label_encoders[col].classes_):
            new_records[col] = label_encoders[col].transform(new_records[col])
        else:
            print(f"Unseen labels in {col} column: {set(new_records[col]) - set(label_encoders[col].classes_)}")

Unseen labels in Emp_type column: {'Private', 'Government'}
Unseen labels in Education column: {'Masters', 'Bachelors'}
Unseen labels in Marital column: {'Married', 'Single'}
Unseen labels in Occupation column: {'Exec-managerial', 'Tech-support'}
Unseen labels in Relationship column: {'Husband', 'Not-in-family'}
Unseen labels in Race column: {'White', 'Asian-Pac-Islander'}
Unseen labels in Sex column: {'Female', 'Male'}
Unseen labels in Country column: {'United-States', 'China'}


In [24]:
# Define a custom label encoder dictionary
custom_label_encoders = {}

In [25]:
# Encode categorical variables in new records using a custom label encoder
for col in new_records.select_dtypes(include=['object']).columns:
    # Initialize a new label encoder for each column
    custom_label_encoders[col] = LabelEncoder()
    
    # Ensure all values in the combined data are strings
    combined_data = pd.concat([X[col].astype(str), new_records[col].astype(str)], axis=0)
    
    # Fit the label encoder to the combined data
    custom_label_encoders[col].fit(combined_data)
    
    # Transform the values in new records using the fitted label encoder
    new_records[col] = custom_label_encoders[col].transform(new_records[col].astype(str))


In [26]:
# Feature scaling for new records
new_records_scaled = scaler.transform(new_records)

In [27]:
# Predict income for new records using the model trained on the combined data
predicted_income = model.predict(new_records_scaled)
print("\nPredicted Income for New Records:")
for income in predicted_income:
    print("Income > 50K" if income == 1 else "Income <= 50K")


Predicted Income for New Records:
Income <= 50K
Income <= 50K


## Here are the interpretations:

1. Individual 1 (Age 35, Private Employee, Bachelors in Education, Married, Exec-managerial Occupation, Husband Relationship, White Race, Male, Capital Gain of $5000, Capital Loss of $0, 40 Weekly Hours, Country: United States):
Predicted Income: Income <= $50K

Interpretation: The model predicts that this individual's annual income is less than or equal to $50,000 based on the provided features.

2. Individual 2 (Age 45, Government Employee, Masters in Education, Single, Tech-support Occupation, Not-in-family Relationship, Asian-Pac-Islander Race, Female, Capital Gain of $0, Capital Loss of $1000, 50 Weekly Hours, Country: China):
Predicted Income: Income <= $50K

Interpretation: Similarly, the model predicts that this individual's annual income is less than or equal to $50,000 based on the provided features.

These predictions are based on the trained Random Forest Classifier model using the given dataset and features.

In [28]:
# Perform 10-fold cross-validation for Random Forest Classifier
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='accuracy')
print("Cross-Validation Scores (Accuracy):")
print(cv_scores)
print(f"Mean Accuracy: {cv_scores.mean()}")

Cross-Validation Scores (Accuracy):
[0.51535312 0.50383828 0.51893552 0.51497313 0.50550294 0.51318147
 0.50806245 0.51625288 0.50268748 0.51394932]
Mean Accuracy: 0.5112736594082635


In [29]:
# Perform 10-fold cross-validation for Random Forest Classifier with scoring as 'r2' (for regression)
# Since RandomForestClassifier does not have 'r2' scoring, we'll use 'accuracy' for demonstration purposes
cv_scores_r2 = cross_val_score(model, X_train_scaled, y_train, cv=10, scoring='r2')
print("\nCross-Validation Scores (R-squared):")
print(cv_scores_r2)
print(f"Mean R-squared: {cv_scores_r2.mean()}")


Cross-Validation Scores (R-squared):
[nan nan nan nan nan nan nan nan nan nan]
Mean R-squared: nan
