In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Purpose: train a random forest that predicts the 'selfMade' flag from the
# remaining columns of the dataset, then report accuracy, a classification
# report and the feature importances.
#
# Pipeline order matters: the data is split BEFORE the median imputer is
# fitted, so statistics of the held-out rows never influence the training
# features (fitting the imputer on the full frame would leak test data).

# Load dataset
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists; consider a path relative to a configurable data dir.
file_path = r'C:\Users\dsali\Downloads\BIA 500 - CAPSTONE PROJECT DATA SET (WORKING COPY).csv'
raw_df = pd.read_csv(file_path)

# drop irrelevant columns (identifiers, birth-date parts, coordinates, and the
# wealth columns 'rank' / 'finalWorth'); a new frame is built instead of
# mutating in place so that re-running the cell is idempotent
# NOTE(review): the remaining columns are not all visible in the output below;
# confirm none of them directly encodes 'selfMade' (target leakage).
columns_to_drop = ['personName', 'lastName', 'firstName', 'date', 'birthDate', 'birthDay', 'birthMonth', 'birthYear', 'latitude_country', 'longitude_country', 'rank', 'finalWorth']
df = raw_df.drop(columns=columns_to_drop)

# numeric feature columns that will be median-imputed after the split;
# the target is excluded so it is never altered by the imputer
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.drop('selfMade', errors='ignore')

# perform one hot encoding for categorical feature columns; the target is
# excluded so that df['selfMade'] still exists even if it was read as text
categorical_columns = df.select_dtypes(include=['object']).columns.drop('selfMade', errors='ignore')
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# define target variable
y = df['selfMade']

# inspect dataset... for testing only!!
# (numeric columns are shown un-imputed here; imputation happens after the split)
print("First few rows of the dataset:")
print(df.head())

print("\nBalance of the target variable 'selfMade':")
print(y.value_counts(normalize=True))

print("\nSummary statistics of the dataset:")
print(df.describe())

# select all other features for prediction
X = df.drop(columns='selfMade')

# split the dataset; stratify so both partitions keep the class balance
# printed above
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# work on copies so the column assignments below do not write into views of X
X_train = X_train.copy()
X_test = X_test.copy()

# handle missing values: fit the median imputer on the training rows only and
# reuse the learned medians for the test rows
imputer = SimpleImputer(strategy='median')
# SimpleImputer drops columns that are entirely NaN in the fitting data, which
# would break the column assignment; such columns are left to the zero-fill
imputable_columns = [col for col in numeric_columns if X_train[col].notna().any()]
if imputable_columns:
    X_train[imputable_columns] = imputer.fit_transform(X_train[imputable_columns])
    X_test[imputable_columns] = imputer.transform(X_test[imputable_columns])

# ensure no NaN values remain in the features, if so replace with zero
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# train randomForest model
rf_classifier = RandomForestClassifier(n_estimators=150, random_state=42)
rf_classifier.fit(X_train, y_train)

# prediction and evaluation
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# featureImportanceAnalysis: importances are reported in the column order the
# model was fitted on, sorted from most to least important
feature_importances = pd.Series(rf_classifier.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# results
print("\nSelf-Made Billionaire Prediction Accuracy:", accuracy)
print("Classification Report:\n", report)
print("\nFeature Importances:\n", feature_importances)


First few rows of the dataset:
    age  selfMade  cpi_country  cpi_change_country  \
0  74.0     False       110.05                 1.1   
1  51.0      True       117.24                 7.5   
2  59.0      True       117.24                 7.5   
3  78.0      True       117.24                 7.5   
4  92.0      True       117.24                 7.5   

   gross_tertiary_education_enrollment  \
0                                 65.6   
1                                 88.2   
2                                 88.2   
3                                 88.2   
4                                 88.2   

   gross_primary_education_enrollment_country  life_expectancy_country  \
0                                       102.5                     82.5   
1                                       101.8                     78.5   
2                                       101.8                     78.5   
3                                       101.8                     78.5   
4                    