In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Load adult.csv; the first line of the file supplies the column names.
# NOTE(review): the file carries a saved index that shows up as "Unnamed: 0";
# it is never selected as a feature below, so it is left untouched here.
income_data = pd.read_csv("adult.csv", sep=",", header=0)

# Preview the first five rows (income_data.iloc[0] would show only the first row).
income_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [19]:
# Target variable: the "income" column, kept as a one-column DataFrame.
labels = income_data.loc[:, ["income"]]

In [22]:
# scikit-learn's random forest needs numeric feature columns, so the string
# columns "sex" and "native.country" are encoded as 0/1 integer flags.
# Vectorised comparisons are used instead of element-wise .apply(lambda ...).

# 0 = "Male", 1 = any other value
income_data["sex-int"] = income_data["sex"].ne("Male").astype("int64")

# 0 = "United-States", 1 = any other value
# NOTE(review): other columns use "?" for missing entries; if native.country
# does too, those rows are encoded as 1 (non-US) -- confirm this is intended.
income_data["country-int"] = (
    income_data["native.country"].ne("United-States").astype("int64")
)

In [24]:
# Feature matrix: the numeric columns the model uses to predict income.
data = income_data.loc[
    :,
    [
        "age",
        "capital.gain",
        "capital.loss",
        "hours.per.week",
        "sex-int",
        "country-int",
    ],
]

In [25]:
# Hold out 25% of the rows (scikit-learn's default) for testing; the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.25,
    random_state=1,
)

In [26]:
# Create the Random Forest Classifier.
# random_state is fixed so the randomised tree building gives the same
# model on every run; all other hyperparameters keep their defaults.
forest = RandomForestClassifier(random_state=1)

In [35]:
# Train the forest on the training split.
# The labels are a one-column DataFrame: .to_numpy() yields an array of
# shape (n, 1), and .ravel() flattens it to the 1-D shape (n,) used for fitting.
forest.fit(train_data, train_labels.to_numpy().ravel())

RandomForestClassifier(random_state=1)

In [38]:
# Print the mean accuracy of the model on the held-out test data.
# The one-column label DataFrame is flattened to 1-D exactly as it was for
# fit(), so the model is scored against the same label shape it was trained on.
print(forest.score(test_data, test_labels.values.ravel()))

0.8222577078982926


In [39]:
# Predicted income class for the first 10 rows of the test set.
# Only those rows are passed to predict(), rather than predicting the whole
# test set and discarding everything after the first 10 results.
print(forest.predict(test_data.iloc[:10]))

['<=50K' '>50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K' '<=50K'
 '<=50K']


In [40]:
# Relative importance the fitted forest assigns to each feature.
# The values follow the column order of the training data:
# ["age", "capital.gain", "capital.loss", "hours.per.week", "sex-int", "country-int"]
print(forest.feature_importances_)

[0.31767365 0.28810792 0.11310256 0.20623251 0.06619892 0.00868443]
