In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib as plot
%matplotlib inline

In [2]:
dataset = pd.read_csv('cmc.csv')
dataset.head(5)

Unnamed: 0,wifeAge,wifeEducation,husbandEducation,children,wifeReligion,wifeWorking,husbandOccupation,stdLiving,mediaExposure,contraceptiveMethod
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [3]:
y = dataset['contraceptiveMethod'].values

dataset = dataset.drop(['contraceptiveMethod'], axis=1)
X = dataset.values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [5]:
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [6]:
# Calculate the feature importances (the higher, the more important the feature).

importances = forest.feature_importances_

importances

array([ 0.34932473,  0.08259152,  0.06557417,  0.22778302,  0.03445626,
        0.04494106,  0.08950134,  0.08931171,  0.01651619])

In [7]:
feat_labels = dataset.columns[:]
feat_labels

Index(['wifeAge', 'wifeEducation', 'husbandEducation', 'children',
       'wifeReligion', 'wifeWorking', 'husbandOccupation', 'stdLiving',
       'mediaExposure'],
      dtype='object')

In [8]:
for i in range (0, len(feat_labels)):
    print('Feature: ' + str(feat_labels[i]) + ', importance: ' + str(importances[i]) )

Feature: wifeAge, importance: 0.349324730942
Feature: wifeEducation, importance: 0.0825915237901
Feature: husbandEducation, importance: 0.0655741661836
Feature: children, importance: 0.227783015629
Feature: wifeReligion, importance: 0.0344562630525
Feature: wifeWorking, importance: 0.0449410624054
Feature: husbandOccupation, importance: 0.0895013363529
Feature: stdLiving, importance: 0.0893117135726
Feature: mediaExposure, importance: 0.0165161880723


In [9]:
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[f],importances[indices[f]]))

 1) wifeAge                        0.349325
 2) wifeEducation                  0.227783
 3) husbandEducation               0.089501
 4) children                       0.089312
 5) wifeReligion                   0.082592
 6) wifeWorking                    0.065574
 7) husbandOccupation              0.044941
 8) stdLiving                      0.034456
 9) mediaExposure                  0.016516


In [None]:
title('Feature Importances')
bar(range(X_train.shape[1]), importances[indices], 
                     color='green', align='center')
xticks(range(X_train.shape[1]),
feat_labels, rotation=90)
xlim([-1, X_train.shape[1]])
tight_layout()
show()