In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.datasets import fetch_openml
from sklearn.datasets import load_digits
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd
# Sklearn tools
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [3]:
# Fetch the Titanic data set as pandas objects and show the container type.
titanic = fetch_openml("titanic", version=1, as_frame=True)
print(type(titanic))

# Columns left out of the model inputs.
dropped_columns = ['name', 'embarked', 'cabin', 'ticket', 'body', 'home.dest', 'boat', 'parch']
X = titanic.data.drop(columns=dropped_columns)

# Encode sex as an integer flag: 1 where the value equals 'male', 0 otherwise.
X['sex'] = X.sex.eq('male').mul(1)

# Missing entries in these columns are replaced by 0.
for column in ('age', 'sibsp', 'fare'):
    X[column] = X[column].fillna(0)

y = titanic.target
X.tail()

<class 'sklearn.utils.Bunch'>


Unnamed: 0,pclass,sex,age,sibsp,fare
1304,3.0,0,14.5,1.0,14.4542
1305,3.0,0,0.0,1.0,14.4542
1306,3.0,1,26.5,0.0,7.225
1307,3.0,1,27.0,0.0,7.225
1308,3.0,1,29.0,0.0,7.875


In [5]:
# Baseline: a shallow forest scored with its out-of-bag estimate.
forest = RandomForestClassifier(max_depth=2, random_state=0, oob_score=True)
forest.fit(X, y)
score = forest.oob_score_
print(score)

0.7815126050420168


In [9]:
# Hyper-parameter ranges for the exhaustive search below.
depths = range(1, 6)
criterions = ["gini", "entropy"]
splits = range(2, 8)
leaf_size = range(1, 6)

best_score = 0

# Fit one forest per combination and score it by the out-of-bag estimate.
# A combination is printed only when it strictly beats the best score so far.
for depth in depths:
    for criterion in criterions:
        for min_split in splits:
            for min_leaf in leaf_size:
                params = dict(max_depth=depth,
                              criterion=criterion,
                              min_samples_split=min_split,
                              min_samples_leaf=min_leaf,
                              random_state=0,
                              oob_score=True,
                              warm_start=False)
                forest = RandomForestClassifier(**params)
                forest.fit(X, y)
                score = forest.oob_score_
                if score > best_score:
                    print(score)
                    best_score = score
                    print(depth, criterion, min_split, min_leaf)

0.7692895339954163
1 gini 2 1
0.7738731856378915
1 entropy 2 1
0.7815126050420168
2 gini 2 1
0.7944996180290298
3 gini 2 1
0.7975553857906799
3 entropy 2 5
0.8021390374331551
4 gini 2 1
0.8036669213139801
4 gini 2 2
0.8044308632543926
4 gini 6 1
0.8051948051948052
4 gini 6 2


In [10]:
# Refit with the hand-picked settings and inspect per-feature importances.
best_params = dict(max_depth=4,
                   criterion="gini",
                   min_samples_split=6,
                   min_samples_leaf=2,
                   random_state=0,
                   oob_score=True,
                   warm_start=False)
forest = RandomForestClassifier(**best_params)
forest.fit(X, y)
feature_importances = forest.feature_importances_
print(feature_importances)

[0.16648271 0.59059524 0.05200656 0.03932236 0.15159312]


In [12]:
# Switch the working data set to digits; X and y are rebound here.
digits = load_digits()
X, y = digits.data, digits.target

In [13]:
best_score = 0

# Same exhaustive search as before, now with 200 trees per forest.
# Relies on depths, criterions, splits and leaf_size already being defined.
for depth in depths:
    for criterion in criterions:
        for min_split in splits:
            for min_leaf in leaf_size:
                params = dict(n_estimators=200,
                              max_depth=depth,
                              criterion=criterion,
                              min_samples_split=min_split,
                              min_samples_leaf=min_leaf,
                              random_state=0,
                              oob_score=True,
                              warm_start=False)
                forest = RandomForestClassifier(**params)
                forest.fit(X, y)
                score = forest.oob_score_
                # Report only strict improvements over the running best.
                if score > best_score:
                    print(score)
                    best_score = score
                    print(depth, criterion, min_split, min_leaf)

0.6944908180300501
1 gini 2 1
0.7223149693934335
1 entropy 2 1
0.8046744574290484
2 gini 2 1
0.8096828046744574
2 entropy 2 1
0.8686700055648303
3 gini 2 1
0.8781302170283807
3 entropy 2 1
0.90929326655537
4 gini 2 1
0.9131886477462438
4 gini 2 2
0.9137451307735114
4 gini 5 1
0.9154145798553144
4 gini 6 1
0.9159710628825821
4 gini 6 2
0.9187534780189204
4 entropy 2 1
0.9215358931552587
4 entropy 2 2
0.9376739009460211
5 gini 2 1
0.9387868670005565
5 gini 2 5
0.9398998330550918
5 gini 4 1
0.9493600445186422
5 entropy 2 1
0.9504730105731776
5 entropy 6 1


In [16]:
# NOTE(review): load_boston may be unavailable in newer scikit-learn releases —
# confirm against the version this notebook is pinned to.
houses = load_boston()

# Wrap the raw array in a frame so the columns carry the feature names.
X, y = pd.DataFrame(houses.data, columns=houses.feature_names), houses.target

# Seeded 80/20 hold-out split.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [19]:
best_score = 0

# NOTE(review): newer scikit-learn releases deprecate the "auto" option for
# max_features — confirm it is accepted by the version in use.
max_features = ["auto", "sqrt", "log2"]

# Search depth x feature-subset rule, scored by the out-of-bag estimate.
for depth in depths:
    for feature_rule in max_features:
        params = dict(n_estimators=100,
                      max_depth=depth,
                      max_features=feature_rule,
                      random_state=0,
                      oob_score=True,
                      warm_start=False)
        forest = RandomForestRegressor(**params)
        forest.fit(X, y)
        score = forest.oob_score_
        # Report only strict improvements over the running best.
        if score > best_score:
            print(score)
            best_score = score
            print(depth, feature_rule)

0.5415668952901658
1 auto
0.7248722332815014
2 auto
0.8013579316805193
3 auto
0.8356087612758278
4 auto
0.8467031282075687
5 auto


In [20]:
# Refit the regressor with the hand-picked settings and show feature importances.
best_params = dict(n_estimators=100,
                   max_depth=5,
                   max_features="auto",
                   random_state=0,
                   oob_score=True,
                   warm_start=False)
forest = RandomForestRegressor(**best_params)
forest.fit(X, y)
feature_importances = forest.feature_importances_
print(feature_importances)

[3.87842251e-02 3.87593078e-04 2.87818872e-03 5.57128755e-04
 1.62322238e-02 4.78424044e-01 5.39141305e-03 6.26473029e-02
 1.39249103e-03 9.72118029e-03 1.26537671e-02 6.18507659e-03
 3.64745365e-01]


In [25]:
# Rank the feature positions once, ascending by importance; np.argsort returns
# a new index array and leaves feature_importances untouched.
importance_order = np.argsort(feature_importances)

print("Attribute importances from smallest (left) to largest (right)")
print(importance_order)
print("Most important attribute indices")
# Last three positions of the ascending ranking (still in ascending order).
print(importance_order[-3:])
print("Least important attribute indices")
print(importance_order[:3])

Attribute importances from smallest (left) to largest (right)
[ 1  3  8  2  6 11  9 10  4  0  7 12  5]
Most important attribute indices
[ 7 12  5]
Least important attribute indices
[1 3 8]


In [26]:
# Map raw CSV headers (several carry stray leading/trailing spaces) to tidier names.
# NOTE(review): 'Diptheria' is kept exactly as spelled because it becomes a
# runtime column name; headers not listed here keep their original names.
column_names = {
    'Life expectancy ': 'Life Expectancy',
    'infant deaths': 'Infant Deaths',
    'percentage expenditure': 'Percentage Expenditure',
    'Measles ': 'Measles',
    ' BMI ': 'BMI',
    'under-five deaths ': 'Under 5 Deaths',
    'Diphtheria ': 'Diptheria',
    ' HIV/AIDS': 'HIV/AIDS',
    ' thinness  1-19 years': 'Thinness 1-19 years',
    ' thinness 5-9 years': 'Thinness 5-9 years',
    'Income composition of resources': 'Income Comp',
}

# Load and rename in one expression so re-running the cell is idempotent.
life = pd.read_csv('Life_Expectancy_Data.csv').rename(columns=column_names)

In [27]:
def clean_na(df):
    """Fill missing values in place, one country and one column at a time.

    For every column except 'Country', each missing cell of a country is
    filled as follows:

    * if the rows labelled ``i - 1`` and ``i + 1`` belong to the same country
      and both hold a value in that column, use the mean of those two values;
    * otherwise use the mean of that country's observed values in the column.

    A column that is entirely missing for a country has nothing to average
    and is left untouched.

    The neighbour lookup uses index-label arithmetic (``i - 1`` / ``i + 1``),
    so it relies on integer row labels in which adjacent labels are adjacent
    observations of the same country.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Country' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after filling.
    """
    col = df.columns.drop('Country')
    for country in df.Country.unique():
        # Snapshot of this country's rows; missing positions are decided from
        # the snapshot, so values filled below never count as "observed".
        cur_country = df[df.Country == country]
        ind = cur_country.index
        for c in col:
            na_ind = cur_country[cur_country[c].isna()].index
            if len(na_ind) == 0:
                continue
            na_labels = set(na_ind)
            value_ind = [i for i in ind if i not in na_labels]
            if not value_ind:
                # Nothing observed for this country/column: no mean to use.
                continue
            # Set gives constant-time neighbour checks inside the row loop.
            observed = set(value_ind)
            # The observed rows are never written to, so the fallback mean is
            # the same for every missing row and can be computed once.
            country_mean = np.mean(df.loc[value_ind, c])
            for i in na_ind:
                if i - 1 in observed and i + 1 in observed:
                    # Both neighbours hold values: interpolate between them.
                    df.at[i, c] = np.mean(df.loc[[i - 1, i + 1], c])
                else:
                    # First/last row of the country, or next to another gap:
                    # fall back to the country's own mean for this column.
                    df.at[i, c] = country_mean
    return df

In [28]:
def clean_na_col(df):
    """Fill columns that are entirely missing for a country, in place.

    Only (country, column) pairs where every row of the country is missing
    are touched. Each such cell is set to the mean of that column over all
    rows that share the country's Status and the cell's Year.

    The frame must provide 'Country', 'Status' and 'Year' columns. The same
    ``df`` object is returned after modification.
    """
    candidate_columns = df.columns.drop('Country')
    for country in df.Country.unique():
        rows = df[df.Country == country]
        for c in candidate_columns:
            missing = rows[rows[c].isna()].index
            if len(missing) == 0:
                continue
            # Act only when no row of this country holds a value for the column.
            if not all(label in missing for label in rows.index):
                continue
            # Status (developing or developed) is read from the country's first row.
            status = rows.Status.iloc[0]
            peers = df[df.Status == status]
            for i in missing:
                year = df.loc[i, "Year"]
                # Average over rows with the same status in the same year.
                same_year_peers = peers[peers["Year"] == year]
                df.at[i, c] = np.mean(same_year_peers[c])
    return df

In [29]:
# Both cleaners fill values in place and return the frame they were given;
# keep working with the returned handle so each step visibly feeds the next
# instead of silently relying on `life` having been mutated.
df = clean_na(life)
df = clean_na_col(df)
df = pd.get_dummies(df, columns=['Status','Country'], drop_first=True)

# Features are every column except the regression target.
X = df.drop(columns=['Life Expectancy'])
y = df['Life Expectancy']

# Fixed seed so the 70/30 hold-out split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [30]:
# Preview the first rows of the feature matrix after one-hot encoding.
X.head()

Unnamed: 0,Year,Adult Mortality,Infant Deaths,Alcohol,Percentage Expenditure,Hepatitis B,Measles,BMI,Under 5 Deaths,Polio,...,Country_United Republic of Tanzania,Country_United States of America,Country_Uruguay,Country_Uzbekistan,Country_Vanuatu,Country_Venezuela (Bolivarian Republic of),Country_Viet Nam,Country_Yemen,Country_Zambia,Country_Zimbabwe
0,2015,263.0,62,0.01,71.279624,65.0,1154,19.1,83,6.0,...,0,0,0,0,0,0,0,0,0,0
1,2014,271.0,64,0.01,73.523582,62.0,492,18.6,86,58.0,...,0,0,0,0,0,0,0,0,0,0
2,2013,268.0,66,0.01,73.219243,64.0,430,18.1,89,62.0,...,0,0,0,0,0,0,0,0,0,0
3,2012,272.0,69,0.01,78.184215,67.0,2787,17.6,93,67.0,...,0,0,0,0,0,0,0,0,0,0
4,2011,275.0,71,0.01,7.097109,68.0,3013,17.2,97,68.0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
best_score = 0

n_estimators = range(50, 350, 50)

# Search depth x forest size x feature-subset rule, scored by the out-of-bag
# estimate. Relies on depths and max_features already being defined.
for depth in depths:
    for n_trees in n_estimators:
        for feature_rule in max_features:
            params = dict(n_estimators=n_trees,
                          max_depth=depth,
                          max_features=feature_rule,
                          random_state=0,
                          oob_score=True,
                          warm_start=False)
            forest = RandomForestRegressor(**params)
            forest.fit(X, y)
            score = forest.oob_score_
            # Report only strict improvements over the running best.
            if score > best_score:
                print(score)
                best_score = score
                print(n_trees, depth, feature_rule)


0.5862298365325955
50 1 auto
0.5871501341546084
100 1 auto
0.5874531570928128
150 1 auto
0.5877983666395521
200 1 auto
0.7570339943321958
50 2 auto
0.765623700698796
100 2 auto
0.8555526213401523
50 3 auto
0.858947008979842
100 3 auto
0.8969902058042433
50 4 auto
0.8995727898478444
100 4 auto
0.92237029565207
50 5 auto
0.9243685538332991
100 5 auto
0.9244331278708443
150 5 auto
0.9246003010189403
200 5 auto
0.924706457942038
250 5 auto
0.9250068634631685
300 5 auto


In [40]:
# Refit the regressor with the hand-picked settings and show feature importances.
best_params = dict(n_estimators=300,
                   max_depth=5,
                   max_features="auto",
                   random_state=0,
                   oob_score=True,
                   warm_start=False)
forest = RandomForestRegressor(**best_params)
forest.fit(X, y)
feature_importances = forest.feature_importances_
print(feature_importances)

[2.85819031e-03 1.20131155e-01 2.79461680e-03 2.89662805e-03
 4.53416271e-04 3.08435788e-04 7.97997440e-04 1.11172251e-02
 1.25205650e-02 1.23784418e-03 2.58415672e-04 9.29061506e-04
 6.41852814e-01 5.75020868e-04 1.39450709e-04 3.49905055e-03
 1.13094020e-03 1.78473028e-01 1.50979607e-02 1.58608512e-05
 0.00000000e+00 0.00000000e+00 7.86028047e-05 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.30989605e-06
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.16000667e-05 9.48998363e-06
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 3.23225199e-06 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 3.18451722e-06 9.30341217e-05
 0.00000000e+00 0.000000

In [43]:
# Rank the feature positions once, ascending by importance; np.argsort returns
# a new index array and leaves feature_importances untouched.
importance_order = np.argsort(feature_importances)

print("Attribute importances from smallest (left) to largest (right)")
print(importance_order)

print("Most important attribute indices")
# Top three positions, reversed so the most important comes first.
most_important = importance_order[-3:][::-1]
print(most_important)

print("Least important attribute indices")
least_important = importance_order[:3]
print(least_important)

Attribute importances from smallest (left) to largest (right)
[105 175 108 109 110 111 112 113 106 114 117 118 119 120 121 122 123 115
 124 210 177  82  83 181 169  86  88 180 104 179  93  95  96  97  98  99
 100  92 125 126 127 152 153 154 155 174 157 158 151 159 161 162 173 164
 165 167 168 160 150 149 148 128 129 130 131 132 134 136 137 138 139 141
 142 143 144 145 146 147  81  80  85  40  23  24  25  26  27  28  29  30
  31  32  33  34  36  37  38  39  41 190  42  21 191 209 208 207 206 205
 204 203 202 201 200 199 198 197 196 195 194 192  20  43 170  45  68  71
  66  65  73  64 187  74  61  44  60  59  58  57 186 185  55 189 188  48
  77  49  56  50  70  76  53  54  51 103  67 166 184 133  35 102 193 116
  72 178  62  52 140  78  87 135  90 211  47  94 101 107 182  46  84  69
  19 183  89 163  91  22 176  63  14  75 172  10   5 171  79   4  13   6
  11  16 156   9   2   0   3  15   7   8  18   1  17  12]
Most important attribute indices
[12 17  1]
Least important attribute indices

In [44]:
# Translate the ranked column positions into the column names of X.
most_important_attributes = X.columns[most_important]
least_important_attributes = X.columns[least_important]
print("Most important attributes")
print(most_important_attributes)
# These are column names rather than positional indices, so label them as such.
print("Least important attributes")
print(least_important_attributes)

Most important attributes
Index(['HIV/AIDS', 'Income Comp', 'Adult Mortality'], dtype='object')
Least important attribute indices
Index(['Country_Kazakhstan', 'Country_Solomon Islands', 'Country_Kuwait'], dtype='object')
