![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Random Forests

For this lab, you will be using the CSV files provided in the `files_for_lab` folder.

### Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data.
- Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? How would you change your algorithm or data in order to maximize the return of the bussiness?

In [120]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from IPython.display import display
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler

In [107]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')
data = pd.concat([numerical, categorical, targets], axis = 1)
data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [83]:
data.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


In [84]:
data.isna().sum().sum()

0

In [85]:
numerical.shape

(95412, 315)

In [86]:
data.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.0,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.0,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.0,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.0,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0


In [94]:
y = data['TARGET_B']
X = data.drop(['TARGET_B'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# we need to do train/test split before upsampling, and then only upsample the training set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [95]:
trainset = pd.concat([X_train, y_train], axis=1)

category_0 = trainset[trainset['TARGET_B']==0]
category_1 = trainset[trainset['TARGET_B']==1]

category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

print(category_0.shape)
print(category_1_oversampled.shape)

trainset_new = pd.concat([category_0, category_1_oversampled], axis = 0)
trainset_new = trainset_new.sample(frac =1) #randomize the rows
X_train = trainset_new.drop(['TARGET_B'], axis=1)
y_train = trainset_new['TARGET_B']
data = data.reset_index(drop=True)
print(X_train.shape)

(72486, 356)
(72486, 356)
(144972, 355)


In [96]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_train_regression = X_train['TARGET_D']
y_test_regression = X_test['TARGET_D']

# Now we can remove the column target d from the set of features
X_train = X_train.drop(['TARGET_D'], axis = 1)
X_test = X_test.drop(['TARGET_D'], axis = 1)

In [97]:
y_test

58053    0
9484     0
13395    0
1466     0
2076     0
        ..
94255    0
26449    0
1969     0
48574    0
73270    0
Name: TARGET_B, Length: 19083, dtype: int64

In [98]:
y_test_regression

58053    0.0
9484     0.0
13395    0.0
1466     0.0
2076     0.0
        ... 
94255    0.0
26449    0.0
1969     0.0
48574    0.0
73270    0.0
Name: TARGET_D, Length: 19083, dtype: float64

In [99]:
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6236238721960102
0.6039406801865534


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10979,  7104],
       [  454,   546]], dtype=int64)

In [None]:
# Discuss the output and its impact in the bussiness scenario. Is the cost of a false positive equals to the cost of the false negative? 
# There is the risk we calculate the probability someone will donate but this person won't OR someone won't donate but this person would.
# If the avaerage donation is 16 $ and the cost for 1 letter 0.40$ we need 1 donation of 16$ to cover the cost for this 40 persons. 
# In this case 0 is not responded to mailing list, and 1 is responded to mailing list. 
# From the array([[10979,  7104],[  454,   546]] above it apppears that the random forest classifier predicted way more false positives (1 when its actually 0) 
# than false negatives what could be bad for the business.

# After more than 1 donation the thing pays for itself. Because the "earnings" 
# How would you change your algorithm or data in order to maximize the return of the bussiness?
# With bagging or boosting.

![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Final regression model in "Health Care for All" Case

### Instructions

At this point, we have created a model to predict who will make a donation and who won't. But, what about the ammount of money that each person will give?
In this lab, subset those that made a donation and use that subset to create a model to predict how much money will they give.

Evaluate the result of your model and estimate how much better the result are for the bussiness in comparison with the naive scenario we discuss on Monday.

In [128]:
data_d = data[data['TARGET_B']==1]
data_d

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
20,2,62.000000,3,8,10,2,25,40,27,11,...,88,1,94,4,96,3,87,1,1,4.0
30,0,61.611649,5,9,0,1,37,58,16,8,...,90,4,93,1,95,12,90,4,1,7.0
45,0,66.000000,5,9,5,0,33,24,39,6,...,93,12,94,4,96,2,87,4,1,5.0
78,0,69.000000,6,9,0,0,34,20,54,2,...,90,1,95,3,95,11,90,1,1,13.0
93,1,73.000000,1,7,10,0,21,53,8,5,...,92,9,95,9,95,9,92,9,1,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95298,2,45.000000,5,9,0,0,45,28,37,9,...,89,6,96,1,96,1,86,8,1,20.0
95309,0,51.000000,5,6,1,1,32,43,24,7,...,93,10,94,2,95,12,93,10,1,15.0
95398,0,86.000000,5,9,0,1,32,21,26,9,...,89,6,95,11,96,2,87,11,1,3.0
95403,0,58.000000,4,9,0,0,24,46,20,6,...,90,3,93,12,96,1,90,3,1,10.0


In [129]:
y= data_d['TARGET_D']
X= data_d.drop(['TARGET_D'], axis = 1)

numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)

transformer = MinMaxScaler().fit(numericalX)
x_normalized = transformer.transform(numericalX)
x_normalized

x_normalized = pd.DataFrame(x_normalized, columns=numericalX.columns)

X = pd.concat([x_normalized, encoded_categorical], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [130]:
X

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,14,15,16,17,18,19,20,21,22,23
0,0.000051,0.617021,0.333333,0.888889,0.041667,0.020833,0.316456,0.404040,0.272727,0.229167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.000000,0.612890,0.666667,1.000000,0.000000,0.010417,0.468354,0.585859,0.161616,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.000000,0.659574,0.666667,1.000000,0.020833,0.000000,0.417722,0.242424,0.393939,0.125000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.691489,0.833333,1.000000,0.000000,0.000000,0.430380,0.202020,0.545455,0.041667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.000026,0.734043,0.000000,0.777778,0.041667,0.000000,0.265823,0.535354,0.080808,0.104167,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4838,0.000051,0.436170,0.666667,1.000000,0.000000,0.000000,0.569620,0.282828,0.373737,0.187500,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4839,0.000000,0.500000,0.666667,0.666667,0.004167,0.010417,0.405063,0.434343,0.242424,0.145833,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4840,0.000000,0.872340,0.666667,1.000000,0.000000,0.010417,0.405063,0.212121,0.262626,0.187500,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4841,0.000000,0.574468,0.500000,1.000000,0.000000,0.000000,0.303797,0.464646,0.202020,0.125000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [131]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

X_train = X_train.drop(['TARGET_B'], axis = 1)
X_test = X_test.drop(['TARGET_B'], axis = 1)

In [132]:
y_test

31771    20.0
80425    10.0
61896    20.0
36289     5.0
30894     4.0
         ... 
11536    51.0
48736    12.0
22224    37.0
244      16.0
63751     5.0
Name: TARGET_D, Length: 969, dtype: float64

In [133]:
clf = RandomForestRegressor(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                             max_samples=0.2,
                             random_state = 42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())

0.5171268165959964
0.4482368356978579


10.0     188
15.0     122
20.0     107
5.0      100
25.0      83
7.0       28
30.0      26
12.0      25
11.0      23
50.0      23
17.0      21
6.0       17
14.0      17
16.0      16
21.0      15
8.0       15
3.0       15
4.0       15
9.0       12
18.0      12
23.0      12
13.0      10
100.0      8
35.0       7
26.0       6
40.0       5
24.0       5
19.0       5
36.0       5
45.0       3
12.5       3
51.0       2
22.0       2
2.0        2
27.0       2
47.0       2
32.0       1
43.0       1
41.0       1
31.0       1
29.0       1
200.0      1
38.0       1
75.0       1
46.0       1
37.0       1
Name: TARGET_D, dtype: int64