# Instructions
At this point, we have created a model to predict who will make a donation and who won't.
But what about the amount of money that each person will give?
In this lab, subset those who made a donation and use that subset to create a model that predicts how much money they will give.

Evaluate the result of your model and estimate how much better the result is for the business in comparison with the naive scenario we discussed on Monday.

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [13]:
# Folder that holds the three CSV files used in this lab.  It is defined once so
# that running the notebook from another checkout only needs this line changed.
DATA_DIR = r'C:\Users\renev\lab-random-forests\files_for_lab'

# The suffixes are raw strings on purpose: '\n' and '\t' must stay literal
# backslash + letter, not become newline / tab characters.
categorical = pd.read_csv(DATA_DIR + r'\categorical.csv')
numerical = pd.read_csv(DATA_DIR + r'\numerical.csv')
target = pd.read_csv(DATA_DIR + r'\target.csv')


In [14]:
# Place the three tables side by side (column-wise) in a single frame.
combined = pd.concat(objs=[categorical, numerical, target], axis='columns')

In [17]:
# How many rows carry each value of the TARGET_B flag.
target_b_counts = target['TARGET_B'].value_counts()
print(target_b_counts)

0    90569
1     4843
Name: TARGET_B, dtype: int64


In [18]:
# Keep only the rows whose TARGET_B flag equals 1.
has_target_b = combined['TARGET_B'] == 1
subset = combined[has_target_b]

In [19]:
# Target is TARGET_D for the selected rows; features are every remaining column
# once both target columns are removed.
y = subset['TARGET_D']
X = subset.drop(['TARGET_B','TARGET_D'], axis = 1)

# Split the features by dtype.  The numeric part gets a fresh 0..n-1 index so it
# lines up row-for-row with the encoder output, which is built from a plain array.
numericalX = X.select_dtypes(np.number).reset_index(drop=True)
categoricalX = X.select_dtypes(object)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
# Kept with default integer column labels, exactly as before, because later
# cells reuse this frame as-is.
encoded_categorical = pd.DataFrame(encoded_categorical)

# Give the dummy columns string names of the form "<column>_<category>" before
# they join the named numeric columns.  Mixing string and integer column labels
# in one frame is rejected by recent scikit-learn versions when fitting.
encoded_names = [
    '{}_{}'.format(column, category)
    for column, categories in zip(categoricalX.columns, encoder.categories_)
    for category in categories[1:]  # drop='first' removes each column's first category
]
assert len(encoded_names) == encoded_categorical.shape[1], \
    "one-hot column names do not match the encoder output width"
encoded_named = encoded_categorical.copy()
encoded_named.columns = encoded_names

X = pd.concat([numericalX, encoded_named], axis = 1)

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0, test_size=0.2)

In [33]:
# Depth-5 regression tree.  random_state is fixed because the tree permutes the
# features at every split, so equally good splits would otherwise be chosen
# differently from run to run.
regr = DecisionTreeRegressor(max_depth=5, random_state=0)
# fit() returns the estimator itself, so `model` and `regr` are the same object.
model = regr.fit(X_train, y_train)

In [34]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print("test data R^2 was: ",regr.score(X_test, y_test))
print("train data R^2 was: ",regr.score(X_train, y_train))

test data accuracy was:  -0.0046262687751266185
train data accuracy was:  0.7282771432626194


In [None]:
# With max_depth=5 the tree overfits: the score (R²) is about 0.73 on the training data but roughly 0 on the test data.

In [41]:
# Shallower depth-3 regression tree.  random_state is fixed because the tree
# permutes the features at every split, so equally good splits would otherwise
# be chosen differently from run to run.
regr = DecisionTreeRegressor(max_depth=3, random_state=0)
# fit() returns the estimator itself, so `model` and `regr` are the same object.
model = regr.fit(X_train, y_train)

In [42]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the output is labelled accordingly.
print("test data R^2 was: ",regr.score(X_test, y_test))
print("train data R^2 was: ",regr.score(X_train, y_train))

test data accuracy was:  0.41326971923573774
train data accuracy was:  0.5858103454642349


In [None]:
# With max_depth=3 the model generalizes better: the test R² rises to about 0.41 (train R² about 0.59).

In [43]:
from sklearn.tree import export_text

# Render the fitted tree's split rules as indented text, labelling each split
# with the matching column name of X.
feature_labels = X.columns.tolist()
r = export_text(regr, feature_names=feature_labels)
print(r)

|--- LASTGIFT <= 20.25
|   |--- LASTGIFT <= 13.05
|   |   |--- AVGGIFT <= 6.79
|   |   |   |--- value: [6.76]
|   |   |--- AVGGIFT >  6.79
|   |   |   |--- value: [11.53]
|   |--- LASTGIFT >  13.05
|   |   |--- LASTGIFT <= 15.50
|   |   |   |--- value: [15.57]
|   |   |--- LASTGIFT >  15.50
|   |   |   |--- value: [18.90]
|--- LASTGIFT >  20.25
|   |--- LASTGIFT <= 95.00
|   |   |--- AVGGIFT <= 27.26
|   |   |   |--- value: [26.21]
|   |   |--- AVGGIFT >  27.26
|   |   |   |--- value: [44.19]
|   |--- LASTGIFT >  95.00
|   |   |--- HC13 <= 85.00
|   |   |   |--- value: [74.06]
|   |   |--- HC13 >  85.00
|   |   |   |--- value: [200.00]



In [None]:
# Trying different models

In [55]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric features and wrap the result in a DataFrame
# (default integer column labels).
# NOTE(review): the scaler is fitted on every row, before the train/test split
# made in a later cell, so held-out rows influence the scaling statistics.
# Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(numericalX)
numericalX_std = pd.DataFrame(data=scaler.transform(numericalX))


In [56]:
# Scaled numeric features followed by the one-hot columns, joined column-wise.
X2 = pd.concat(objs=[numericalX_std, encoded_categorical], axis='columns')

In [57]:
# Same 80/20 split and seed as for the unscaled features, applied to X2.
# y_train / y_test are reassigned here from the same y with the same settings.
(X2_train, X2_test,
 y_train, y_test) = train_test_split(X2, y, random_state=0, test_size=0.2)

In [58]:
from sklearn import linear_model
# Unfitted linear regression with default settings; it is fitted inside the
# model-comparison loops further down.
LR = linear_model.LinearRegression()

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Number of neighbours used for each k-nearest-neighbours prediction.
N_NEIGHBORS = 5
KNN = KNeighborsRegressor(n_neighbors=N_NEIGHBORS)

In [45]:
# Model scores before scaling:

In [48]:
# Fit each candidate on the unscaled training features and record its score()
# on the unscaled test features, in the same order as `models`.
# fit() returns the estimator, so fit and score can be chained.
models = [LR, KNN, regr]
scores = [
    estimator.fit(X_train, y_train).score(X_test, y_test)
    for estimator in models
]

In [49]:
# One row per estimator: the estimator object and its test-set score.
d = dict(model=models, score=scores)
results = pd.DataFrame(data=d)
results

Unnamed: 0,model,score
0,LinearRegression(),0.289553
1,KNeighborsRegressor(),-0.142246
2,DecisionTreeRegressor(max_depth=3),0.41327


In [None]:
# Model scores after scaling:

In [59]:
# Refit the same three estimators on the scaled training features (X2) and
# record each one's score() on the scaled test features, in `models` order.
# fit() returns the estimator, so fit and score can be chained.
models = [LR, KNN, regr]
scores = [
    estimator.fit(X2_train, y_train).score(X2_test, y_test)
    for estimator in models
]

In [60]:
# One row per estimator: the estimator object and its test-set score on scaled data.
d2 = dict(model=models, score=scores)
results = pd.DataFrame(data=d2)
results

Unnamed: 0,model,score
0,LinearRegression(),0.289553
1,KNeighborsRegressor(),0.040859
2,DecisionTreeRegressor(max_depth=3),0.41327


In [None]:
# Conclusion:
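Scaling improves the KNN score (R² rises from about -0.14 to about 0.04) and leaves the linear regression and decision tree scores unchanged. The DecisionTreeRegressor (max_depth=3) still scores the highest, with an R² of about 0.41.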