### Learning to build a Decision Tree model using the scikit-learn library on publicly available data
- This notebook follows the Kaggle Learn course "Intro to Machine Learning".
- The data is publicly available in my GitHub repo.

In [40]:
# Importing libraries
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [41]:
# Load the cell samples into a DataFrame
cell_df = pd.read_csv('cell_samples.csv')
# Summary statistics; describe() covers only numeric columns by default.
# (A bare cell_df.head() before this line was never displayed, because a cell
# only renders its last expression, so it has been dropped.)
cell_df.describe()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [42]:
# How many samples fall into each value of the target column
cell_df['Class'].value_counts()

Class
2    458
4    241
Name: count, dtype: int64

In [43]:
# Prediction target: the Class column
y = cell_df['Class']
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [44]:
# List every column name so the feature set can be picked from the actual labels
cell_df.columns

Index(['ID', 'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize',
       'BareNuc', 'BlandChrom', 'NormNucl', 'Mit', 'Class'],
      dtype='object')

In [45]:
# Columns used as model inputs.
# ID, Clump, BareNuc and the target (Class) are not part of this list.
features = [
    'UnifSize',
    'UnifShape',
    'MargAdh',
    'SingEpiSize',
    'BlandChrom',
    'NormNucl',
    'Mit',
]
X = cell_df.loc[:, features]
X.head()

Unnamed: 0,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit
0,1,1,1,2,3,1,1
1,4,4,5,7,3,2,1
2,1,1,1,2,3,1,1
3,8,8,1,3,3,7,1
4,1,1,3,2,3,1,1


In [46]:
# Split into training and validation sets with scikit-learn.
# No test_size is given, so train_test_split uses its default hold-out fraction
# (25% per the scikit-learn docs); random_state pins the shuffle so the split is reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
# Only the last expression of a cell is rendered, so the earlier bare
# train_X.head() had no visible effect and has been removed.
train_y.head()

304    4
414    4
262    4
247    4
62     4
Name: Class, dtype: int64

In [47]:
# Specify the model; random_state is fixed so repeated runs build the same tree
cell_model = DecisionTreeRegressor(random_state=1)
# Fit the model so it learns the relationship between the chosen features and the target value
cell_model.fit(train_X, train_y)
# The model is now trained on the training split. Next step: check its error on the held-out validation split.
# NOTE(review): Class only takes the values 2 and 4 in this data, so this looks like a
# classification task; a DecisionTreeClassifier with an accuracy metric may be a better fit. Confirm intent.

In [71]:
# Predict on the held-out validation split and score with mean absolute error
val_predictions = cell_model.predict(val_X)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred); pass the true
# values first, the same order get_mae uses. The result is numerically unchanged.
val_mae = mean_absolute_error(val_y, val_predictions)
val_mae

0.09142857142857143

In [54]:
# Look for a better prediction by tuning the size of the tree (underfitting vs. overfitting)
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error for a tree of limited size.

    A DecisionTreeRegressor capped at ``max_leaf_nodes`` leaves (random_state=0)
    is fitted on ``train_X``/``train_y``; its predictions for ``val_X`` are then
    compared against ``val_y``.
    """
    tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=max_leaf_nodes)
    tree.fit(train_X, train_y)
    return mean_absolute_error(val_y, tree.predict(val_X))

In [72]:
# Compare validation error across different tree sizes
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Record the MAE for each candidate so the best size is derived from the
# measurements instead of being typed in by hand.
mae_by_size = {}
for max_leaf_nodes in candidate_max_leaf_nodes:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    mae_by_size[max_leaf_nodes] = my_mae
    print("Max leaf nodes: %d \t\t Mean absolute error: %f" %(max_leaf_nodes, my_mae))

# The error stops improving at 50 leaf nodes and above. min() returns the first key
# with the lowest MAE, and the dict keeps insertion order, so a tie resolves to the
# smallest candidate tree.
best_tree_size = min(mae_by_size, key=mae_by_size.get)

Max leaf nodes: 5 		 Mean absolute error: 0.155266
Max leaf nodes: 25 		 Mean absolute error: 0.123446
Max leaf nodes: 50 		 Mean absolute error: 0.114286
Max leaf nodes: 100 		 Mean absolute error: 0.114286
Max leaf nodes: 250 		 Mean absolute error: 0.114286
Max leaf nodes: 500 		 Mean absolute error: 0.114286


###### You know the best tree size. If you were going to deploy this model in practice, you would make it even more accurate by using all of the data and keeping that tree size. That is, you don't need to hold out the validation data now that you've made all your modeling decisions.

In [74]:
# Final model: 50 leaf nodes is the smallest candidate that reached the lowest
# validation MAE in the comparison above. It is trained on the full data set (X, y).
final_model = DecisionTreeRegressor(
    max_leaf_nodes=50,
    random_state=1,
)
final_model.fit(X, y)

In [75]:
# NOTE: final_model was fitted on all of X, which contains the val_X rows, so this
# number measures error on data the model has already seen. It is not a held-out
# validation estimate and should not be compared with the earlier validation MAEs.
# This also overwrites the val_mae computed for cell_model in the earlier cell.
# Arguments follow scikit-learn's (y_true, y_pred) order.
val_mae = mean_absolute_error(val_y, final_model.predict(val_X))
val_mae

0.003706563706563699