# Evalutatoin Matrix

* 1- Accuracy = (# predicted correctly) / (total # of observations)
* 2- Precision =(# predicted as spam that are acutally spam) / (total # predicted as spam)
*  3- Recall = (# predicted as spam that are acutally spam) / (total # that are actually spam)

# Ensemble Method
* Technique that create multiple methods and then combine them to produce better results than any single model individually

## Random Forest
* Ensemble learning method that construct a collection of decision trees and then aggregate the predictions of each tree to determine the final result

# Building Machine Learning Classifier : Building a basic random forest model

### Read in & Clean text

In [4]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

pd.set_option('display.max_colwidth' ,100)

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns = ['lable' , 'body_text']
# 
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / ( len(text) - text.count(" ") ) , 3) * 100

data['punct%'] = data['body_text'].apply(lambda x : count_punct(x))
data['body_lenght'] = data['body_text'].apply(lambda x : len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_feature = pd.concat([ data['body_lenght'] , data['punct%'] , pd.DataFrame(X_tfidf.toarray()) ] , axis = 1)
X_feature.head()


Unnamed: 0,body_lenght,punct%,0,1,2,3,4,5,6,7,...,8050,8051,8052,8053,8054,8055,8056,8057,8058,8059
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Explore RandomForestClassifier Attributes & Hyperparameter

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0

# Explore RandomForestClassifier through CrossValidation

In [7]:
from sklearn.model_selection import KFold , cross_val_score

In [8]:
rf = RandomForestClassifier(n_jobs= -1)
k_fold = KFold(n_splits = 5)
cross_val_score(rf,X_feature,data['lable'] , cv = k_fold , scoring='accuracy' , n_jobs=-1)

array([0.96681614, 0.97309417, 0.97217235, 0.96947935, 0.97127469])

# Explore RandomForestClassifier through Holdout Set

In [9]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X_feature , data['lable'] , test_size = 0.2)

In [11]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50 , max_depth=20,n_jobs=-1)
rf_model = rf.fit(X_train , y_train)

In [13]:
sorted(zip(rf_model.feature_importances_,X_train.columns) , reverse=True)[0:10]

[(0.0539549297376766, 7292),
 (0.04174574943757594, 1789),
 (0.03886012264006324, 'body_lenght'),
 (0.032997903433765206, 8038),
 (0.02647778568939725, 4765),
 (0.025270395576718004, 2018),
 (0.024713848425015388, 3118),
 (0.022660366479336388, 6976),
 (0.017865654077299297, 5683),
 (0.017793830692806326, 6695)]

In [16]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore , support = score(y_test,y_pred , pos_label='spam' , average = 'binary')

In [24]:
print('Precision = {} \nRecall = {} \nAccuracy = {}'.format(round(precision ,3),
                                                      round(recall,3),
                                                      round((y_pred == y_test).sum() / len(y_pred),3)))

Precision = 1.0 
Recall = 0.643 
Accuracy = 0.951


# Explore Random Forest Model with Grid-Search

In [25]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

pd.set_option('display.max_colwidth' ,100)

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns = ['lable' , 'body_text']
# 
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / ( len(text) - text.count(" ") ) , 3) * 100

data['punct%'] = data['body_text'].apply(lambda x : count_punct(x))
data['body_lenght'] = data['body_text'].apply(lambda x : len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_feature = pd.concat([ data['body_lenght'] , data['punct%'] , pd.DataFrame(X_tfidf.toarray()) ] , axis = 1)
X_feature.head()


Unnamed: 0,body_lenght,punct%,0,1,2,3,4,5,6,7,...,8050,8051,8052,8053,8054,8055,8056,8057,8058,8059
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Build our owen grid-search

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [27]:
X_train , X_test , y_train , y_test = train_test_split(X_feature , data['lable'] , test_size = 0.2)

In [30]:
def train_RF(n_est , depth):
    rf = RandomForestClassifier(n_estimators=n_est , max_depth =depth ,n_jobs= -1 )
    rf_model = rf.fit(X_train , y_train)
    y_pred = rf.predict(X_test)
    precision, recall, fscore , support = score(y_test,y_pred,pos_label = 'spam',average='binary')
    print('Est: {} / Depth: {} --- Precision = {} Recall = {} Accuracy = {}'.format(
        n_est , depth , round(precision , 3) , round(recall , 3) ,  round(recall , 3) ,
        round((y_pred == y_test).sum() / len(y_pred),3)))

In [31]:
for n_est in [10,50,100]:
    for depth in [10,20,30,None]:
        train_RF(n_est , depth)

Est: 10 / Depth: 10 --- Precision = 1.0 Recall = 0.378 Accuracy = 0.378
Est: 10 / Depth: 20 --- Precision = 1.0 Recall = 0.622 Accuracy = 0.622
Est: 10 / Depth: 30 --- Precision = 0.99 Recall = 0.662 Accuracy = 0.662
Est: 10 / Depth: None --- Precision = 0.984 Recall = 0.831 Accuracy = 0.831
Est: 50 / Depth: 10 --- Precision = 1.0 Recall = 0.345 Accuracy = 0.345
Est: 50 / Depth: 20 --- Precision = 1.0 Recall = 0.642 Accuracy = 0.642
Est: 50 / Depth: 30 --- Precision = 1.0 Recall = 0.716 Accuracy = 0.716
Est: 50 / Depth: None --- Precision = 1.0 Recall = 0.824 Accuracy = 0.824
Est: 100 / Depth: 10 --- Precision = 1.0 Recall = 0.358 Accuracy = 0.358
Est: 100 / Depth: 20 --- Precision = 1.0 Recall = 0.622 Accuracy = 0.622
Est: 100 / Depth: 30 --- Precision = 1.0 Recall = 0.73 Accuracy = 0.73
Est: 100 / Depth: None --- Precision = 1.0 Recall = 0.824 Accuracy = 0.824


# Building Machine Learning Classifier : Evaluate Random Forest with GridSearchCV

**Grid search** : builds a model for every combination of hyperparameters specified and evaluates each model.

**cross validation** : Divided dataset into K subsets and repeat the holdout method K times where a different subset is used as the holdout set in each iteration.

### Read in text

In [37]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

pd.set_option('display.max_colwidth' ,100)

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns = ['lable' , 'body_text']
# 
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / ( len(text) - text.count(" ") ) , 3) * 100

data['punct%'] = data['body_text'].apply(lambda x : count_punct(x))
data['body_lenght'] = data['body_text'].apply(lambda x : len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text

#TF-IDF
tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feature = pd.concat([ data['body_lenght'] , data['punct%'] , pd.DataFrame(X_tfidf.toarray()) ] , axis = 1)

# CountVecztorizer
count_vect  = CountVectorizer(analyzer = clean_text)
x_count = count_vect.fit_transform(data['body_text'])
X_count_feature = pd.concat([ data['body_lenght'] , data['punct%'] , pd.DataFrame(X_tfidf.toarray()) ] , axis = 1)

X_count_feature.head()

Unnamed: 0,body_lenght,punct%,0,1,2,3,4,5,6,7,...,8050,8051,8052,8053,8054,8055,8056,8057,8058,8059
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Exploring parameter setting using GridSearchCV

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [45]:
rf = RandomForestClassifier()
param = {'n_estimators' : [10,50,300],
        'max_depth' : [30,60,90,None]}
gs = GridSearchCV(rf , param , cv = 5 , n_jobs= -1)
gs_fit = gs.fit(X_tfidf_feature , data['lable'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score' , ascending= False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,6.91729,0.045603,0.197321,0.009511,90.0,50,"{'max_depth': 90, 'n_estimators': 50}",0.976682,0.979372,0.977558,0.97307,0.97307,0.975951,0.002507,1
11,29.038807,1.691639,0.259905,0.034671,,300,"{'max_depth': None, 'n_estimators': 300}",0.977578,0.975785,0.977558,0.970377,0.974865,0.975233,0.002643,2
8,34.134388,0.422645,0.391049,0.048281,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.977578,0.975785,0.977558,0.970377,0.972172,0.974695,0.002922,3
10,7.115676,0.116252,0.204787,0.021857,,50,"{'max_depth': None, 'n_estimators': 50}",0.975785,0.976682,0.974865,0.970377,0.975763,0.974695,0.002234,3
5,28.150631,0.175317,0.38504,0.007732,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.974888,0.975785,0.976661,0.968582,0.969479,0.97308,0.003364,5


In [46]:
rf = RandomForestClassifier()
param = {'n_estimators' : [10,50,300],
        'max_depth' : [30,60,90,None]}
gs = GridSearchCV(rf , param , cv = 5 , n_jobs= -1)
gs_fit = gs.fit(X_count_feature , data['lable'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score' , ascending= False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,7.299864,0.101086,0.199905,0.012363,90.0,50,"{'max_depth': 90, 'n_estimators': 50}",0.978475,0.979372,0.977558,0.964991,0.974865,0.975054,0.005252,1
8,34.382279,0.143826,0.421511,0.052594,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.976682,0.975785,0.977558,0.970377,0.974865,0.975054,0.002504,1
11,28.08271,2.106032,0.25704,0.030138,,300,"{'max_depth': None, 'n_estimators': 300}",0.976682,0.976682,0.977558,0.969479,0.973968,0.974874,0.002955,3
10,7.92411,0.250899,0.195802,0.010935,,50,"{'max_depth': None, 'n_estimators': 50}",0.979372,0.973991,0.976661,0.970377,0.971275,0.974336,0.003345,4
4,6.052955,0.124377,0.179092,0.019541,60.0,50,"{'max_depth': 60, 'n_estimators': 50}",0.975785,0.973094,0.974865,0.968582,0.975763,0.973618,0.002701,5


# Gradient-Boosting

**Gradient-Boosting:**
Ensambleing learning method that takes in iterative approach to combining weak learners to craet a strong learner by focusing on mistakes of prior itereation

# Building Machine Learning Classifier : Explore Gradient-Boosting model with Grid-Search

In [47]:
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

pd.set_option('display.max_colwidth' ,100)

data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)
data.columns = ['lable' , 'body_text']
# 
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / ( len(text) - text.count(" ") ) , 3) * 100

data['punct%'] = data['body_text'].apply(lambda x : count_punct(x))
data['body_lenght'] = data['body_text'].apply(lambda x : len(x) - x.count(" "))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])  # Remove Punctuation
    tokens = re.split('\W+',text)                                                     # Tokenize 
    text = [ps.stem(word) for word in tokens if word not in stopwords]               # Remove stopwordsand stem
    return text

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_feature = pd.concat([ data['body_lenght'] , data['punct%'] , pd.DataFrame(X_tfidf.toarray()) ] , axis = 1)
X_feature.head()

Unnamed: 0,body_lenght,punct%,0,1,2,3,4,5,6,7,...,8050,8051,8052,8053,8054,8055,8056,8057,8058,8059
0,92,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,39,15.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Explore Gradient-Boosting Classifier Attributes & Hyperparameter

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

In [49]:
print(dir(GradientBoostingClassifier))
print(GradientBoostingClassifier())

['_SUPPORTED_LOSS', '__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_initialized', '_check_params', '_clear_state', '_estimator_type', '_fit_stage', '_fit_stages', '_get_param_names', '_get_tags', '_init_state', '_is_initialized', '_make_estimator', '_raw_predict', '_raw_predict_init', '_required_parameters', '_resize_state', '_staged_raw_predict', '_validate_estimator', '_validate_y', 'apply', 'decision_function', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params', 'staged_decision_function', 'staged_predict', 'staged_p

# Build our owen Grid-search

In [50]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [51]:
X_train , X_test , y_train , y_test = train_test_split(X_feature , data['lable'] , test_size = 0.2)

In [57]:
def train_GB(est , max_depth , lr):
    gb = GradientBoostingClassifier(n_estimators=est , max_depth=max_depth , learning_rate= lr)
    gb_model = gb.fit(X_train , y_train)
    y_pred = gb_model.predict(X_test)
    precision , recall, fscore, support = score(y_test , y_pred , pos_label='spam' ,average='binary')
    print('Est: {} / Depth: {} / Learning Rate: {}  --- Precision = {} Recall = {} Accuracy = {}'.format(
        n_est , depth , lr ,  round(precision , 3) , round(recall , 3) ,  round(recall , 3) ,
        round((y_pred == y_test).sum() / len(y_pred),3)))
    

In [58]:
for n_est in [10,50,100]:
    for depth in [3,7,11,15]:
        for lr in [0.01 , 0.1 , 1]:
            train_GB(n_est , depth , lr)

  'precision', 'predicted', average, warn_for)


Est: 10 / Depth: 3 / Learning Rate: 0.01  --- Precision = 0.0 Recall = 0.0 Accuracy = 0.0
Est: 10 / Depth: 3 / Learning Rate: 0.1  --- Precision = 0.983 Recall = 0.424 Accuracy = 0.424
Est: 10 / Depth: 3 / Learning Rate: 1  --- Precision = 0.923 Recall = 0.691 Accuracy = 0.691


  'precision', 'predicted', average, warn_for)


Est: 10 / Depth: 7 / Learning Rate: 0.01  --- Precision = 0.0 Recall = 0.0 Accuracy = 0.0
Est: 10 / Depth: 7 / Learning Rate: 0.1  --- Precision = 0.978 Recall = 0.655 Accuracy = 0.655
Est: 10 / Depth: 7 / Learning Rate: 1  --- Precision = 0.933 Recall = 0.799 Accuracy = 0.799


  'precision', 'predicted', average, warn_for)


Est: 10 / Depth: 11 / Learning Rate: 0.01  --- Precision = 0.0 Recall = 0.0 Accuracy = 0.0
Est: 10 / Depth: 11 / Learning Rate: 0.1  --- Precision = 0.981 Recall = 0.748 Accuracy = 0.748
Est: 10 / Depth: 11 / Learning Rate: 1  --- Precision = 0.919 Recall = 0.82 Accuracy = 0.82


  'precision', 'predicted', average, warn_for)


Est: 10 / Depth: 15 / Learning Rate: 0.01  --- Precision = 0.0 Recall = 0.0 Accuracy = 0.0
Est: 10 / Depth: 15 / Learning Rate: 0.1  --- Precision = 0.982 Recall = 0.784 Accuracy = 0.784
Est: 10 / Depth: 15 / Learning Rate: 1  --- Precision = 0.885 Recall = 0.827 Accuracy = 0.827
Est: 50 / Depth: 3 / Learning Rate: 0.01  --- Precision = 1.0 Recall = 0.058 Accuracy = 0.058
Est: 50 / Depth: 3 / Learning Rate: 0.1  --- Precision = 0.98 Recall = 0.698 Accuracy = 0.698
Est: 50 / Depth: 3 / Learning Rate: 1  --- Precision = 0.929 Recall = 0.748 Accuracy = 0.748


  'precision', 'predicted', average, warn_for)


Est: 50 / Depth: 7 / Learning Rate: 0.01  --- Precision = 0.0 Recall = 0.0 Accuracy = 0.0
Est: 50 / Depth: 7 / Learning Rate: 0.1  --- Precision = 0.965 Recall = 0.784 Accuracy = 0.784
Est: 50 / Depth: 7 / Learning Rate: 1  --- Precision = 0.927 Recall = 0.82 Accuracy = 0.82
Est: 50 / Depth: 11 / Learning Rate: 0.01  --- Precision = 1.0 Recall = 0.072 Accuracy = 0.072
Est: 50 / Depth: 11 / Learning Rate: 0.1  --- Precision = 0.937 Recall = 0.849 Accuracy = 0.849
Est: 50 / Depth: 11 / Learning Rate: 1  --- Precision = 0.919 Recall = 0.813 Accuracy = 0.813
Est: 50 / Depth: 15 / Learning Rate: 0.01  --- Precision = 1.0 Recall = 0.014 Accuracy = 0.014
Est: 50 / Depth: 15 / Learning Rate: 0.1  --- Precision = 0.951 Recall = 0.842 Accuracy = 0.842
Est: 50 / Depth: 15 / Learning Rate: 1  --- Precision = 0.922 Recall = 0.849 Accuracy = 0.849
Est: 100 / Depth: 3 / Learning Rate: 0.01  --- Precision = 1.0 Recall = 0.353 Accuracy = 0.353
Est: 100 / Depth: 3 / Learning Rate: 0.1  --- Precision = 0

# Explore parameter setting using GradientSearchCV 

In [64]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [67]:
gb = GradientBoostingClassifier()
param = {'n_estimators' : [100,150],
        'max_depth' : [7,11,15],
         'learning_rate' : [0.1]
        }
gs = GridSearchCV(gb , param , cv = 5 , n_jobs = -1)
cv_fit= gs.fit(X_feature , data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score' , ascending= False)[0:5]

KeyError: 'label'

In [66]:
gb = GradientBoostingClassifier()
param = {'n_estimators' : [100,150],
        'max_depth' : [7,11,15],
         'learning_rate' : [0.1]
        }
gs = GridSearchCV(gb , param , cv = 5 , n_jobs = -1)
cv_fit = gs.fit(X_count_feature , data['label'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score' , ascending= False)[0:5]

KeyError: 'label'