# Building Machine Learning Classifiers

## Basic Random Forest Model

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...


In [5]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
2,ham,Even my brother is not like to speak with me. ...,62,3.2
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.4


**TfidVectorizer** means a document term matrix where each cell is a weight of how important that word is, by measuring how frequently it occurs within that text message, relative to how frequently that word occurs across all other text messages. 

In [7]:
def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# Initialize TfidfVectorizer with the custom analyzer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Create the DataFrame with additional features
X_features = pd.concat([
    data['body_len'],
    data['punct%'],
    pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())
], axis=1)

# Display the first few rows of the DataFrame
X_features.head()

Unnamed: 0,body_len,punct%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Explore RandomForestClassifier Attributes & Hyperparameters

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
print(dir(RandomForestClassifier))
print()
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_n_features', '_compute_oob_predictions', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_estimator_type', '_get_default_requests', '_get_doc_link', '_get_estimators_indices', '_get_metadata_request', '_get_oob_predictions', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_parameter_constraints', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score_and_

### Explore RandomForestClassifier through Cross-Validation

In [12]:
from sklearn.model_selection import KFold, cross_val_score

In [13]:
rf = RandomForestClassifier(n_jobs = -1) # Setting it to negative one basically just allows this to run faster 
                                         # by building the individual decision trees in parallel

k_fold = KFold(n_splits= 5)

cross_val_score(
    rf,                   # The classifier or regressor to evaluate, in this case, a RandomForestClassifier instance.
    X_features,           # The feature matrix (DataFrame) containing the input data for the model.
    data['label'],        # The target variable (series) containing the true labels for each sample.
    cv = k_fold,          # The cross-validation splitting strategy, here defined by the KFold object.
    scoring = 'accuracy', # The metric to evaluate the model's performance, which is accuracy for classification tasks.
    n_jobs = -1           # Number of parallel jobs to run; -1 means using all available CPUs.
)

array([0.97576302, 0.97755835, 0.97394429, 0.96495957, 0.97484277])

### Explore RandomForestClassifier through Holdout Set

In [15]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [17]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators = 50,  # The number of trees in the forest. More trees generally improve performance but also increase computation time.
    max_depth = 20,     # The maximum depth of each tree in the forest. Limiting the depth helps prevent overfitting.
    n_jobs = -1         # The number of parallel jobs to run; -1 means using all available CPUs for faster computation.
)

rf_model = rf.fit(X_train,y_train)    # Fit the RandomForestClassifier model to the training data

In [18]:
# Get the feature importances from the trained model
# Sort the features by importance in descending order
# Display the top 10 features with the highest importance

sorted(zip(rf_model.feature_importances_,X_train.columns), reverse = True)[0:10]

[(0.05127839595044938, 'body_len'),
 (0.047101365959513315, 'txt'),
 (0.03395492552130747, 'call'),
 (0.033138926094627756, 'free'),
 (0.03192226678522689, 'claim'),
 (0.024781315571949247, 'stop'),
 (0.022608666285628055, 'mobil'),
 (0.021689809100748362, 'servic'),
 (0.01868831131828509, 'prize'),
 (0.018492119055469, 'urgent')]

In [19]:
# Predict the labels for the test set using the trained model
y_pred = rf_model.predict(X_test)

# Calculate precision, recall, F1 score, and support for the 'spam' class
precision, recall, fscore, support = score(
    y_test,            # True labels for the test set
    y_pred,            # Predicted labels by the model
    pos_label='spam',  # Positive label in binary classification
    average='binary'   # Compute metrics for binary classification
)

In [20]:
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3),        # Round precision to 3 decimal places
    round(recall, 3),           # Round recall to 3 decimal places
    round((y_pred == y_test).sum() / len(y_pred), 3)  # Round accuracy to 3 decimal places
))

Precision: 1.0 / Recall: 0.59 / Accuracy: 0.947


So our precision is 100%, our recall is 52%, and then our accuracy is 93.6%. So just as a reminder of what that actually means in the context of a spam filter, **100% precision**, what that actually means is that when the model identified something as spam, it actually was spam 100% of the time. So that's great. The **52% recall** means that of all the spam that has come into your email, 52% of that spam was properly placed in the spam folder, which means that the other 48% went into your inbox, so that's not great. And lastly, the **93.6%** accuracy just means that of all the emails that came into your email, spam or non-spam, they were identified correctly as one or the other, 93.6% of the time. 

So in summary, the amount of spam still making it to our inbox, tells us that our model's not quite aggressive enough in identifying spam. 

### Explore RandomForestClassifier with grid-search

**Grid-search:** Exhaustively search all parameter combinations in a given grid to determine the best model.

Grid-search basically means defining a grid of hyperparameter settings, and then exploring a model fit with each combination of those hyperparameter settings.

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [24]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators = n_est, 
                                max_depth = depth,
                                n_jobs = -1)
    
    rf_model = rf.fit(X_train,y_train)   # Fit the model on the training data
    y_pred = rf_model.predict(X_test)    # Predict the labels for the test data
    precision, recall, fscore, support = score(
                                                y_test,            # True labels for the test set
                                                y_pred,            # Predicted labels by the model
                                                pos_label='spam',  # Positive label in binary classification
                                                average='binary'   # Compute metrics for binary classification
                                                )
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
                                                                                    n_est,
                                                                                    depth,
                                                                                    round(precision, 3),
                                                                                    round(recall, 3),
                                                                                    round((y_pred == y_test).sum() / len(y_pred), 3)
                                                                                     ))

In [25]:
for n_est in [10, 50, 100]:
    print()
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)


Est: 10 / Depth: 10 ---- Precision: 1.0 / Recall: 0.245 / Accuracy: 0.906
Est: 10 / Depth: 20 ---- Precision: 1.0 / Recall: 0.568 / Accuracy: 0.946
Est: 10 / Depth: 30 ---- Precision: 1.0 / Recall: 0.64 / Accuracy: 0.955
Est: 10 / Depth: None ---- Precision: 0.981 / Recall: 0.741 / Accuracy: 0.966

Est: 50 / Depth: 10 ---- Precision: 1.0 / Recall: 0.187 / Accuracy: 0.899
Est: 50 / Depth: 20 ---- Precision: 1.0 / Recall: 0.59 / Accuracy: 0.949
Est: 50 / Depth: 30 ---- Precision: 1.0 / Recall: 0.712 / Accuracy: 0.964
Est: 50 / Depth: None ---- Precision: 0.991 / Recall: 0.763 / Accuracy: 0.969

Est: 100 / Depth: 10 ---- Precision: 1.0 / Recall: 0.201 / Accuracy: 0.9
Est: 100 / Depth: 20 ---- Precision: 1.0 / Recall: 0.583 / Accuracy: 0.948
Est: 100 / Depth: 30 ---- Precision: 1.0 / Recall: 0.676 / Accuracy: 0.96
Est: 100 / Depth: None ---- Precision: 0.991 / Recall: 0.777 / Accuracy: 0.971


 So in this example, as the depth increases from 10, to 20, to 30, and eventually to none, the recall increases quite drastically, while you see the precision doesn't really drop. So the model is getting much better and more aggressive as the depth increases.

If **max_depth** is set to *None*, it means that the trees in the forest are allowed to grow until they contain only one sample per leaf, or until other stopping criteria are met (like the minimum number of samples required to split a node).

# Evaluate Random Forest with GridSearchCV

**Cross-validation:** Divide a dataset into k subsets and repeat the holdout method k times where a different subset is used as the houdout set in each iteration.

In [28]:
# Importing both vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([
                        data['body_len'],
                        data['punct%'],
                        pd.DataFrame(X_tfidf.toarray())
                         ], axis=1)
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)    # Ensure all column names are strings

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([
                        data['body_len'],
                        data['punct%'],
                        pd.DataFrame(X_count.toarray())
                         ], axis=1) 
X_count_feat.columns = X_count_feat.columns.astype(str)   # Convert all column names to strings

We're doing this because this framework will allow us to test which of these vectorizing frameworks works better.

In [30]:
X_tfidf_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Exploring parameter settings using GridSearchCV

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [34]:
# Create a basic RandomForestClassifier model -------------- TF-IDF
rf = RandomForestClassifier()

# Define a dictionary with hyperparameters to tune:
param = {'n_estimators': [10, 150, 300],   # number of trees in the forest.
         'max_depth': [30, 60, 90, None]}  # maximum depth of each tree. 'None' means nodes are expanded until all leaves are pure

# Initialize GridSearchCV with the following parameters:
gs = GridSearchCV(
                 rf,             # The model to be tuned   
                 param,          # The grid of hyperparameters
                 cv=5,           # 5-fold cross-validation
                 n_jobs=-1       # Use all available cores for parallel processing
                 )

# Fit the GridSearchCV object to the data to find the best combination of parameters
gs_fit = gs.fit(X_tfidf_feat, data['label'])

# Convert the results of GridSearchCV into a DataFrame
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',    # Sort by 'mean_test_score' in descending order to find the best-performing models
                                             ascending=False)[0:5] # Display the top 5 results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,18.430931,1.009646,0.164938,0.014989,,150,"{'max_depth': None, 'n_estimators': 150}",0.977558,0.978456,0.974843,0.969452,0.975741,0.97521,0.00315,1
7,16.792181,2.527809,0.242927,0.1011,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.978456,0.975763,0.973046,0.969452,0.975741,0.974492,0.003046,2
8,26.188715,2.284023,0.193822,0.013693,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.976661,0.978456,0.973046,0.966757,0.971249,0.973234,0.004122,3
11,22.145766,1.06139,0.163194,0.021852,,300,"{'max_depth': None, 'n_estimators': 300}",0.976661,0.973968,0.974843,0.967655,0.97035,0.972695,0.003251,4
5,22.627508,2.001571,0.203472,0.01878,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.974865,0.974865,0.972147,0.965858,0.971249,0.971797,0.003302,5


So, **mean_fit_time** is the average time it takes each model to fit, **mean_score_time** is the average amount of time it takes each model to make a prediction on the test set, **mean_test_score** is the average accuracy on the test set, and then **mean_train_score** is the average accuracy on the training set. So in terms of parameter combinations, you'll notice that the best performing models are the ones with the deepest individual decision trees. So we have max_depth to 90, no max_depth, no max_depth, and max_depth to 90.

In [36]:
# Create a basic RandomForestClassifier model -------------- CountVectorizer
rf = RandomForestClassifier()

# Define a dictionary with hyperparameters to tune:
param = {'n_estimators': [10, 150, 300],   # number of trees in the forest.
         'max_depth': [30, 60, 90, None]}  # maximum depth of each tree. 'None' means nodes are expanded until all leaves are pure

# Initialize GridSearchCV with the following parameters:
gs = GridSearchCV(
                 rf,             # The model to be tuned   
                 param,          # The grid of hyperparameters
                 cv=5,           # 5-fold cross-validation
                 n_jobs=-1       # Use all available cores for parallel processing
                 )

# Fit the GridSearchCV object to the data to find the best combination of parameters
gs_fit = gs.fit(X_count_feat, data['label'])

# Convert the results of GridSearchCV into a DataFrame
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',    # Sort by 'mean_test_score' in descending order to find the best-performing models
                                             ascending=False)[0:5] # Display the top 5 results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,16.137917,0.779453,0.250272,0.085463,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.976661,0.972172,0.975741,0.967655,0.97035,0.972516,0.003347,1
10,16.097979,1.099049,0.195794,0.0205,,150,"{'max_depth': None, 'n_estimators': 150}",0.977558,0.97307,0.972147,0.965858,0.973046,0.972336,0.00375,2
11,21.483924,0.445812,0.160418,0.019236,,300,"{'max_depth': None, 'n_estimators': 300}",0.975763,0.975763,0.973944,0.967655,0.968553,0.972336,0.00353,3
8,26.92372,0.258614,0.208501,0.02029,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.976661,0.97307,0.973046,0.966757,0.97035,0.971977,0.003292,4
3,2.055342,0.03241,0.134441,0.012076,60.0,10,"{'max_depth': 60, 'n_estimators': 10}",0.975763,0.97307,0.969452,0.965858,0.969452,0.970719,0.0034,5


# Gradient Boosting

<center>
    <img src="images/rfandgb.png" width="900" />
</center>

So with that, why would you go with gradient boosting? Well, **the trade off is that gradient boosting is typically more powerful and better-performing if tuned properly**.

In [40]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...
1,ham,"Nah I don't think he goes to usf, he lives aro..."
2,ham,Even my brother is not like to speak with me. ...
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnamin...


In [41]:
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
2,ham,Even my brother is not like to speak with me. ...,62,3.2
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.4


In [42]:
def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

# Initialize TfidfVectorizer with the custom analyzer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

# Create the DataFrame with additional features
X_features = pd.concat([
    data['body_len'],
    data['punct%'],
    pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())
], axis=1)

# Display the first few rows of the DataFrame
X_features.head()

Unnamed: 0,body_len,punct%,Unnamed: 3,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Explore GradientBoostingClassifier Attributes & Hyperparameters

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
print(dir(GradientBoostingClassifier))
print()
print(GradientBoostingClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__sklearn_clone__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_build_request_for_signature', '_check_feature_names', '_check_initialized', '_check_n_features', '_clear_state', '_compute_partial_dependence_recursion', '_doc_link_module', '_doc_link_template', '_doc_link_url_param_generator', '_encode_y', '_estimator_type', '_fit_stage', '_fit_stages', '_get_default_requests', '_get_doc_link', '_get_loss', '_get_metadata_request', '_get_param_names', '_get_tags', '_init_state', '_is_fitted', '_make_estimator', '_more_tags', '_parameter_constraints', '_raw_predict', 

### Build our own Grid-search

In [47]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_features, data['label'], test_size=0.2)

In [49]:
def train_GB(est, max_depth, lr):
    
    # Initialize the Gradient Boosting Classifier with the given parameters
    gb = GradientBoostingClassifier(
        n_estimators = est,     # Number of boosting stages
        max_depth = max_depth,  # Maximum depth of the individual trees
        learning_rate = lr      # Learning rate shrinks the contribution of each tree
    )
    gb_model = gb.fit(X_train, y_train)  # Fit the model to the training data
    
    y_pred = gb_model.predict(X_test)    # Predict the labels for the test data

    precision, recall, fscore, support = score(
                                                y_test,            # True labels for the test set
                                                y_pred,            # Predicted labels by the model
                                                pos_label='spam',  # Positive label in binary classification
                                                average='binary'   # Compute metrics for binary classification
                                                )
    print('Est: {} / Depth: {} / LR: {}---- Precision: {} / Recall: {} / Accuracy: {}'.format(
                                                                                    est,
                                                                                    max_depth,
                                                                                    lr,
                                                                                    round(precision, 3),
                                                                                    round(recall, 3),
                                                                                    round((y_pred == y_test).sum() / len(y_pred), 3)
                                                                                     ))

In [50]:
for n_est in [50, 100, 150]:                  # Loop over different values for the number of estimators
    print()
    for max_depth in [3, 7, 11, 15]:          # Loop over different values for the maximum depth of the trees
        for lr in [0.01, 0.1, 1]:             # Loop over different learning rates
            train_GB(n_est, max_depth, lr)    # Call the function to train the Gradient Boosting model with the current parameters
print('END')


Est: 50 / Depth: 3 / LR: 0.01---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.863
Est: 50 / Depth: 3 / LR: 0.1---- Precision: 0.972 / Recall: 0.673 / Accuracy: 0.952
Est: 50 / Depth: 3 / LR: 1---- Precision: 0.852 / Recall: 0.791 / Accuracy: 0.952
Est: 50 / Depth: 7 / LR: 0.01---- Precision: 1.0 / Recall: 0.02 / Accuracy: 0.865
Est: 50 / Depth: 7 / LR: 0.1---- Precision: 0.913 / Recall: 0.758 / Accuracy: 0.957
Est: 50 / Depth: 7 / LR: 1---- Precision: 0.892 / Recall: 0.81 / Accuracy: 0.961
Est: 50 / Depth: 11 / LR: 0.01---- Precision: 1.0 / Recall: 0.007 / Accuracy: 0.864
Est: 50 / Depth: 11 / LR: 0.1---- Precision: 0.926 / Recall: 0.732 / Accuracy: 0.955
Est: 50 / Depth: 11 / LR: 1---- Precision: 0.917 / Recall: 0.791 / Accuracy: 0.961
Est: 50 / Depth: 15 / LR: 0.01---- Precision: 1.0 / Recall: 0.013 / Accuracy: 0.864
Est: 50 / Depth: 15 / LR: 0.1---- Precision: 0.911 / Recall: 0.739 / Accuracy: 0.954
Est: 50 / Depth: 15 / LR: 1---- Precision: 0.938 / Recall: 0.791 / Accuracy: 0.964



Let's take a look at some of the best models. Based on the results here, all of the best models had a learning rate of 0.1. Now we can draw that distinction, that that instruction to the model, of whether the learning rate is 0.01, or it's 0.1, is making a big difference in terms of the results of the model. For this problem, it appears that this learning rate of 0.1 is ideal. You'll also note that the estimators and the max depth are on the high end of the ranges that we tested out. 

# Building Machine Learning Classifiers: Evaluate Gradient Boosting with GridSearchCV

**Cross-validation:** Divide a dataset into k subsets and repeat the holdout method k times where a different subset is used as the houdout set in each iteration.

In [53]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

# --------------------------------------------
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text
# --------------------------------------------

# TF-IDF
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])
X_tfidf_feat = pd.concat([
                        data['body_len'],
                        data['punct%'],
                        pd.DataFrame(X_tfidf.toarray())
                         ], axis=1)
X_tfidf_feat.columns = X_tfidf_feat.columns.astype(str)    # Ensure all column names are strings

# CountVectorizer
count_vect = CountVectorizer(analyzer=clean_text)
X_count = count_vect.fit_transform(data['body_text'])
X_count_feat = pd.concat([
                        data['body_len'],
                        data['punct%'],
                        pd.DataFrame(X_count.toarray())
                         ], axis=1) 

X_count_feat.columns = X_count_feat.columns.astype(str)   # Convert all column names to strings

X_count_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49,4.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,62,3.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,28,7.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,135,4.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Exploring parameter settings using GridSearchCV

In [55]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [56]:
# TF-IDF

gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150],  # Number of boosting stages
    'max_depth' : [7, 11, 15],   # Maximum depth of the individual trees
    'learning_rate' : [0.1]      # The default is 0.1, so it's optional
}

# Initialize GridSearchCV with the following parameters:
gs = GridSearchCV(
                 gb,             # The model to be tuned   
                 param,          # The grid of hyperparameters
                 cv=5,           # 5-fold cross-validation
                 n_jobs=-1       # We'll train models on different subsets and parameter settings in parallel
                 )

# Fit the GridSearchCV object to the data to find the best combination of parameters
cv_fit = gs.fit(X_tfidf_feat, data['label'])

# Convert the results of GridSearchCV into a DataFrame
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score',    # Sort by 'mean_test_score' in descending order to find the best-performing models
                                             ascending=False)[0:5] # Display the top 5 results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,205.29683,0.991675,0.117536,0.007667,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.964991,0.976661,0.971249,0.969452,0.966757,0.969822,0.004042,1
1,173.940286,0.836431,0.197344,0.029401,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.964991,0.979354,0.971249,0.966757,0.966757,0.969821,0.005197,2
3,244.594679,1.019254,0.188327,0.010996,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964991,0.979354,0.967655,0.968553,0.967655,0.969642,0.005001,3
0,112.311198,0.752044,0.294007,0.077784,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.9614,0.978456,0.971249,0.964061,0.968553,0.968744,0.005941,4
4,176.769086,1.053062,0.161939,0.021501,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.964093,0.975763,0.969452,0.965858,0.968553,0.968744,0.003994,4


In [57]:
# CountVectorizer

gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150],  # Number of boosting stages
    'max_depth' : [7, 11, 15],   # Maximum depth of the individual trees
    'learning_rate' : [0.1]      # The default is 0.1, so it's optional
}

# Initialize GridSearchCV with the following parameters:
gs = GridSearchCV(
                 gb,             # The model to be tuned   
                 param,          # The grid of hyperparameters
                 cv=5,           # 5-fold cross-validation
                 n_jobs=-1       # We'll train models on different subsets and parameter settings in parallel
                 )

# Fit the GridSearchCV object to the data to find the best combination of parameters
cvc_fit = gs.fit(X_count_feat, data['label'])

# Convert the results of GridSearchCV into a DataFrame
pd.DataFrame(cvc_fit.cv_results_).sort_values('mean_test_score',    # Sort by 'mean_test_score' in descending order to find the best-performing models
                                             ascending=False)[0:5]  # Display the top 5 results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,196.260656,1.235784,0.119621,0.009106,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.963196,0.979354,0.968553,0.966757,0.971249,0.969822,0.005437,1
3,236.49236,1.770359,0.204376,0.023957,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964093,0.977558,0.97035,0.96496,0.971249,0.969642,0.004868,2
1,166.323526,2.198403,0.220619,0.039965,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.964991,0.979354,0.967655,0.96496,0.967655,0.968923,0.005351,3
4,172.776075,0.74879,0.176183,0.017083,0.1,15,100,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.963196,0.975763,0.966757,0.966757,0.971249,0.968744,0.004341,4
2,162.778714,1.437996,0.241345,0.054011,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.964991,0.975763,0.966757,0.964061,0.969452,0.968205,0.004202,5


### Model Selection: Data Preparation

In [59]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("data/SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']


def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
1,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1
2,ham,Even my brother is not like to speak with me. ...,62,3.2
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,28,7.1
4,ham,As per your request 'Melle Melle (Oru Minnamin...,135,4.4


#### Split into train/test

In [61]:
# splitting the data first before vectorizing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data['label'], test_size=0.2)

#### Vectorize text

In [63]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)  # Initialize TfidfVectorizer with custom text cleaning function 'clean_text'.
tfidf_vect = tfidf_vect.fit(X_train['body_text'])  # Fit the vectorizer on the training data to learn the vocabulary.

tfidf_train = tfidf_vect.fit_transform(X_train['body_text'])  # Fit and transform the training data into TF-IDF features.
tfidf_test = tfidf_vect.fit_transform(X_test['body_text'])    # Fit and transform the test data into TF-IDF features.

X_train_vect = pd.concat([  # Combine the original features and TF-IDF features.
                    X_train[['body_len', 'punct%']].reset_index(drop=True), # Select 'body_len' and 'punct%' columns from the training set.
                    pd.DataFrame(tfidf_train.toarray())                     # Convert TF-IDF features to a DataFrame and combine them w/ original feat.
                    ], axis=1)                                              # Concatenate along the columns.

X_test_vect = pd.concat([  # Combine the original features and TF-IDF features.
                    X_test[['body_len', 'punct%']].reset_index(drop=True), # Select 'body_len' and 'punct%' columns from the testing set.
                    pd.DataFrame(tfidf_test.toarray())                     # Convert TF-IDF features to a DataFrame and combine them w/ original feat.
                    ], axis=1)                                             # Concatenate along the columns.

X_train_vect.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,7109,7110,7111,7112,7113,7114,7115,7116,7117,7118
0,124,5.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22,27.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20,15.0,0.355565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,30,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


`tfidf_train` and `tfidf_test` will have the same number of columns because they're both transformed using this TFIDF vect underscore fit that was trained on the training set so it only recognizes words in the training set and can only create columns for words from the training set.

#### Final evaluation of models

In [66]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

In [67]:
# Initialize TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)

# Fit and transform training data
tfidf_train = tfidf_vect.fit_transform(X_train['body_text'])

# Transform test data based on the fitted vectorizer
tfidf_test = tfidf_vect.transform(X_test['body_text'])

# Concatenate features for training and test sets
X_train_vect = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop=True), 
                          pd.DataFrame(tfidf_train.toarray())], axis=1)
X_test_vect = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop=True), 
                         pd.DataFrame(tfidf_test.toarray())], axis=1)

# Ensure all column names are strings
X_train_vect.columns = X_train_vect.columns.astype(str)
X_test_vect.columns = X_test_vect.columns.astype(str)

# Initialize RandomForestClassifier with specified parameters
rf = RandomForestClassifier(
    n_estimators=150,  # Number of trees in the forest
    max_depth=None,    # Maximum depth of the trees
    n_jobs=-1          # Number of parallel jobs to run
)

# Measure the time taken to fit the RandomForest model
start = time.time()                       # Record the start time
rf_model = rf.fit(X_train_vect, y_train)  # Fit the model to the training data
end = time.time()                         # Record the end time
fit_time = (end - start)                  # Calculate the duration of the fitting process in seconds

# Measure the time taken to make predictions
start = time.time()                      # Record the start time
y_pred = rf_model.predict(X_test_vect)  # Predict the labels for the test set
end = time.time()                       # Record the end time
pred_time = (end - start)               # Calculate the duration of the prediction process in seconds

# Compute evaluation metrics: precision, recall, and F1 score
precision, recall, fscore, support = score(
    y_test,              # True labels for the test data
    y_pred,              # Predicted labels for the test data
    pos_label='spam',    # Label considered as the positive class for evaluation
    average='binary'     # Type of averaging for the metrics
)

print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

# precision: Proportion of true positive predictions among all positive predictions.
# recall: Proportion of true positive predictions among all actual positives.
# fscore: Harmonic mean of precision and recall, providing a single score to evaluate the model's performance.
# train_support: Number of occurrences of each label in the test set, useful for understanding class distribution.

Fit time: 0.79 / Predict time: 0.068 ---- Precision: 1.0 / Recall: 0.838 / Accuracy: 0.978


In [68]:
# Initialize GradientBoostingClassifier with specified parameters
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

# Measure the time taken to fit the GradientBoosting model
start = time.time()
gb_model = gb.fit(X_train_vect, y_train)  # Fit the model to the training data
end = time.time()
fit_time = (end - start)

# Measure the time taken to make predictions
start = time.time()
y_pred = gb_model.predict(X_test_vect)    # Predict the labels for the test set
end = time.time()
pred_time = (end - start)

# Compute evaluation metrics: precision, recall, and F1 score
precision, recall, fscore, support = score(
    y_test,              # True labels for the test data
    y_pred,              # Predicted labels for the test data
    pos_label='spam',    # Label considered as the positive class for evaluation
    average='binary'     # Type of averaging for the metrics
)

print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

# precision: Proportion of true positive predictions among all positive predictions.
# recall: Proportion of true positive predictions among all actual positives.
# fscore: Harmonic mean of precision and recall, providing a single score to evaluate the model's performance.
# train_support: Number of occurrences of each label in the test set, useful for understanding class distribution.

Fit time: 95.783 / Predict time: 0.083 ---- Precision: 0.9 / Recall: 0.851 / Accuracy: 0.968
