<a href="https://colab.research.google.com/github/srijit43/NLP-theory-and-code/blob/main/05_05_Building_Random_Forest_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Machine Learning Classifiers: Random Forest on a holdout test set

### Read in & clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv("SMSSpamCollection.tsv", sep='\t')
data.columns = ['label', 'body_text']

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)*100

data['body_len'] = data['body_text'].apply(lambda x: len(x) - x.count(" "))
data['punct%'] = data['body_text'].apply(lambda x: count_punct(x))

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text'])

X_features = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf.toarray())], axis=1)
X_features.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8094,8095,8096,8097,8098,8099,8100,8101,8102,8103
0,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28,7.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,135,4.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


###Adjustment

In [2]:
X_features= X_features.rename(str,axis="columns")

### Explore RandomForestClassifier through Holdout Set

In [3]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

###Doing the test train split

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X_features,data['label'],test_size=0.2)

###Importing dependencies to apply Random Forest Model

In [5]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
rf_model = rf.fit(X_train,y_train)

###Let us see feature importances

In [7]:
sorted(zip(rf_model.feature_importances_,X_train.columns),reverse=True)[0:10]  #top 10 columns of max importance

[(0.06155577025454342, 'body_len'),
 (0.05501751585757371, '1803'),
 (0.037773229797567205, '3134'),
 (0.033127881513997696, '7350'),
 (0.02433471425675886, '6285'),
 (0.02230842529996148, '5724'),
 (0.01977935663119112, '4796'),
 (0.018170198434453423, '7218'),
 (0.015757948837424506, '7782'),
 (0.014949901357466037, '2031')]

###Prediction

In [9]:
y_pred = rf_model.predict(X_test)
model_precision,model_recall,model_fscore,model_support = score(y_test,y_pred,pos_label='spam',average='binary')  #4 scores are achieved, pos label indicates what we are classifying

Formatting in a pretty way

In [12]:
print("Precision {}, Recall {}, Accuracy {}".format(round(model_precision,3),round(model_recall,3),round((y_pred == y_test).sum()/len(y_pred),3)))

Precision 1.0, Recall 0.64, Accuracy 0.955
