In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['Label', "Msg"])
df.head()

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
ps = PorterStemmer()

In [5]:
corpus = []
for i in range (0,len(df['Msg'])):
    review = re.sub("^a-zA-Z0-9", " ", df['Msg'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(x) for x in review if x not in (stopwords.words('english'))]
    review = " ".join(review)
    corpus.append(review)
len(corpus)

5572

In [6]:
df["Msg"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [7]:
corpus[0]

'go jurong point, crazy.. avail bugi n great world la e buffet... cine got amor wat...'

In [8]:
tf = TfidfVectorizer()
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
y=pd.get_dummies(df["Label"])
y.iloc[:,1].values

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=30,test_size=0.3)

In [13]:
len(x_train),len(y_train)

(3900, 3900)

In [14]:
len(x_test), len(y_test)

(1672, 1672)

## RandomForest

In [15]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Model Evaluation with RF on Testing Dataset

In [16]:
y_pred = rf.predict(x_test)
acc_score = accuracy_score(y_test,y_pred)
print("Accuracy_Score:",acc_score)
print()
con_mat = confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
print("confusion_matrix:\n",con_mat)
print()
cr = classification_report(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
print("classification_report:\n",cr)

Accuracy_Score: 0.9688995215311005

confusion_matrix:
 [[1455    4]
 [  45  168]]

classification_report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1459
           1       0.98      0.79      0.87       213

    accuracy                           0.97      1672
   macro avg       0.97      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672



### Model Evaluation with RF on Training Dataset

In [17]:
y_pred_train = rf.predict(x_train)
acc_score = accuracy_score(y_train,y_pred_train)
print("Accuracy_Score:",acc_score)
print()
con_mat = confusion_matrix(y_train.values.argmax(axis=1), y_pred_train.argmax(axis=1))
print("confusion_matrix:\n",con_mat)
print()
cr = classification_report(y_train.values.argmax(axis=1), y_pred_train.argmax(axis=1))
print("classification_report:\n",cr)

Accuracy_Score: 0.9976923076923077

confusion_matrix:
 [[3366    0]
 [   9  525]]

classification_report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3366
           1       1.00      0.98      0.99       534

    accuracy                           1.00      3900
   macro avg       1.00      0.99      1.00      3900
weighted avg       1.00      1.00      1.00      3900

