In [1]:
import re
import pandas as pd
import numpy as np
import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
df = pd.read_csv("SMSSpamCollection", sep = "\t", names = ['Label','Msg'])
df.head(5)

Unnamed: 0,Label,Msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.shape

(5572, 2)

In [6]:
ps = PorterStemmer()
ls = WordNetLemmatizer()
corpus = []
for i in range(len(df)):
    review = re.sub("^a-zA-Z0-9", " ",df['Msg'][i])
    review = review.lower()
    review = review.split()
    review = [ls.lemmatize(x) for x in review if x not in (stopwords.words('english'))]
    review = " ".join(review)
    corpus.append(review) 

In [7]:
len(corpus)

5572

In [8]:
df["Msg"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
corpus[0]

'go jurong point, crazy.. available bugis n great world la e buffet... cine got amore wat...'

In [10]:
tf = TfidfVectorizer()
x = tf.fit_transform(corpus).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
y = pd.get_dummies(df['Label'])
y = y.iloc[:,1].values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [18]:
x_train, x_test,y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 10)

In [19]:
len(x_train)

3900

In [20]:
len(x_test)

1672

In [21]:
len(y_train)

3900

In [22]:
len(y_test)

1672

In [23]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred = rf.predict(x_test)

In [25]:
accuracy_score (y_test,y_pred)

0.9629186602870813

In [46]:
class Evaluation:
    
    def __init__(self,model,x_train,x_test,y_train,y_test):
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model
        
    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)
        
        acc_scr_train = accuracy_score(self.y_train,y_pred_train )
        print("Accuracy Score On Training Data Set:",acc_scr_train)
        print()
        
        con_mat_train = confusion_matrix(self.y_train,y_pred_train )
        print("Confusion Matrix On Training Data Set:\n",con_mat_train)
        print()
        
        class_rep_train = classification_report(self.y_train,y_pred_train )
        print("Classification Report On Training Data Set:\n",class_rep_train)
        print()
        
        
    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)
        
        acc_scr_test = accuracy_score(self.y_test,y_pred_test )
        print("Accuracy Score On Testing Data Set:",acc_scr_test)
        print()
        
        con_mat_test = confusion_matrix(self.y_test,y_pred_test )
        print("Confusion Matrix On Testing Data Set:\n",con_mat_test)
        print()
        
        class_rep_test = classification_report(self.y_test,y_pred_test )
        print("Classification Report On Testing Data Set:\n",class_rep_test)
        print()

In [48]:
rf_obj = Evaluation(rf,x_train,x_test,y_train,y_test)

In [49]:
rf_obj.test_evaluation()

Accuracy Score On Testing Data Set: 0.9629186602870813

Confusion Matrix On Testing Data Set:
 [[1456    1]
 [  61  154]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1457
           1       0.99      0.72      0.83       215

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672




In [50]:
Evaluation(rf,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set: 0.9629186602870813

Confusion Matrix On Testing Data Set:
 [[1456    1]
 [  61  154]]

Classification Report On Testing Data Set:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1457
           1       0.99      0.72      0.83       215

    accuracy                           0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672




In [51]:
Evaluation(rf,x_train,x_test,y_train,y_test).train_evaluation()

Accuracy Score On Training Data Set: 0.9982051282051282

Confusion Matrix On Training Data Set:
 [[3368    0]
 [   7  525]]

Classification Report On Training Data Set:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3368
           1       1.00      0.99      0.99       532

    accuracy                           1.00      3900
   macro avg       1.00      0.99      1.00      3900
weighted avg       1.00      1.00      1.00      3900


