In [1]:
## Importing relevant libraries
import pandas as pd
import nltk
import re

In [2]:
## Reading the SPAM dataset
texts = pd.read_csv('spam.csv',usecols=[0,1],header=0, encoding='latin-1', names=['labels','message'])
texts.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
texts.shape

(5572, 2)

In [4]:
## checking NULL value presence
texts.isnull().sum()

labels     0
message    0
dtype: int64

In [5]:
## creating a checkpoint
texts_copy = texts.copy()

### Cleaning the Text

In [6]:
## Download 'stopwords' and 'wordnet' if not previously downloaded.
# nltk.download('stopwords')
# nltk.download('wordnet')

## Importing stopwords to eliminate frequently occurring non-prominent words
## Importing WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
for i in range(len(texts)):
    texts['message'][i] = re.sub('[^a-zA-Z]',' ', texts['message'][i]) 
    ## every character except a-z and A-Z is removed. Returned statement is a string.
    texts['message'][i] = texts['message'][i].lower()
    ## lowers every character so that during Vectorization, same words are not counted multiple times.
    texts['message'][i] = texts['message'][i].split()
    ## the string is splitted into a list so that for loop can be applied.
    texts['message'][i] = [lemmatizer.lemmatize(word) for word in texts['message'][i] if word not in
                          set(stopwords.words('english'))]
    ## post lemmatization, a list is returned.
    texts['message'][i] = ' '.join(texts['message'][i])
    ## the list is transformed back into a space separated string.
    
    
   

In [9]:
texts.head()

Unnamed: 0,labels,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


### Encoding the labels feature

In [10]:
texts['labels'] = texts['labels'].replace(('ham','spam'),(0,1))
texts.head()

Unnamed: 0,labels,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts st ...
3,0,u dun say early hor u c already say
4,0,nah think go usf life around though


### Splitting the dataset

In [11]:
from sklearn.model_selection import train_test_split 

In [12]:
x = texts['message']
y = texts['labels']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.2, random_state = 355)

### Creating the TF-IDF model

#### Vectorizing the text data

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
Tf_Idf = TfidfVectorizer()
x_train = Tf_Idf.fit_transform(x_train)
x_test = Tf_Idf.transform(x_test)

In [15]:
x_train.shape

(4457, 6273)

### Checking and Dealing with IMBALANCE in the training dataset

In [16]:
print(y_train.value_counts())
print(y_train.value_counts()/y_train.shape[0])

0    3864
1     593
Name: labels, dtype: int64
0    0.866951
1    0.133049
Name: labels, dtype: float64


In [17]:
## target feature of the training dataset is highly IMBALANCED in the ratio 87:13
## Hence, we undersample it. 

In [18]:
from imblearn.under_sampling import RandomUnderSampler  
sm = RandomUnderSampler() 
x_train_resampled, y_train_resampled = sm.fit_sample(x_train, y_train.values.ravel())

In [19]:
## y_train_resampled is an ndarray. Hence, we need to convert it back to a dataframe.

In [20]:
y_train_resampled = pd.DataFrame(y_train_resampled)
y_train_resampled.head(2)

Unnamed: 0,0
0,0
1,0


In [21]:
print(y_train_resampled[0].value_counts())
print(y_train_resampled[0].value_counts()/y_train_resampled.shape[0])

1    593
0    593
Name: 0, dtype: int64
1    0.5
0    0.5
Name: 0, dtype: float64


In [22]:
## Now the target feature is BALANCED.

### Training the model : LogisticRegression

In [23]:
from sklearn.linear_model import LogisticRegression
spam_classifier_logistic = LogisticRegression()
spam_classifier_logistic.fit(x_train_resampled, y_train_resampled)

  return f(**kwargs)


LogisticRegression()

In [24]:
pred_logistic = spam_classifier_logistic.predict(x_test)

In [25]:
print("Training Accuracy : ", spam_classifier_logistic.score(x_train_resampled, y_train_resampled) )
print("Test Accuracy : ", spam_classifier_logistic.score(x_test, y_test) )

Training Accuracy :  0.9713322091062394
Test Accuracy :  0.9713004484304932


In [26]:
## Checking performance report
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(pred_logistic, y_test)
cr = classification_report(pred_logistic, y_test)
print("confusion_matrix : \n")
print(cm)
print("\n")
print("classification_report : \n")
print(cr)

confusion_matrix : 

[[942  13]
 [ 19 141]]


classification_report : 

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       955
           1       0.92      0.88      0.90       160

    accuracy                           0.97      1115
   macro avg       0.95      0.93      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [27]:
## checking Cross validation score

from sklearn.model_selection import cross_val_score

scores_logistic = cross_val_score(spam_classifier_logistic, x_train_resampled, y_train_resampled, cv=10)
print(scores_logistic)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


[0.94117647 0.93277311 0.94957983 0.94117647 0.92436975 0.91596639
 0.93220339 0.94067797 0.89830508 0.98305085]


### Training the model : Random Forest Classifer

In [28]:
from sklearn.ensemble import RandomForestClassifier
spam_classifier_RF = RandomForestClassifier()
spam_classifier_RF.fit(x_train_resampled, y_train_resampled)

  spam_classifier_RF.fit(x_train_resampled, y_train_resampled)


RandomForestClassifier()

In [29]:
pred_RF = spam_classifier_RF.predict(x_test)

In [30]:
print("Training Accuracy : ", spam_classifier_RF.score(x_train_resampled, y_train_resampled) )
print("Test Accuracy : ", spam_classifier_RF.score(x_test, y_test) )

Training Accuracy :  1.0
Test Accuracy :  0.9802690582959641


In [31]:
## Checking performance report
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(pred_RF, y_test)
cr = classification_report(pred_RF, y_test)
print("confusion_matrix : \n")
print(cm)
print("\n")
print("classification_report : \n")
print(cr)

confusion_matrix : 

[[950  11]
 [ 11 143]]


classification_report : 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       961
           1       0.93      0.93      0.93       154

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [32]:
## checking Cross validation score

from sklearn.model_selection import cross_val_score

scores_RF = cross_val_score(spam_classifier_RF, x_train_resampled, y_train_resampled, cv=10)
print(scores_RF)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


[0.95798319 0.93277311 0.94117647 0.97478992 0.95798319 0.93277311
 0.94915254 0.96610169 0.94067797 0.95762712]


## Conclusion

In [33]:
## Random Forest classifier is able to better classify SPAM texts from HAM texts.