In [356]:
import pandas as pd
import re
import nltk

In [357]:
# Load the tab-separated SMS dataset. Column names are passed explicitly,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): default CSV quoting is in effect here; if raw messages contain
# unbalanced double quotes, adjacent rows could be merged. Confirm the row
# count against the source file (quoting=csv.QUOTE_NONE would disable this).
messages = pd.read_csv('SMSSpamCollection',sep='\t',names =[ 'label','message'])

### Data Cleaning and Preprocessing

In [358]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [359]:
messages.shape

(5572, 2)

In [360]:
messages.isnull().sum()

label      0
message    0
dtype: int64

#### No null values

#### Next: text preprocessing

In [361]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [362]:
# Text normalizers used by the preprocessing loop
ps, wnl = PorterStemmer(), WordNetLemmatizer()

# Cleaned messages, one string per SMS: stemmed and lemmatized variants
corpus, corpus_lemmatize = [], []


In [363]:
# Build the stop-word lookup once. Calling stopwords.words('english') for every
# token re-reads the word list and scans it linearly, which dominates the
# run time of this cell; a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Re-create the output lists here so that re-running this cell does not
# append every message a second time.
corpus = []
corpus_lemmatize = []

for text in messages['message']:
    # Keep only a-z / A-Z characters, lower-case, and split into words
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Drop stop words once; both variants below share the filtered tokens
    words = [word for word in words if word not in english_stopwords]
    # Stemmed variant, joined back into a single sentence
    corpus.append(' '.join(ps.stem(word) for word in words))
    # Lemmatized variant, joined back into a single sentence
    corpus_lemmatize.append(' '.join(wnl.lemmatize(word) for word in words))
    

#### Creating the bag of words

In [364]:
from sklearn.feature_extraction.text import CountVectorizer

In [365]:
# Bag-of-words vectorizer whose vocabulary is capped at 2500 terms
cv = CountVectorizer(
    max_features=2500,
)

#### creating bag of words for stemmed data

In [366]:
# Learn the vocabulary from the stemmed corpus and build a dense count matrix.
# NOTE(review): the vocabulary is fit on every message, before the train/test
# split is made, so held-out messages contribute to the feature set.
X_stem = cv.fit_transform(corpus).toarray()

#### creating bag of words for lemmatized data

In [367]:
# Use a dedicated vectorizer for the lemmatized corpus. Calling fit_transform
# on `cv` again would replace the vocabulary it learned from the stemmed
# corpus, leaving no fitted vectorizer that matches X_stem.
cv_lemmatize = CountVectorizer(max_features=2500)
X_lemmatize = cv_lemmatize.fit_transform(corpus_lemmatize).toarray()

### Using the TF-IDF model

In [368]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [369]:
# TF-IDF vectorizer whose vocabulary is capped at 2500 terms
tfidf = TfidfVectorizer(
    max_features=2500,
)

#### for stemmed data

In [370]:
# Learn vocabulary and IDF weights from the stemmed corpus; dense TF-IDF matrix.
# NOTE(review): fit on every message, before the train/test split is made.
X_stem_tf = tfidf.fit_transform(corpus).toarray()

#### for lemmatized data

In [371]:
# Use a dedicated vectorizer for the lemmatized corpus. Calling fit_transform
# on `tfidf` again would replace the vocabulary and IDF weights it learned
# from the stemmed corpus, leaving no fitted vectorizer that matches X_stem_tf.
tfidf_lemmatize = TfidfVectorizer(max_features=2500)
X_lemmatize_tf = tfidf_lemmatize.fit_transform(corpus_lemmatize).toarray()

#### converting Labels into dummy variable

In [372]:
# One-hot encode the label column: one indicator column per class
y=pd.get_dummies(messages['label'])

In [373]:
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


##### The two columns above are redundant (the dummy variable trap), so only the `spam` column is kept as the target

In [374]:
# Binary target: 1 for spam, 0 for ham.
# Derived directly from the label column instead of from the previous value of
# `y`, so the cell can be re-run safely, does not depend on the position of the
# dummy columns, and yields the same integer dtype on every pandas version.
y = (messages['label'] == 'spam').astype('uint8').values

In [375]:
y[0:5]

array([0, 0, 1, 0, 0], dtype=uint8)

## Model Building

In [376]:
from sklearn.model_selection import train_test_split

In [377]:
from sklearn.naive_bayes import MultinomialNB

### Model building for Bag of Words

#### Model Building for Stemmed data

In [378]:
# Hold out 20% of the stemmed bag-of-words rows for testing (fixed seed)
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(
    X_stem,
    y,
    test_size=0.2,
    random_state=0,
)

In [379]:
# Multinomial Naive Bayes classifier trained on the stemmed training split
spam_detector_stem = MultinomialNB()
spam_detector_stem.fit(X_train_S, y_train_S)

In [380]:
# Predict labels for the held-out messages
y_pred_S = spam_detector_stem.predict(X_test_S)

In [381]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [382]:
accuracy_score(y_test_S,y_pred_S)

0.9856502242152466

In [383]:
# Rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_matrix(y_test_S,y_pred_S)

array([[946,   9],
       [  7, 153]], dtype=int64)

#### Model Building for lemmatized data

In [384]:
# Hold out 20% of the lemmatized bag-of-words rows for testing (fixed seed)
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(
    X_lemmatize,
    y,
    test_size=0.2,
    random_state=0,
)

In [385]:
# Multinomial Naive Bayes classifier trained on the lemmatized training split
spam_detector_lemmatize = MultinomialNB()
spam_detector_lemmatize.fit(X_train_L, y_train_L)

In [386]:
# Predict labels for the held-out messages
y_pred_L = spam_detector_lemmatize.predict(X_test_L)

In [387]:
accuracy_score(y_test_L,y_pred_L)

0.9829596412556054

In [388]:
# Rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_matrix(y_test_L,y_pred_L)

array([[946,   9],
       [ 10, 150]], dtype=int64)

### Model building for TF-IDF

#### Model Building for Stemmed data

In [389]:
# Hold out 20% of the stemmed TF-IDF rows for testing (fixed seed).
# These names were already used for the bag-of-words split, which is
# overwritten from this point on.
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(
    X_stem_tf,
    y,
    test_size=0.2,
    random_state=0,
)

In [390]:
# Multinomial Naive Bayes classifier trained on the current stemmed training split
spam_detector_stem = MultinomialNB()
spam_detector_stem.fit(X_train_S, y_train_S)

In [391]:
# Predict labels for the held-out messages
y_pred_S = spam_detector_stem.predict(X_test_S)

In [392]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [393]:
accuracy_score(y_test_S,y_pred_S)

0.979372197309417

In [394]:
# Rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_matrix(y_test_S,y_pred_S)

array([[955,   0],
       [ 23, 137]], dtype=int64)

#### Model Building for lemmatized data

In [395]:
# Hold out 20% of the lemmatized TF-IDF rows for testing (fixed seed).
# These names were already used for the bag-of-words split, which is
# overwritten from this point on.
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(
    X_lemmatize_tf,
    y,
    test_size=0.2,
    random_state=0,
)

In [396]:
# Multinomial Naive Bayes classifier trained on the current lemmatized training split
spam_detector_lemmatize = MultinomialNB()
spam_detector_lemmatize.fit(X_train_L, y_train_L)

In [397]:
# Predict labels for the held-out messages
y_pred_L = spam_detector_lemmatize.predict(X_test_L)

In [398]:
accuracy_score(y_test_L,y_pred_L)

0.979372197309417

In [399]:
# Rows are true labels, columns are predicted labels (0 = ham, 1 = spam)
confusion_matrix(y_test_L,y_pred_L)

array([[954,   1],
       [ 22, 138]], dtype=int64)