In [1]:
import pandas as pd
import re
import nltk

In [2]:
# Read the tab-separated SMS file. Column names are supplied explicitly, so
# every row of the file (including the first) is treated as data.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

### Data Cleaning and Preprocessing

In [3]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
messages.shape

(5572, 2)

In [5]:
messages.isnull().sum()

label      0
message    0
dtype: int64

#### No null values

#### Next: text preprocessing

In [6]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [7]:
# Text normalisers used by the preprocessing loop.
ps, wnl = PorterStemmer(), WordNetLemmatizer()

# One cleaned string per message: stemmed and lemmatized variants.
corpus, corpus_lemmatize = [], []


In [8]:
# Build the stop-word set once. The original called stopwords.words('english')
# for every token (twice), producing a fresh list each time and scanning it
# linearly; a set built up front gives the same membership result far faster.
stop_words = set(stopwords.words('english'))

# Re-initialise the outputs so re-running this cell does not append duplicates.
corpus = []
corpus_lemmatize = []

# Iterate over the values directly instead of label-based messages['message'][i],
# which only works while the index is exactly 0..n-1.
for text in messages['message']:
    review = re.sub('[^a-zA-Z]', ' ', text)  # keep only a-z / A-Z characters
    tokens = review.lower().split()          # lower-case, then split into words
    tokens = [word for word in tokens if word not in stop_words]

    # Stemmed variant: join the stemmed words back into one string.
    corpus.append(' '.join(ps.stem(word) for word in tokens))

    # Lemmatized variant: join the lemmatized words back into one string.
    corpus_lemmatize.append(' '.join(wnl.lemmatize(word) for word in tokens))
    

#### Creating the bag of words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words vectorizer; max_features=2500 caps the vocabulary size.
cv = CountVectorizer(max_features=2500)

#### creating bag of words for stemmed data

In [11]:
# Fit the vocabulary on the stemmed corpus and get the sparse count matrix,
# then expand it to a dense array for the classifier.
stem_counts = cv.fit_transform(corpus)
X_stem = stem_counts.toarray()

#### creating bag of words for lemmatized data

In [12]:
# Use a dedicated vectorizer for the lemmatized corpus. Re-fitting the shared
# `cv` here would replace the vocabulary learned from the stemmed corpus, and
# `cv` is the object pickled alongside the model trained on stemmed features,
# so the saved transform would no longer match the saved model.
cv_lemmatize = CountVectorizer(max_features=2500)
X_lemmatize = cv_lemmatize.fit_transform(corpus_lemmatize).toarray()

### Using TFIDF model

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# TF-IDF vectorizer; max_features=2500 caps the vocabulary size.
tfidf = TfidfVectorizer(
    max_features=2500,
)

#### for stemmed data

In [15]:
# Fit TF-IDF on the stemmed corpus, then expand the sparse result to a dense array.
stem_tfidf = tfidf.fit_transform(corpus)
X_stem_tf = stem_tfidf.toarray()

#### for lemmatized data

In [16]:
# Use a dedicated vectorizer for the lemmatized corpus so the shared `tfidf`
# keeps the vocabulary and IDF weights it learned from the stemmed corpus.
# Re-fitting one object on two corpora leaves it matching only the last one.
tfidf_lemmatize = TfidfVectorizer(max_features=2500)
X_lemmatize_tf = tfidf_lemmatize.fit_transform(corpus_lemmatize).toarray()

#### converting Labels into dummy variable

In [17]:
# One indicator column per distinct label value.
y = pd.get_dummies(data=messages['label'])

In [18]:
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


##### The two columns above are redundant (the dummy variable trap), so only the `spam` column is kept as the label

In [19]:
# Binary target: 1 for spam, 0 for ham. Built straight from the label column
# rather than slicing the dummy frame by position, so the cell can be re-run
# (the original failed on a second run because `y` was already an ndarray) and
# does not depend on the column order or dtype that get_dummies produces.
y = (messages['label'] == 'spam').astype('uint8').values

In [20]:
y[0:5]

array([0, 0, 1, 0, 0], dtype=uint8)

## Model Building

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
from sklearn.naive_bayes import MultinomialNB

### Model building for Bag of Words

#### Model Building for Stemmed data

In [23]:
# Hold out 20% of the stemmed bag-of-words rows; the fixed seed keeps the split reproducible.
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(
    X_stem, y, test_size=0.2, random_state=0
)

In [24]:
# Multinomial Naive Bayes trained on the stemmed bag-of-words training split.
spam_detector_stem = MultinomialNB().fit(
    X_train_S,
    y_train_S,
)

In [25]:
# Predicted labels for the held-out stemmed bag-of-words rows.
y_pred_S = spam_detector_stem.predict(
    X_test_S,
)

In [26]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [27]:
accuracy_score(y_test_S,y_pred_S)

0.9856502242152466

In [28]:
confusion_matrix(y_test_S,y_pred_S)

array([[946,   9],
       [  7, 153]], dtype=int64)

#### Model Building for lemmatized data

In [29]:
# Hold out 20% of the lemmatized bag-of-words rows; the fixed seed keeps the split reproducible.
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(
    X_lemmatize, y, test_size=0.2, random_state=0
)

In [30]:
# Multinomial Naive Bayes trained on the lemmatized bag-of-words training split.
spam_detector_lemmatize = MultinomialNB().fit(
    X_train_L,
    y_train_L,
)

In [31]:
# Predicted labels for the held-out lemmatized bag-of-words rows.
y_pred_L = spam_detector_lemmatize.predict(
    X_test_L,
)

In [32]:
accuracy_score(y_test_L,y_pred_L)

0.9829596412556054

In [33]:
confusion_matrix(y_test_L,y_pred_L)

array([[946,   9],
       [ 10, 150]], dtype=int64)

### Model building for Tfidf 

#### Model Building for Stemmed data

In [34]:
# Hold out 20% of the stemmed TF-IDF rows with the same fixed seed.
# NOTE: this rebinds the X_train_S / X_test_S / y_train_S / y_test_S names
# that the bag-of-words section assigned earlier.
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(
    X_stem_tf, y, test_size=0.2, random_state=0
)

In [35]:
# Multinomial Naive Bayes trained on the stemmed TF-IDF training split.
spam_detector_stem_tf = MultinomialNB().fit(
    X_train_S,
    y_train_S,
)

In [36]:
# Predicted labels for the held-out stemmed TF-IDF rows.
y_pred_S = spam_detector_stem_tf.predict(
    X_test_S,
)

In [37]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [38]:
accuracy_score(y_test_S,y_pred_S)

0.979372197309417

In [39]:
confusion_matrix(y_test_S,y_pred_S)

array([[955,   0],
       [ 23, 137]], dtype=int64)

#### Model Building for lemmatized data

In [40]:
# Hold out 20% of the lemmatized TF-IDF rows with the same fixed seed.
# NOTE: this rebinds the X_train_L / X_test_L / y_train_L / y_test_L names
# that the bag-of-words section assigned earlier.
X_train_L, X_test_L, y_train_L, y_test_L = train_test_split(
    X_lemmatize_tf, y, test_size=0.2, random_state=0
)

In [41]:
# Multinomial Naive Bayes trained on the lemmatized TF-IDF training split.
spam_detector_lemmatize_tf = MultinomialNB().fit(
    X_train_L,
    y_train_L,
)

In [42]:
# Predicted labels for the held-out lemmatized TF-IDF rows.
y_pred_L = spam_detector_lemmatize_tf.predict(
    X_test_L,
)

In [43]:
accuracy_score(y_test_L,y_pred_L)

0.979372197309417

In [44]:
confusion_matrix(y_test_L,y_pred_L)

array([[954,   1],
       [ 22, 138]], dtype=int64)

### Generating the pickle files (vectorizer and model)

In [45]:
import pickle

In [48]:
# open a file, where you want to store the data
# Persist the fitted CountVectorizer. The context manager closes (and flushes)
# the file even if pickling raises; the original left the handle open.
# NOTE(review): `cv` must be the vectorizer fitted on the same corpus the
# pickled model was trained on — confirm before shipping the two files together.
with open('transform.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [50]:
# open a file, where you want to store the data
# Persist the bag-of-words / stemmed-corpus classifier. The context manager
# closes (and flushes) the file even if pickling raises; the original left the
# handle open.
with open('spam_detector.pkl', 'wb') as model_file:
    pickle.dump(spam_detector_stem, model_file)