In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.8+ KB


In [5]:
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""",GE,"GNT:-)"""
freq,4825,30,3,2,2


In [6]:
messages = data['v2']

In [7]:
messages.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: v2, dtype: object

In [8]:
lables = data['v1']

In [9]:
lables.value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

##### NOTE:  By above metrics we can conclude this is Imbalanced dataset, first we will build model with Imbalanced dataset than will come to Imbalanced dataset problem.

### We have to perform below steps on data
##### 1.Remove Stopwords.
##### 2.Remove punctuations and special charecters.
##### 3.Make all words to lowercase
##### 4.Stemming
##### 5.Lemmatization

# 1. Removing Stop words

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prasad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
stop_words = set(stopwords.words('english'))

In [12]:
len(messages)

5572

In [13]:
corpus = []
for message in messages:
#     print(message, "Actual message")
    message = message.split()
#     print(message, "Actual message after split")
    good_words = [ word for word in message if not word in stopwords.words('english')]
    good_words = " ".join(good_words)
    corpus.append(good_words)

# 2.Removing Special characters and punctuations

#### 2.1 First I tried to remove special characters by using regex but later i observed some http urls and  lessthan/greaterthan symbols in messages so i decided to select only text from messages

In [14]:
import re
cleaned_corpus = []
for message in corpus:
    cleaned = re.sub(r'[?|#|\'|"|!]', r' ', message)
    cleaned = re.sub(r"[,|.|(|)|']", r' ', cleaned)
    cleaned_corpus.append(cleaned)

#### 2.2 selecting only letters from messages so remaining all we can consider as special characters

In [15]:
cleaned_corpus2 = []
for message in corpus:
    cleaned_message = re.sub(r'[^a-zA-Z]', r' ', message)
    cleaned_corpus2.append(cleaned_message)

## 3.Making all words to Lower case

In [16]:
cleaned_corpus3 = []
for message in cleaned_corpus2:
    cleaned_message = message.lower()
    cleaned_corpus3.append(cleaned_message)

## 4.Stemming

In [17]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')

In [18]:
final_corpus = []
for message in cleaned_corpus3:
    split_message = message.split()
    stemmed_message = [stemmer.stem(msg) for msg in split_message]
    msg = " ".join(stemmed_message)
    final_corpus.append(msg)

# BOW

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(final_corpus).toarray()

In [20]:
X.shape

(5572, 5000)

In [21]:
lables.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: v1, dtype: object

In [22]:
y = pd.get_dummies(lables, drop_first=True)

In [23]:
y.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [24]:
len(y)

5572

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [26]:
from sklearn.naive_bayes import MultinomialNB
spam_detector = MultinomialNB()
spam_detector.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
spam_detector.score(X_train, y_train)

0.992581957406078

In [28]:
y_pred = spam_detector.predict(X_test)

In [29]:
spam_detector.score(X_test, y_test)

0.9806173725771715

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [31]:
confusion_matrix(y_test, y_pred)

array([[1199,   15],
       [  12,  167]], dtype=int64)

In [32]:
accuracy_score(y_test, y_pred)

0.9806173725771715

## Building Pipeline to take input from user and predict with model

#### - Need to apply all Text preprocessing techniques on user input data for that i am creating function text_preprocessing 

In [37]:
def text_preprocessing(message):
    words = message.split()  #splitting entire message into words
    non_stop_words = [stemmer.stem(word) for word in words if not word in stopwords.words('english')] #Removing stop words from message
    non_stop_message = " ".join(non_stop_words)
    non_stop_message = re.sub(r'[^a-zA-Z]', r' ', non_stop_message)
    non_stop_message = re.sub(r"[.|,|']", r' ', non_stop_message)
    lower_case_message = non_stop_message.lower()
    return lower_case_message

In [41]:
message = "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times."
cleaned_message = text_preprocessing(message)

#### Writing function to convert cleaned message to vector by using sklearn BOW method

In [46]:
def text_to_vector(message):
    vector = cv.fit_transform(message).toarray()
    return vector

In [47]:
cleaned_message

'i v search right word thank breather  i promis wont take help grant fulfil promise  you wonder bless times '

In [48]:
vector = text_to_vector([cleaned_message])

In [49]:
vector

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

In [50]:
spam_detector.predict(vector)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 5000 is different from 16)

In [52]:
X_test.shape

(1393, 5000)