In [6]:
# Load the SMS spam dataset: a tab-separated file read without a header row,
# with the two column names supplied explicitly.
import pandas as pd

DATA_URL = (
    "https://raw.githubusercontent.com/sunnysavita10/Naive-Bayes/main/"
    "SpamClassifier-with-ML/sms_spam_data/SMSSpamCollection.csv"
)
data = pd.read_csv(DATA_URL, sep="\t", header=None, names=["labels", "messages"])

# In the above cell, why do we read the data with sep="\t"?
Ans: The dataset is tab-delimited, so we pass sep="\t" ("\t" is the escape sequence for a tab character). header=None tells pandas that the file has no header row, so the first line is read as data rather than as column names, and names=[...] supplies the column names to use.

# Agenda

 - text classification
 - text generation
 - text summarization
 - question answering
 - spot keyword
 - fill blanks
 - topic modeling
 - text similarity


In [65]:
#Displaying first 5 records of dataset in form of dataframe
data.head()

Unnamed: 0,labels,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#First message
data["messages"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [9]:
#Second message
data["messages"][1]

'Ok lar... Joking wif u oni...'

In [10]:
#Third message
data["messages"][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

# Above we can clearly observe that the messages contain a lot of noise, for example runs of dots (...). This will be handled later using regular expressions.

In [13]:
import nltk  # NLTK (Natural Language Toolkit) provides common NLP building blocks such as stopword lists,
             # stemmers and lemmatizers.

import re    # re is the regular expression library, used here for pattern matching and substitution
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # PorterStemmer implements Porter stemming,
                                           # a rule-based suffix-stripping algorithm.
nltk.download("stopwords")
# nltk.download("<resource>") fetches an NLTK resource that is not yet available locally; the
# resource name (here "stopwords") is passed as a string.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# stopwords.words('english') returns the list of stopwords that NLTK provides for the English language.
# The argument 'english' selects the language of the stopword list; it is not a word being looked up.

stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
data["messages"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

#### Regular expressions are a sequence of characters that define a search pattern. They can be used to search, match, and manipulate text strings based on specific patterns. The re module allows you to perform various operations using regular expressions, such as pattern matching, substitution, and splitting strings.

In [18]:
# Replace every character that is not a letter a-z / A-Z with a space
# (the leading ^ inside [...] negates the character class).
re.sub('[^a-zA-Z]',' ',data["messages"][0])


'Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   '

# Here, we are using the substitute method (`re.sub`) of the regular expression library, where:
 - 1st parameter '[^a-zA-Z]' = the pattern to search for. Inside square brackets a leading ^ negates the character class, so this pattern matches every single character that is NOT a lower or upper case English letter (digits, punctuation, whitespace, etc.).

 - 2nd parameter ' ' = the replacement string; every character matched by the pattern in the 1st parameter is replaced with it.
 
 - 3rd parameter data["messages"][0] = the text on which the substitution is performed.
 
In the above example we can clearly observe that before the substitution the 0th message contained a lot of noise such as '.' and ','. After the substitution each of those characters has been replaced with ' ', while the letters are left untouched.

In [19]:
# After re.sub(), convert the cleaned text to lower case

re.sub('[^a-zA-Z]',' ',data["messages"][0]).lower()

'go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   '

In [21]:
# split() with no arguments splits on runs of whitespace and discards empty strings, so the
# repeated spaces left behind by re.sub do not produce empty tokens. The result is a list of words.

words=re.sub('[^a-zA-Z]',' ',data["messages"][0]).lower().split()

In [22]:
# Display the list containing the individual words

words

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [24]:
# Display the list of words after removing the stopwords

[word for word in words if word not in stopwords.words("english")]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [25]:
#Creating object of PorterStemmer for performing suffix stripping which is a stemming approach

ps=PorterStemmer()

In [26]:
ps=PorterStemmer()

# Remove the stopwords, then stem each remaining word with the Porter stemmer
[ps.stem(word) for word in words if word not in stopwords.words("english")]

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

# Above we can clearly observe that available, crazy, bugis and amore get stemmed to avail, crazi, bugi and amor respectively

In [28]:
#Using join operation with space separation and generating the final sentence or document

" ".join([ps.stem(word) for word in words if word not in stopwords.words("english")])

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [10]:
import re
string="hello, world, hello"
pattern=r"hello"   # the r prefix makes this a raw string literal (backslashes are kept as-is),
                   # which is the usual way to write regular expression patterns

result=re.search(pattern,string) # re.search scans the string for the first occurrence of the pattern
                                 # and returns a Match object, or None if there is no match

In [11]:
result
# The output below is a re.Match object: span=(0, 5) means the match starts at index 0 and
# ends just before index 5, and match='hello' is the matched text.

# Note that re.search returns only the first occurrence of the pattern; the second 'hello'
# in the string is not reported.

<re.Match object; span=(0, 5), match='hello'>

In [31]:
# re.search returned a Match object, which is truthy, so the `if` branch runs

if result:
    print("pattern found")
else:
    print("pattern not found")

pattern found


In [12]:
string="hello, world"
pattern=r"sunny"

result=re.search(pattern,string)

In [13]:
result

In [14]:
# In this example the pattern does not occur in the string, so re.search returned None.
# None is falsy, hence the `else` branch prints "pattern not found".

if result:
    print("pattern found")
else:
    print("pattern not found")

pattern not found


In [36]:
string="hello,world"
pattern=r"world"
replacement="python"

re.sub(pattern,replacement,string)
# re.sub returns a copy of the string in which every match of the pattern (written here as a
# raw string) is replaced with the string passed as the replacement argument.

'hello,python'

# Some use cases of regular expressions:

 1. email validation
 2. URL extraction
 3. phone number extraction
 4. password validation
 5. HTML signup/login page field validation

In [39]:
# Full preprocessing pipeline applied to every message, combining the steps shown above:
#   1. replace every non-letter character with a space
#   2. convert to lower case
#   3. split on whitespace into tokens
#   4. drop English stopwords and stem the remaining tokens
#   5. join the stems back into one space-separated string

# Build the stopword set once. The original evaluated stopwords.words("english") once per
# token inside the comprehension; a set also gives constant-time membership tests.
english_stopwords=set(stopwords.words("english"))

corpus=[]
for message in data["messages"]:
    review=re.sub('[^a-zA-Z]'," ",message)
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if word not in english_stopwords]
    review=" ".join(review)
    corpus.append(review)
    

In [41]:
data['messages'][40]

'Pls go ahead with watts. I just wanted to be sure. Do have a great weekend. Abiola'

In [42]:
corpus[40]

'pl go ahead watt want sure great weekend abiola'

In [43]:
#CountVectorizer uses Bag of Word Text to vector conversion approach

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# In the Bag-of-Words (BoW) representation each feature (column) corresponds to one word of the vocabulary

from sklearn.feature_extraction.text import CountVectorizer

# max_features=2500 limits the vocabulary to the 2500 most frequent terms in the corpus.
# A smaller vocabulary reduces the dimensionality of the feature matrix, which lowers memory
# use and can help reduce overfitting on rare words.

cv=CountVectorizer(max_features=2500)

X=cv.fit_transform(corpus).toarray()  # Independent features
# fit_transform learns the vocabulary from the corpus and returns a sparse matrix of term counts;
# .toarray() converts it into a dense NumPy array (one row per message, one column per term).

y=pd.get_dummies(data['labels'],drop_first=True)  # Dependent feature (target)
# get_dummies one-hot encodes the label column; drop_first=True drops the column of the first
# category, leaving a single column for the other one. See the markdown below for more details.



 
# Read this beautiful article to understand more about .get_dummies:
https://www.sharpsightlabs.com/blog/pandas-get-dummies/#:~:text=Report%20Ad-,drop_first,of%20the%20input%20categorical%20variable
 
 - A dummy variable is a numeric variable that encodes categorical information.

 - Dummy variables have two possible values: 0 or 1.

- In a dummy variable:
     -  1 encodes the presence of a category
     - 0 encodes the absence of a category

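 - drop_first=True: it is a common convention to drop the dummy variable for the first level of the categorical variable being encoded, because that column is redundant — its value can be inferred from the remaining columns. Here the labels are ham and spam, so only the `spam` column is kept (1 = spam, 0 = ham).

 - (Dropping it is frequently required for some types of machine learning models. If you keep the redundant dummy variable, the perfectly correlated columns can cause issues with your model.)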

In [52]:
X=cv.fit_transform(corpus).toarray()

In [53]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [54]:
X.shape

(5572, 2500)

In [49]:
data.shape

(5572, 2)

In [56]:
data.labels

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: labels, Length: 5572, dtype: object

In [59]:
y=pd.get_dummies(data['labels'],drop_first=True)

In [60]:
y

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0
...,...
5567,1
5568,0
5569,0
5570,0


In [61]:
from sklearn.model_selection import train_test_split

In [63]:
# Split the data into training (75%) and test (25%) sets; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): the CountVectorizer was fitted on the whole corpus before this split, so the
# vocabulary was chosen using the test messages as well — consider fitting it on the training
# split only for a stricter evaluation.

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=10)

In [64]:
# Multinomial Naive Bayes models discrete count features, which is exactly what the
# bag-of-words matrix produced by CountVectorizer contains (word counts per message),
# so it is a natural baseline for this text classification task.


from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# y_train / y_test are single-column DataFrames (the output of pd.get_dummies). Flatten them
# to 1-D arrays so scikit-learn receives labels in the shape it expects and does not emit a
# conversion warning for a column vector during fit.
y_train_1d=y_train.values.ravel()
y_test_1d=y_test.values.ravel()

model=MultinomialNB()
model.fit(X_train,y_train_1d)
print(model.score(X_train,y_train_1d))  #training accuracy score
y_pred=model.predict(X_test)
print(accuracy_score(y_test_1d,y_pred)) #Testing accuracy score


  y = column_or_1d(y, warn=True)


0.9899497487437185
0.9770279971284996


In [67]:
data["messages"][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [68]:
# A new, unseen message on which we want to perform classification (Spam=1/Ham=0).
# NOTE(review): this rebinds `data`, which held the DataFrame loaded at the top of the
# notebook; earlier cells that index data["messages"] will fail if re-run after this cell.

data="Thanks gaurav for the update. This provides clear visibility for the next release."

In [69]:
data

'Thanks gaurav for the update. This provides clear visibility for the next release.'

In [70]:
# Passing the raw string straight to the model fails: the model was trained on the numeric
# bag-of-words matrix, so new text must first go through the same preprocessing (regex
# cleanup, lower-casing, stopword removal, stemming) and the fitted CountVectorizer.
# The expected ValueError is caught and printed so that this demonstration does not stop a
# "Run All" execution of the notebook.

try:
    model.predict(data)
except ValueError as err:
    print("ValueError:", err)

ValueError: Expected 2D array, got scalar array instead:
array=Thanks gaurav for the update. This provides clear visibility for the next release..
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [72]:
# Apply the same preprocessing used for the training corpus to the new message, so that it
# can be vectorized and passed to the Multinomial Naive Bayes model for prediction.

# Built once here; the original evaluated stopwords.words("english") once per token.
english_stopwords=set(stopwords.words("english"))

review=re.sub('[^a-zA-Z]'," ",data)   # replace every non-letter character with a space
review=review.lower()
review=review.split()                  # split on whitespace into tokens
review=[ps.stem(word) for word in review if word not in english_stopwords]
review=" ".join(review)
    
    

In [77]:
# One-element corpus holding only the preprocessed new message.
# This rebinds `corpus`, which previously held the preprocessed training messages.
corpus=[review]

In [78]:
corpus

['thank gaurav updat provid clear visibl next releas']

In [80]:
# transform (not fit_transform) reuses the vocabulary already learned by `cv`, so the new
# message is encoded with the same columns the model was trained on.
tfdata=cv.transform(corpus).toarray()

In [82]:
model.predict(tfdata)[0]

# The model predicted class 1 for the new message. The target column produced by get_dummies
# is named `spam`, so 1 means spam and 0 means ham.
# NOTE(review): this message reads like ordinary (ham) text, so a spam prediction is
# surprising — worth investigating before relying on the model.

1

# For self-practice, you may try the following alternatives on the same dataset. Together they form another approach to text classification:

1. word_tokenizer = Apply this instead of .split() for tokenizing, i.e. converting sentences into words.
2. word lemmatizer = Apply lemmatization instead of the stemming used in the current approach.
3. tfidf = Apply TF-IDF instead of the currently used CountVectorizer (bag of words) for converting text into vectors.
4. Gaussian naive Bayes = Apply this instead of the currently used multinomial naive Bayes for building the text classification model.

https://github.com/sunnysavita10/Naive-Bayes/tree/main/Natural-Language-Processing

Above GitHub repo can be useful in implementing the same

The above approach may give improved results.
