## NLP (Natural Language Processing)

In [1]:
import pandas as pd
import numpy as np
import nltk # natural language tool kit
import matplotlib.pyplot as plt

In [2]:
messages = pd.read_csv("G:\datasets\spam.csv", encoding = 'cp1252')

In [3]:
messages.shape

(6776, 5)

In [4]:
messages = messages.iloc[:,[0,1]]

In [5]:
messages.shape

(6776, 2)

In [6]:
messages.rename(columns = {"v1":"label", "v2": "message"}, inplace = True)

In [7]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
length = messages.message.apply(len)

In [9]:
length

0       111
1        29
2       155
3        49
4        61
       ... 
6771    161
6772     37
6773     57
6774    125
6775     26
Name: message, Length: 6776, dtype: int64

In [10]:
len(length)

6776

In [11]:
messages = pd.concat([messages, length], axis = 1)

In [12]:
messages.columns.values[2] = "Length"

In [13]:
messages.head()

Unnamed: 0,label,message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [14]:
messages.label.replace({"ham":0, "spam":1}, inplace = True)

In [15]:
messages.message = messages.message.str.lower() #convert all charecters into lower or upper case as python is case sensetive

In [16]:
from nltk.corpus import stopwords

In [17]:
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
import string

In [20]:
string.punctuation #it shows all punctuation marks

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [21]:
def test_process(mess):
    """
    1. remove the punctuaion
    2. remove the stopwords
    3.return the list of clean textwords
    
    """
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = "".join(nopunc)
    
    return [ word for word in nopunc.split() if word not in stopwords.words("english")]

In [22]:
messages['message'].apply(test_process)

0       [go, jurong, point, crazy, available, bugis, n...
1                          [ok, lar, joking, wif, u, oni]
2       [free, entry, 2, wkly, comp, win, fa, cup, fin...
3           [u, dun, say, early, hor, u, c, already, say]
4       [nah, dont, think, goes, usf, lives, around, t...
                              ...                        
6771    [2nd, time, tried, 2, contact, u, u, å£750, po...
6772                   [ì, b, going, esplanade, fr, home]
6773                     [pity, mood, soany, suggestions]
6774    [guy, bitching, acted, like, id, interested, b...
6775                                   [rofl, true, name]
Name: message, Length: 6776, dtype: object

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
bow_transformer =    CountVectorizer(analyzer = test_process ).fit(messages["message"])

In [25]:
bow_transformer.vocabulary_

{'go': 3744,
 'jurong': 4640,
 'point': 6371,
 'crazy': 2459,
 'available': 1373,
 'bugis': 1836,
 'n': 5620,
 'great': 3841,
 'world': 9100,
 'la': 4799,
 'e': 2984,
 'buffet': 1834,
 'cine': 2174,
 'got': 3801,
 'amore': 1140,
 'wat': 8863,
 'ok': 5936,
 'lar': 4838,
 'joking': 4608,
 'wif': 8996,
 'u': 8527,
 'oni': 5968,
 'free': 3535,
 'entry': 3119,
 '2': 414,
 'wkly': 9052,
 'comp': 2290,
 'win': 9010,
 'fa': 3257,
 'cup': 2514,
 'final': 3381,
 'tkts': 8304,
 '21st': 434,
 'may': 5283,
 '2005': 421,
 'text': 8143,
 '87121': 836,
 'receive': 6769,
 'questionstd': 6660,
 'txt': 8511,
 'ratetcs': 6713,
 'apply': 1226,
 '08452810075over18s': 71,
 'dun': 2970,
 'say': 7123,
 'early': 2991,
 'hor': 4171,
 'c': 1885,
 'already': 1114,
 'nah': 5627,
 'dont': 2878,
 'think': 8217,
 'goes': 3758,
 'usf': 8657,
 'lives': 5000,
 'around': 1277,
 'though': 8236,
 'freemsg': 3543,
 'hey': 4067,
 'darling': 2578,
 '3': 518,
 'weeks': 8916,
 'word': 9086,
 'back': 1424,
 'id': 4290,
 'like': 4

In [26]:
len(bow_transformer.vocabulary_)

9422

In [27]:
messages_bow = bow_transformer.transform(messages.message)

In [28]:
messages_bow.shape

(6776, 9422)

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
x_train,x_test,y_train,y_test= train_test_split(messages_bow,messages.label, test_size =.2, random_state = 101)

In [31]:
from sklearn.naive_bayes import MultinomialNB


naive_bay = MultinomialNB()


spam_nb_model =  naive_bay.fit(x_train, y_train) #3 i have build the model

pred  =  naive_bay.predict(x_test)  # prediction


In [32]:
from sklearn.metrics import confusion_matrix
tab1 = confusion_matrix(pred , y_test)

In [33]:
tab1

array([[1161,    6],
       [  17,  172]], dtype=int64)

In [34]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc = dtc.fit(x_train, y_train)

In [35]:
pred_dtc = dtc.predict(x_test)

In [36]:
pred_dtc

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
confusionmatrix = confusion_matrix(pred_dtc,y_test) 

In [38]:
confusionmatrix

array([[1162,   22],
       [  16,  156]], dtype=int64)

In [40]:
(1162+156)/(1162+22+16+156)

0.971976401179941