In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the tab-separated SMS corpus. The file carries no header row, so the
# two column names are supplied here.
# NOTE(review): default quoting treats '"' as a quote character; messages with
# unbalanced quotes may be merged into one row - confirm the row count against
# the source file.
df = pd.read_csv(
    "smsspam.txt",
    sep="\t",
    header=None,
    names=["Status", "Message"],
)

In [3]:
# Preview the first rows to confirm the label and message columns parsed as expected.
df.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Total number of messages loaded.
df.shape[0]

5572

In [5]:
# Number of messages labelled as spam, counted from the boolean mask directly
# rather than by materialising a filtered copy of the frame.
int((df["Status"] == "spam").sum())

747

In [6]:
# Encode the labels numerically: ham -> 1, spam -> 0.
# Series.map builds the encoded column in one pass, and the explicit cast gives
# it a real integer dtype. Assigning 1/0 into the string column through .loc
# leaves it dtype=object, which scikit-learn classifiers reject as an unknown
# label type.
# The identity entries (1 -> 1, 0 -> 0) keep this cell safe to re-run once the
# column is already encoded. Any other value becomes NaN, so the cast fails
# loudly instead of letting an unexpected label pass through silently.
df["Status"] = df["Status"].map({"ham": 1, "spam": 0, 1: 1, 0: 0}).astype("int64")

In [7]:
# Re-check the first rows to confirm the Status column now holds numeric labels.
df.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Split the frame into model inputs (message text) and targets (encoded label).
df_x, df_y = df["Message"], df["Status"]

In [9]:
# Display the message texts (pandas abbreviates the long Series).
df_x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [10]:
# Bag-of-words vectorizer with default settings; it is fitted on the training
# split in a later cell.
cv=CountVectorizer()

In [11]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

In [12]:
# Learn the vocabulary from the training messages only and convert them into a
# matrix of per-message token counts.
x_traincv=cv.fit_transform(x_train)

In [13]:
# Densify the count matrix into a NumPy array so individual rows can be
# inspected; memory use grows with (messages x vocabulary size).
a=x_traincv.toarray()

In [14]:
# Show the dense document-term counts (NumPy abbreviates the large array).
a

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# Count vector of the first training message.
a[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [16]:
# Map the first row of the count matrix back to the tokens it contains.
# The slice keeps the row two-dimensional (one sample by all features).
cv.inverse_transform(a[:1])

[array(['checking', 'going', 'got', 'haha', 'lor', 'mails', 'me', 'now',
        'online', 'or', 'replying', 'sleeping', 'spys', 'take', 'to',
        'wat', 'you'], dtype='<U27')]

In [17]:
# Original text of the first training message, for comparison with the tokens
# recovered from its count vector.
x_train.iloc[0]

'U sleeping now.. Or you going to take? Haha.. I got spys wat.. Me online checking n replying mails lor..'

In [24]:
# Inspect the fitted vocabulary: a dict mapping each token to its column index
# in the count matrix.
cv.vocabulary_

{'sleeping': 6225,
 'now': 4852,
 'or': 4990,
 'you': 7719,
 'going': 3174,
 'to': 6944,
 'take': 6700,
 'haha': 3296,
 'got': 3200,
 'spys': 6429,
 'wat': 7413,
 'me': 4415,
 'online': 4958,
 'checking': 1766,
 'replying': 5720,
 'mails': 4327,
 'lor': 4212,
 'how': 3515,
 'long': 4197,
 'has': 3342,
 'it': 3752,
 'been': 1297,
 'since': 6178,
 'screamed': 5963,
 'princess': 5426,
 'urgent': 7220,
 'call': 1612,
 '09066612661': 233,
 'from': 3023,
 'landline': 4004,
 'your': 7725,
 'complementary': 1935,
 'tenerife': 6780,
 'holiday': 3462,
 '10': 252,
 '000': 1,
 'cash': 1682,
 'await': 1173,
 'collection': 1901,
 'sae': 5876,
 'cs': 2098,
 'po': 5278,
 'box': 1462,
 'wa14': 7360,
 '2px': 413,
 '150ppm': 305,
 '18': 318,
 'sender': 6017,
 'hol': 3457,
 'offer': 4911,
 'okay': 4931,
 'no': 4806,
 'just': 3871,
 'shining': 6095,
 'on': 4948,
 'that': 6825,
 'was': 7407,
 'meant': 4423,
 'be': 1276,
 'signing': 6166,
 'but': 1580,
 'sounds': 6351,
 'better': 1336,
 'wen': 7472,
 'ur': 7

In [20]:
# Wrap the dense counts in a DataFrame whose columns are labelled with the
# vectorizer's tokens, giving a readable document-term table.
Doc_Term_Matrix = pd.DataFrame(
    data=a,
    columns=cv.get_feature_names_out(),
)

In [21]:
# Display the document-term table (pandas abbreviates rows and columns).
Doc_Term_Matrix

Unnamed: 0,00,000,000pes,008704050406,0125698789,02,0207,02072069400,02073162414,02085076972,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4452,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4454,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Raw text of the training message at position 2.
x_train.iloc[2]

'Urgent! call 09066612661 from landline. Your complementary 4* Tenerife Holiday or £10,000 cash await collection SAE T&Cs PO Box 3 WA14 2PX 150ppm 18+ Sender: Hol Offer'