In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
doc = nlp('John flew to Japan yesterday. he had his wife and $10000 with him')

In [4]:
for token in doc:
    print(token, " | ", token.pos_ , ' | ', spacy.explain(token.pos_))

John  |  PROPN  |  proper noun
flew  |  VERB  |  verb
to  |  ADP  |  adposition
Japan  |  PROPN  |  proper noun
yesterday  |  NOUN  |  noun
.  |  PUNCT  |  punctuation
he  |  PRON  |  pronoun
had  |  VERB  |  verb
his  |  PRON  |  pronoun
wife  |  NOUN  |  noun
and  |  CCONJ  |  coordinating conjunction
$  |  SYM  |  symbol
10000  |  NUM  |  numeral
with  |  ADP  |  adposition
him  |  PRON  |  pronoun


In [5]:
doc = nlp('hello world. i quit my job')

In [6]:
print(doc[4] , " | ", doc[4].tag_ , " | ", spacy.explain(doc[4].tag_))

quit  |  VBD  |  verb, past tense


In [7]:
for token in doc:
    print(token, " | ", token.pos_ , ' | ', token.tag_ , ' | ' , spacy.explain(token.pos_), " | ", spacy.explain(token.tag_))

hello  |  INTJ  |  UH  |  interjection  |  interjection
world  |  NOUN  |  NN  |  noun  |  noun, singular or mass
.  |  PUNCT  |  .  |  punctuation  |  punctuation mark, sentence closer
i  |  PRON  |  PRP  |  pronoun  |  pronoun, personal
quit  |  VERB  |  VBD  |  verb  |  verb, past tense
my  |  PRON  |  PRP$  |  pronoun  |  pronoun, possessive
job  |  NOUN  |  NN  |  noun  |  noun, singular or mass


In [8]:
doc.count_by(spacy.attrs.POS)

{91: 1, 92: 2, 97: 1, 95: 2, 100: 1}

In [9]:
doc.vocab[91].text

'INTJ'

In [10]:
doc.vocab[92].text

'NOUN'

In [11]:
text = ''' Video provides a powerful way to help you prove your point. When you click Online Video, you can paste in the embed code for the video you want to add. You can also type a keyword to search online for the video that best fits your document.
To make your document look professionally produced, Word provides header, footer, cover page, and text box designs that complement each other. For example, you can add a matching cover page, header, and sidebar. Click Insert and then choose the elements you want from the different galleries. $10000
Themes and styles also help keep your document coordinated. When you click Design and choose a new Theme, the pictures, charts, and SmartArt graphics change to match your new theme. When you apply styles, your headings change to match the new theme.
Save time in Word with new buttons that show up where you need them. To change the way a picture fits in your document, click it and a button for layout options appears next to it. When you work on a table, click where you want to add a row or a column, and then click the plus sign.
Reading is easier, too, in the new Reading view. You can collapse parts of the document and focus on the text you want. If you need to stop reading before you reach the end, Word remembers where you left off - even on another device.

'''

In [12]:
doc = nlp(text)

In [13]:
doc.count_by(spacy.attrs.POS)

{103: 6,
 96: 9,
 100: 49,
 90: 24,
 84: 12,
 92: 54,
 94: 9,
 95: 30,
 97: 34,
 98: 9,
 87: 5,
 85: 17,
 86: 10,
 89: 10,
 99: 1,
 93: 1}

In [14]:
doc

 Video provides a powerful way to help you prove your point. When you click Online Video, you can paste in the embed code for the video you want to add. You can also type a keyword to search online for the video that best fits your document.
To make your document look professionally produced, Word provides header, footer, cover page, and text box designs that complement each other. For example, you can add a matching cover page, header, and sidebar. Click Insert and then choose the elements you want from the different galleries. $10000
Themes and styles also help keep your document coordinated. When you click Design and choose a new Theme, the pictures, charts, and SmartArt graphics change to match your new theme. When you apply styles, your headings change to match the new theme.
Save time in Word with new buttons that show up where you need them. To change the way a picture fits in your document, click it and a button for layout options appears next to it. When you work on a table, c

In [15]:
for ent in doc.ents:
    print(ent.text ,' | ', ent.label_ , ' | ', spacy.explain(ent.label_))

Online Video  |  PERSON  |  People, including fictional
Click Insert  |  PERSON  |  People, including fictional
10000  |  MONEY  |  Monetary values, including unit
Theme  |  ORG  |  Companies, agencies, institutions, etc.
SmartArt  |  ORG  |  Companies, agencies, institutions, etc.


In [16]:
from spacy import displacy

In [17]:
import pandas as pd
df = pd.read_csv('spam.csv')

In [18]:
df.head(1)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."


In [19]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [20]:
df['spam'] = df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [21]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [22]:
from sklearn.model_selection import train_test_split

In [23]:
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
v = CountVectorizer()

In [26]:
x_train_cv = v.fit_transform(x_train.values)

In [27]:
x_train_cv[1:10]

<9x7420 sparse matrix of type '<class 'numpy.int64'>'
	with 126 stored elements in Compressed Sparse Row format>

In [28]:
x_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
x_train.shape

(4179,)

In [30]:
x_train_cv.shape

(4179, 7420)

In [35]:
v.get_feature_names_out()[4]

'0089'

In [37]:
import numpy as np

In [38]:
x_train_np = x_train_cv.toarray()

In [39]:
x_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
np.where(x_train_np[3] != 0 )

(array([ 496,  927, 1476, 1562, 2089, 2280, 3018, 3019, 3101, 3266, 3303,
        3582, 3600, 3612, 3804, 3946, 4178, 4236, 4401, 4433, 4496, 4630,
        4694, 4742, 4952, 4973, 5360, 5667, 5814, 6227, 6472, 7142, 7182,
        7311, 7367]),)

In [47]:
x_train[3]

'U dun say so early hor... U c already then say...'

In [50]:
from sklearn.naive_bayes import MultinomialNB

In [51]:
model = MultinomialNB()

In [52]:
model.fit(x_train_cv, y_train)

In [53]:
x_test_cv = v.transform(x_test)

In [54]:
y_pred = model.predict(x_test_cv)

In [55]:
y_pred[1]

np.int64(0)

In [62]:
y_test

2282    0
4477    0
2691    1
136     0
1058    0
       ..
2937    0
4965    1
4364    0
3762    0
2149    0
Name: spam, Length: 1393, dtype: int64

In [63]:
y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [72]:
mails = ['Introducing the slimmest Galaxy S smartphone yet, the new Galaxy S25 Edge', 'Top reasons to be at AU 2025', 'Top reasons to be at AU 2025']

In [73]:
mails_cv = v.transform(mails)

In [74]:
model.predict(mails_cv)

array([0, 0, 0])

In [75]:
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [76]:
model.fit(x_train, y_train)

In [77]:
model.predict(x_test)

array([0, 0, 1, ..., 0, 0, 0])

In [78]:
y_test

2282    0
4477    0
2691    1
136     0
1058    0
       ..
2937    0
4965    1
4364    0
3762    0
2149    0
Name: spam, Length: 1393, dtype: int64

In [79]:
from spacy.lang.en.stop_words import STOP_WORDS

In [81]:
STOP_WORDS


{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [82]:
len(STOP_WORDS)

326

In [83]:
nlp = spacy.load('en_core_web_sm')

In [84]:
doc = nlp('We are going to Patna for an educational trip. We would be there for a day.')

In [86]:
for token in doc:
    # print(token)
    if token.is_stop:
        print(token)

We
are
to
for
an
We
would
be
there
for
a


In [87]:
for token in doc:
    print(token)

We
are
going
to
Patna
for
an
educational
trip
.
We
would
be
there
for
a
day
.


In [88]:
for token in doc:
    # print(token)
    if token.is_stop:
        # print(token)
        pass
    else:
        print(token)

going
Patna
educational
trip
.
day
.


In [96]:
def preprocess(text):
    doc = nlp(text)
    no_stop = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop)

In [97]:
preprocess('''For new applicants of PAN card, the Aadhaar PAN linking is done automatically during the application stage. For existing PAN holders, who were allotted PAN on or before 01-07-2017 it is mandatory to link PAN with Aadhaar. The Link Aadhaar service is available to individual taxpayers (both registered and unregistered on e-Filing Portal). If you do not link your PAN with the Aadhaar till 30th June 2023, your PAN will become inoperative. However, people who fall under the exempted category will not be subject to the effects of PAN becoming inoperative.''')

'new applicants PAN card , Aadhaar PAN linking automatically application stage . existing PAN holders , allotted PAN 01 - 07 - 2017 mandatory link PAN Aadhaar . Link Aadhaar service available individual taxpayers ( registered unregistered e - Filing Portal ) . link PAN Aadhaar till 30th June 2023 , PAN inoperative . , people fall exempted category subject effects PAN inoperative .'

In [91]:
preprocess('Hello, my name is nashit. i am a human. i also')

'Hello , nashit . human .'