In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Bigram-only vectorizer (ngram_range=(2, 2)): learn the vocabulary from a
# single sentence and list the two-word features it produced.
v = CountVectorizer(ngram_range=(2, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.get_feature_names_out()

array(['for job', 'hathodawala is', 'is looking', 'looking for',
       'thor hathodawala'], dtype=object)

In [2]:
# Toy corpus of four short sentences; it is preprocessed and vectorized in the
# cells that follow.
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza",
    "Talha is a boy"
]

In [3]:
import spacy


# Load the "en_core_web_sm" spaCy pipeline once; preprocess() below reads this
# module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    """Return `text` reduced to its lemmas, joined by single spaces.

    The text is parsed with the module-level spaCy pipeline `nlp`. Tokens
    flagged as stop words or punctuation are dropped; every remaining token
    contributes its `lemma_`.
    """
    kept_lemmas = [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]
    return " ".join(kept_lemmas)

In [4]:
# Quick check of preprocess() on a single sentence before applying it to the corpus.
preprocess("Thor ate pizza")

'thor eat pizza'

In [5]:
# Run every sentence of the toy corpus through preprocess() and display the
# cleaned strings.
corpus_processed = [preprocess(sentence) for sentence in corpus]
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza', 'Talha boy']

In [6]:
# Refit on the preprocessed corpus with ngram_range=(1, 2) so the vocabulary
# contains both unigrams and bigrams, then display the term -> column-index
# mapping. (The feature-name array is shown by the next cell, so it is not
# computed here only to be discarded.)
v = CountVectorizer(ngram_range=(1, 2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 10,
 'eat': 1,
 'pizza': 6,
 'thor eat': 11,
 'eat pizza': 2,
 'loki': 3,
 'tall': 9,
 'loki tall': 5,
 'loki eat': 4,
 'talha': 7,
 'boy': 0,
 'talha boy': 8}

In [7]:
# Feature names in column order: position i in this array is the term whose
# index is i in v.vocabulary_ shown above.
v.get_feature_names_out()

array(['boy', 'eat', 'eat pizza', 'loki', 'loki eat', 'loki tall',
       'pizza', 'talha', 'talha boy', 'tall', 'thor', 'thor eat'],
      dtype=object)

In [8]:
# The bigram 'talha eat' is not in the fitted vocabulary, so only the known
# terms ('eat', 'eat pizza', 'pizza', 'talha') get a count.
v.transform(['Talha eat pizza']).toarray()

array([[0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0]], dtype=int64)

In [9]:
# Every unigram and bigram of this sentence is in the vocabulary, so
# 'eat', 'eat pizza', 'pizza', 'thor' and 'thor eat' are all counted.
v.transform(['Thor eat pizza']).toarray()

array([[0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1]], dtype=int64)

In [10]:
import spacy
# Second toy corpus: five short documents dominated by stop words, used below
# to show what remains after preprocessing.
corpus2 = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
     'Talha is a friend of abu huaraira'
]

In [11]:
# NOTE(review): this loads the same "en_core_web_sm" pipeline that cell In [3]
# already bound to `nlp`; when cells run in order this reload is redundant.
nlp = spacy.load("en_core_web_sm")
def preprocess(text0):
    """Lemmatize `text0` and strip stop words and punctuation.

    Parses the input with the module-level spaCy pipeline `nlp` and returns
    the lemmas of the surviving tokens as one space-separated string (an
    empty string when nothing survives).
    """
    parsed = nlp(text0)
    return " ".join(
        word.lemma_
        for word in parsed
        if not word.is_stop and not word.is_punct
    )

In [12]:
# Preprocess every document of corpus2 and display the results; documents made
# up only of stop words and punctuation come back as empty strings.
corpus_processed0 = [preprocess(document) for document in corpus2]
corpus_processed0

['document',
 'document second document',
 '',
 'document',
 'Talha friend abu huaraira']

In [13]:
# Default CountVectorizer: learn the vocabulary from the preprocessed corpus
# and build its document-term matrix X in a single step, then show the
# term -> column mapping and the feature names in column order.
v = CountVectorizer()
X = v.fit_transform(corpus_processed0)

print(v.vocabulary_)
v.get_feature_names_out()

{'document': 1, 'second': 4, 'talha': 5, 'friend': 2, 'abu': 0, 'huaraira': 3}


array(['abu', 'document', 'friend', 'huaraira', 'second', 'talha'],
      dtype=object)

In [14]:
# Dense view of the document-term matrix. The third row is all zeros because
# the third document preprocessed to an empty string.
print(X.toarray())
X.shape # 6 vocabulary terms -> 6 columns; 5 documents -> 5 rows; hence the shape is (5, 6)

[[0 1 0 0 0 0]
 [0 2 0 0 1 0]
 [0 0 0 0 0 0]
 [0 1 0 0 0 0]
 [1 0 1 1 0 1]]


(5, 6)

In [15]:
# 'is' is not in the fitted vocabulary, so only 'friend' and 'talha' are counted.
v.transform(['talha is friend']).toarray()

array([[0, 0, 1, 0, 0, 1]], dtype=int64)

In [16]:
import pandas as pd

# Load the news dataset from a JSON file in the working directory; the frame
# has two columns, `text` and `category`. Print its shape and preview the
# last rows.
df = pd.read_json('news_dataset.json')
print(df.shape)

df.tail()

(12695, 2)


Unnamed: 0,text,category
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS
12694,Your Customers Ignore Your Emails -- How Will ...,BUSINESS


In [17]:
# Class distribution: the categories are imbalanced, with SCIENCE the smallest.
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [18]:
# Undersample every category down to the size of the smallest one so the
# classes are balanced. The target size is derived from the data (it is the
# SCIENCE count, 1381, for the current dataset) instead of being hard-coded,
# so the cell keeps working if the dataset changes.
min_samples = int(df.category.value_counts().min())

# One fixed-seed sample per category; the same random_state is used for each
# so the draw is reproducible.
df_business, df_sports, df_crime, df_science = (
    df[df.category == label].sample(min_samples, random_state=2022)
    for label in ("BUSINESS", "SPORTS", "CRIME", "SCIENCE")
)

In [19]:
# Stack the four equally sized per-category samples row-wise (axis=0) into a
# single balanced frame, then confirm every category has the same count.
df_balanced = pd.concat(
    [df_business, df_sports, df_crime, df_science],
    axis=0,
)
df_balanced.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [20]:
# Small 10-row slice used for a quick vectorization demo.
# NOTE(review): this slices the original `df`, not `df_balanced` built above;
# confirm that is intended.
df_new=df.head(10)
df_new

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
5,Trump Pays Penalty For Ethically Questionable ...,BUSINESS
6,LIVE: Argentina vs. Iran,SPORTS
7,9 Tips for Making Your Blog Better in 2015 New...,BUSINESS
8,Man Arrested Over Threats To CNN: 'Fake News....,CRIME
9,Here's Why Coffee Makes You Have To Poop It hi...,SCIENCE


In [21]:
# Raw (un-preprocessed) `text` column of the 10-row sample; this Series is
# what gets vectorized next.
corpus_new = df_new.text
corpus_new


0    Watching Schrödinger's Cat Die University of C...
1       WATCH: Freaky Vortex Opens Up In Flooded Lake 
2    Entrepreneurs Today Don't Need a Big Budget to...
3    These Roads Could Recharge Your Electric Car A...
4    Civilian 'Guard' Fires Gun While 'Protecting' ...
5    Trump Pays Penalty For Ethically Questionable ...
6                            LIVE: Argentina vs. Iran 
7    9 Tips for Making Your Blog Better in 2015 New...
8    Man Arrested Over Threats To CNN:  'Fake News....
9    Here's Why Coffee Makes You Have To Poop It hi...
Name: text, dtype: object

In [26]:
# Fit a default CountVectorizer on the raw text of the 10-row news sample,
# build its document-term matrix, and print the learned term -> column-index
# mapping.
v = CountVectorizer()
X = v.fit_transform(corpus_new)

print(v.vocabulary_)

{'watching': 177, 'schrödinger': 147, 'cat': 27, 'die': 44, 'university': 167, 'of': 117, 'california': 22, 'berkeley': 14, 'physicists': 128, 'have': 79, 'for': 66, 'the': 156, 'first': 61, 'time': 160, 'showed': 148, 'that': 155, 'in': 85, 'fact': 57, 'it': 89, 'possible': 131, 'to': 162, 'follow': 65, 'metaphorical': 105, 'through': 159, 'whole': 180, 'process': 133, 'whether': 178, 'he': 80, 'lives': 98, 'or': 122, 'dies': 45, 'end': 53, 'watch': 176, 'freaky': 67, 'vortex': 172, 'opens': 121, 'up': 168, 'flooded': 63, 'lake': 93, 'entrepreneurs': 54, 'today': 163, 'don': 48, 'need': 111, 'big': 17, 'budget': 19, 'start': 151, 'wasn': 175, 'so': 150, 'many': 104, 'years': 183, 'ago': 4, 'starting': 152, 'new': 112, 'commerce': 35, 'business': 20, 'on': 120, 'internet': 86, 'was': 174, 'complex': 36, 'custom': 41, 'development': 43, 'project': 135, 'usually': 170, 'costing': 38, 'million': 108, 'dollars': 47, 'more': 110, 'now': 116, 'you': 184, 'can': 25, 'do': 46, 'free': 68, 'the

In [30]:
# Encode a query against the news vocabulary. Only three columns end up set:
# 27 ('cat'), 44 ('die') and 147 ('schrödinger'), matching the indices in the
# vocabulary printed above.
v.transform(['schrödinger is cat die']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)