In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [15]:
data = pd.read_csv('abcnews-date-text.csv')

In [45]:
data.head(1)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...


In [46]:
data.shape

(1244184, 2)

In [71]:
df = data.iloc[:10000,-1:]

# Bag-of-Words (BoW)

In [89]:
# Bag-of-Words (BoW)
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['headline_text'])
X_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# N-grams

In [90]:
# N-grams

bigram_vectorizer = CountVectorizer(ngram_range=(2, 3))
X_bigram = bigram_vectorizer.fit_transform(df['headline_text'])

#Generate bigrams
# Get the feature names, which represent the n-grams
feature_names = bigram_vectorizer.get_feature_names_out()

print('Feature names:\n',feature_names)
print()

# Print the generated n-grams
print(f"Generated {n}-grams:")
for gram in feature_names:
    print(gram)

Feature names:
 ['10 arrested' '10 arrested in' '10 awarded' ... 'zone with home'
 'zoos to' 'zoos to help']

Generated 2-grams:
10 arrested
10 arrested in
10 awarded
10 awarded with
10 billion
10 billion verdict
10 day
10 day strike
10 die
10 die as
10 goal
10 goal thriller
10 km
10 km of
10 man
10 man liverpool
10 man roma
10 man stuttgart
10 million
10 million in
10 of
10 of strike
10 point
10 point gap
10 points
10 points clear
10 teenagers
10 teenagers after
10 upper
10 upper house
10 year
10 year old
10 years
10 years jail
100 arrests
100 bin
100 bin ladens
100 break
100 break and
100 car
100 car pileup
100 cases
100 cases of
100 companies
100 houses
100 houses damaged
100 hype
100 hype free
100 kms
100 kms east
100 metres
100 metres at
100 missing
100 missing after
1000 iraqis
1000 iraqis killed
1000 over
1000 over pms
1000 palestinians
1000 palestinians forced
10000 and
10000 and marathon
10000 first
10000 first class
10000 homeless
100000 for
100000 for peace
100000 pakistanis

on web
on wednesday
on weekend
on wet
on wet roads
on whatmores
on whatmores future
on white
on white flags
on whyalla
on whyalla airlines
on winning
on winning not
on winning note
on with
on with gold
on with settlers
on with thrashing
on woods
on woods in
on world
on world indoor
once ban
once ban is
once threshold
once threshold met
oncologists resign
oncologists resign from
oncology service
oncology service wait
one arrest
one arrest at
one at
one at phillip
one cars
one cars arrive
one china
one china city
one day
one day and
one day final
one day jail
one day player
one day title
one dead
one dead as
one dead several
one dead three
one dead two
one defender
one game
one game away
one horse
one horse race
one hurt
one hurt in
one in
one in california
one in dubai
one in georgia
one in new
one in us
one killed
one killed after
one killed five
one last
one last chance
one last push
one last swim
one life
one marine
one marine dead
one match
one match ban
one match over
one million
o

# TF-IDF

In [74]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['headline_text'])

print('TF-IDF Matrix:\n',X_tfidf.toarray())

TF-IDF Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# One-Hot Encoding

In [81]:
# One-Hot Encoding (for illustration purposes, it might not be suitable for headlines)
# Encode categorical variables based on unique headline texts
encoder = OneHotEncoder()
headlines_encoded = encoder.fit_transform(df[['headline_text']]).toarray()
headlines_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])