In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df=pd.read_csv('abcnews-date-text.csv')
df

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1244179,20211231,two aged care residents die as state records 2...
1244180,20211231,victoria records 5;919 new cases and seven deaths
1244181,20211231,wa delays adopting new close contact definition
1244182,20211231,western ringtail possums found badly dehydrate...


In [8]:
text=df['headline_text'].head(1000)
text

0      aba decides against community broadcasting lic...
1         act fire witnesses must be aware of defamation
2         a g calls for infrastructure protection summit
3               air nz staff in aust strike for pay rise
4          air nz strike to affect australian travellers
                             ...                        
995                  conference to focus on tuna fishery
996                        council hosts farewell for mp
997                  council resists eba roster pressure
998                     customs house restoration opened
999                dam water levels still critically low
Name: headline_text, Length: 1000, dtype: object

# The distribution of articles across these topics

In [9]:
topic_distribution = df['headline_text'].str.extract(r'\b(\w+)\b', expand=False).value_counts()
print("\nDistribution of Articles Across Topics:")
print(topic_distribution)


Distribution of Articles Across Topics:
police           21074
man              18089
interview        12596
new              11616
the              10920
                 ...  
farsnworth           1
affordability        1
unseaworthy          1
pearcecoffee         1
truly                1
Name: headline_text, Length: 49532, dtype: int64


# Implement a Bag-of-Words (BoW) model using CountVectorizer

In [12]:
vectorizer=CountVectorizer()

x=vectorizer.fit_transform(text)
#get feature names(words)
feature_names=vectorizer.get_feature_names_out()
a=x.toarray()
#Display BOW-Matrix
print('Feature names:\n',feature_names)
print()
print('BOW Matrix:\n',a)

Feature names:
 ['10' '100th' '108' ... 'zealand' 'zimbabwe' 'zone']

BOW Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# The advantages and limitations of BoW in this contex

# Advantages:
BoW is easy to understand and implement.
The resulting feature vectors are interpretable, and you can see the importance of different words


# Limitations:
BoW disregards the order of words, losing information about the structure and context of the text.
The resulting feature vectors can be high-dimensional, especially with a large vocabulary
BoW often results in sparse matrices
No Semantics means BoW doesn't capture the semantic meaning of words

# Apply both unigram and bigram techniques and compare their effects on classification accuracy

In [16]:
import nltk
from nltk import ngrams
# Iterate through each sentence in the Series
for sentence in text:
    tokens = nltk.word_tokenize(sentence)
    n = 1
    bigrams = list(ngrams(tokens, n))
    
    print("Original sentence:",sentence)
    print(f"Generated {n}-grams: {bigrams}")
    print("\n")

Original sentence: aba decides against community broadcasting licence
Generated 1-grams: [('aba',), ('decides',), ('against',), ('community',), ('broadcasting',), ('licence',)]


Original sentence: act fire witnesses must be aware of defamation
Generated 1-grams: [('act',), ('fire',), ('witnesses',), ('must',), ('be',), ('aware',), ('of',), ('defamation',)]


Original sentence: a g calls for infrastructure protection summit
Generated 1-grams: [('a',), ('g',), ('calls',), ('for',), ('infrastructure',), ('protection',), ('summit',)]


Original sentence: air nz staff in aust strike for pay rise
Generated 1-grams: [('air',), ('nz',), ('staff',), ('in',), ('aust',), ('strike',), ('for',), ('pay',), ('rise',)]


Original sentence: air nz strike to affect australian travellers
Generated 1-grams: [('air',), ('nz',), ('strike',), ('to',), ('affect',), ('australian',), ('travellers',)]


Original sentence: ambitious olsson wins triple jump
Generated 1-grams: [('ambitious',), ('olsson',), ('wins'

In [17]:
import nltk
from nltk import ngrams
# Iterate through each sentence in the Series
for sentence in text:
    tokens = nltk.word_tokenize(sentence)
    n = 2
    bigrams = list(ngrams(tokens, n))
    
    print("Original sentence:",sentence)
    print(f"Generated {n}-grams: {bigrams}")
    print("\n")

Original sentence: aba decides against community broadcasting licence
Generated 2-grams: [('aba', 'decides'), ('decides', 'against'), ('against', 'community'), ('community', 'broadcasting'), ('broadcasting', 'licence')]


Original sentence: act fire witnesses must be aware of defamation
Generated 2-grams: [('act', 'fire'), ('fire', 'witnesses'), ('witnesses', 'must'), ('must', 'be'), ('be', 'aware'), ('aware', 'of'), ('of', 'defamation')]


Original sentence: a g calls for infrastructure protection summit
Generated 2-grams: [('a', 'g'), ('g', 'calls'), ('calls', 'for'), ('for', 'infrastructure'), ('infrastructure', 'protection'), ('protection', 'summit')]


Original sentence: air nz staff in aust strike for pay rise
Generated 2-grams: [('air', 'nz'), ('nz', 'staff'), ('staff', 'in'), ('in', 'aust'), ('aust', 'strike'), ('strike', 'for'), ('for', 'pay'), ('pay', 'rise')]


Original sentence: air nz strike to affect australian travellers
Generated 2-grams: [('air', 'nz'), ('nz', 'strike'

# Different N-gram ranges impact the performance of the classification model

In [18]:
labels = []
for i in range(1000):
    labels.append(i % 2)

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


df = pd.DataFrame({'text': text, 'label': labels})

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)


def train_and_evaluate(vectorizer, ngram_range):
 
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)

   
    classifier = MultinomialNB()
    classifier.fit(X_train_vectorized, y_train)

    
    predictions = classifier.predict(X_test_vectorized)

   
    accuracy = accuracy_score(y_test, predictions)

    print(f"N-gram Range: {ngram_range}, Accuracy: {accuracy:.2f}")


vectorizer = CountVectorizer(ngram_range=(1,1))  
train_and_evaluate(vectorizer, ngram_range=(1,1))

vectorizer = CountVectorizer(ngram_range=(1,2))  
train_and_evaluate(vectorizer, ngram_range=(1,2))

vectorizer = CountVectorizer(ngram_range=(1,3)) 
train_and_evaluate(vectorizer, ngram_range=(1,3))

N-gram Range: (1, 1), Accuracy: 0.47
N-gram Range: (1, 2), Accuracy: 0.47
N-gram Range: (1, 3), Accuracy: 0.47


# TF-IDF: Apply TF-IDF (Term Frequency-Inverse Document Frequency) to the text data

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize


for i in text:
    sent_token = sent_tokenize(i)

    for sent in sent_token:
        print(sent)
        print(len(sent))

    print()

    tfidf = TfidfVectorizer()
    x_tfidf = tfidf.fit_transform(sent_token)
    feature_names_tfidf = tfidf.get_feature_names_out()

    print('Feature Names(TF-IDF):\n', feature_names_tfidf)
    print()
    print('TF-IDF Matrix:\n', x_tfidf.toarray())

aba decides against community broadcasting licence
50

Feature Names(TF-IDF):
 ['aba' 'against' 'broadcasting' 'community' 'decides' 'licence']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
act fire witnesses must be aware of defamation
46

Feature Names(TF-IDF):
 ['act' 'aware' 'be' 'defamation' 'fire' 'must' 'of' 'witnesses']

TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]
a g calls for infrastructure protection summit
46

Feature Names(TF-IDF):
 ['calls' 'for' 'infrastructure' 'protection' 'summit']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
air nz staff in aust strike for pay rise
40

Feature Names(TF-IDF):
 ['air' 'aust' 'for' 'in' 'nz' 'pay' 'rise' 'staff' 'strike']

TF-IDF Matrix:
 [[0.33333333 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333
  0.33333333 0.33333333 0.33333333]]
air nz strike to affect australian travellers
45

Feature Names(T

Feature Names(TF-IDF):
 ['charged' 'cooma' 'man' 'murder' 'over']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
man fined after aboriginal tent embassy raid
44

Feature Names(TF-IDF):
 ['aboriginal' 'after' 'embassy' 'fined' 'man' 'raid' 'tent']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
man jailed over keno fraud
26

Feature Names(TF-IDF):
 ['fraud' 'jailed' 'keno' 'man' 'over']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
man with knife hijacks light plane
34

Feature Names(TF-IDF):
 ['hijacks' 'knife' 'light' 'man' 'plane' 'with']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
martin to lobby against losing nt seat in fed
45

Feature Names(TF-IDF):
 ['against' 'fed' 'in' 'lobby' 'losing' 'martin' 'nt' 'seat' 'to']

TF-IDF Matrix:
 [[0.33333333 0.33333333 0.33333333 0.33333333 0.33333333 0.33333333
  0.33333333 0.33333333 0.33333333]]
mas

Feature Names(TF-IDF):
 ['british' 'cheese' 'entomb' 'himself' 'in' 'magician' 'to']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
bungle leaves doctor waiting to practise
40

Feature Names(TF-IDF):
 ['bungle' 'doctor' 'leaves' 'practise' 'to' 'waiting']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
bushfire coronial inquiry winds up
34

Feature Names(TF-IDF):
 ['bushfire' 'coronial' 'inquiry' 'up' 'winds']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
bush thanks nato for support on turkey
38

Feature Names(TF-IDF):
 ['bush' 'for' 'nato' 'on' 'support' 'thanks' 'turkey']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
call for ambos help in wake of funding changes
46

Feature Names(TF-IDF):
 ['ambos' 'call' 'changes' 'for' 'funding' 'help' 'in' 'of' 'wake']

TF-IDF Matrix:
 [[0.33333333 0.33333333 0.33333333 0.33333

Feature Names(TF-IDF):
 ['armed' 'attempted' 'court' 'face' 'man' 'over' 'robbery' 'to']

TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]
man who whacked thatcher gets 3 months jail
43

Feature Names(TF-IDF):
 ['gets' 'jail' 'man' 'months' 'thatcher' 'whacked' 'who']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
maroney calls it quits
22

Feature Names(TF-IDF):
 ['calls' 'it' 'maroney' 'quits']

TF-IDF Matrix:
 [[0.5 0.5 0.5 0.5]]
medal recognises islander war efforts
37

Feature Names(TF-IDF):
 ['efforts' 'islander' 'medal' 'recognises' 'war']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
meetings to consider glenelg river future
41

Feature Names(TF-IDF):
 ['consider' 'future' 'glenelg' 'meetings' 'river' 'to']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
melbourne man receives massive compo payout after

Feature Names(TF-IDF):
 ['child' 'court' 'faces' 'over' 'prostitution' 'woman']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
yacht loses mast in trawler incident
36

Feature Names(TF-IDF):
 ['in' 'incident' 'loses' 'mast' 'trawler' 'yacht']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
zimbabwe bars british reporter from world cup
45

Feature Names(TF-IDF):
 ['bars' 'british' 'cup' 'from' 'reporter' 'world' 'zimbabwe']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
accc too timid in petrol price investigations
45

Feature Names(TF-IDF):
 ['accc' 'in' 'investigations' 'petrol' 'price' 'timid' 'too']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
action wanted to lower indigenous unemployment rate
51

Feature Names(TF-IDF):
 ['action' 'indigenous' 'lower' 'rate' 'to' 'unemployment' 'wanted']

TF-IDF Ma

Feature Names(TF-IDF):
 ['administrator' 'appointed' 'council' 'dismissed' 'land']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
langer named one day player of year
35

Feature Names(TF-IDF):
 ['day' 'langer' 'named' 'of' 'one' 'player' 'year']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
liverpool win but lazio held by krakow
38

Feature Names(TF-IDF):
 ['but' 'by' 'held' 'krakow' 'lazio' 'liverpool' 'win']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
llewellyn not impressed by patterson so show
44

Feature Names(TF-IDF):
 ['by' 'impressed' 'llewellyn' 'not' 'patterson' 'show' 'so']

TF-IDF Matrix:
 [[0.37796447 0.37796447 0.37796447 0.37796447 0.37796447 0.37796447
  0.37796447]]
longreach water restrictions to remain
38

Feature Names(TF-IDF):
 ['longreach' 'remain' 'restrictions' 'to' 'water']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.44721

Feature Names(TF-IDF):
 ['coast' 'input' 'into' 'issues' 'offer' 'senators']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
several dead in fire at rhode island nightclub
46

Feature Names(TF-IDF):
 ['at' 'dead' 'fire' 'in' 'island' 'nightclub' 'rhode' 'several']

TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]
shire says planning policy not compromised
42

Feature Names(TF-IDF):
 ['compromised' 'not' 'planning' 'policy' 'says' 'shire']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
slapping paedophile to have arm amputated
41

Feature Names(TF-IDF):
 ['amputated' 'arm' 'have' 'paedophile' 'slapping' 'to']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
state territory health ministers arrive for meeting
51

Feature Names(TF-IDF):
 ['arrive' 'for' 'health' 'meeting' 'ministers' 'state' 'territory']

TF-IDF M

Feature Names(TF-IDF):
 ['backs' 'campaign' 'for' 'fresh' 'iraq' 'japan' 'us' 'vote']

TF-IDF Matrix:
 [[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]
johnson pittman excel in canberra
33

Feature Names(TF-IDF):
 ['canberra' 'excel' 'in' 'johnson' 'pittman']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
kangaroos thump essendon hawks edge saints
42

Feature Names(TF-IDF):
 ['edge' 'essendon' 'hawks' 'kangaroos' 'saints' 'thump']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
kookaburras force dutch to draw
31

Feature Names(TF-IDF):
 ['draw' 'dutch' 'force' 'kookaburras' 'to']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
labor targets council golden handshakes
39

Feature Names(TF-IDF):
 ['council' 'golden' 'handshakes' 'labor' 'targets']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
lara jumps to warnes defence
28

Feature Na

Feature Names(TF-IDF):
 ['best' 'hockeyroos' 'last' 'save' 'till']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
hotels association to fight smoking ban
39

Feature Names(TF-IDF):
 ['association' 'ban' 'fight' 'hotels' 'smoking' 'to']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]
howell extends la lead
22

Feature Names(TF-IDF):
 ['extends' 'howell' 'la' 'lead']

TF-IDF Matrix:
 [[0.5 0.5 0.5 0.5]]
hussain hails special england performance
41

Feature Names(TF-IDF):
 ['england' 'hails' 'hussain' 'performance' 'special']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
iaea says iraqi cooperation improving
37

Feature Names(TF-IDF):
 ['cooperation' 'iaea' 'improving' 'iraqi' 'says']

TF-IDF Matrix:
 [[0.4472136 0.4472136 0.4472136 0.4472136 0.4472136]]
iraqi minister says protests prompt wheat decision
50

Feature Names(TF-IDF):
 ['decision' 'iraqi' 'minister' 'prompt' 'protests' 'says' 'wheat']

TF-

Feature Names(TF-IDF):
 ['critically' 'dam' 'levels' 'low' 'still' 'water']

TF-IDF Matrix:
 [[0.40824829 0.40824829 0.40824829 0.40824829 0.40824829 0.40824829]]


# Compare the results of TF-IDF with the BoW approach

# Term Frequency-Inverse Document Frequency (TF-IDF):
1. Term Frequency (TF):

Measures how frequently a term appears in a document.
Calculated as the ratio of the number of occurrences of a term to the total number of terms in a document.
Emphasizes the importance of a term within a specific document.
2. Inverse Document Frequency (IDF):

Measures how important a term is across all documents in a corpus.
Calculated as the logarithm of the ratio of the total number of documents to the number of documents containing the term.
Emphasizes the rarity of a term across the entire corpus.


# Discriminative Power: Highlights terms that distinguish one document from others.
Comparison with Bag-of-Words (BoW):
1. BoW Approach:

Represents a document as an unordered set of words.
Each word is assigned a count, and the order is disregarded.
Simple and effective but does not capture the importance or context of words.



# Compare the results of TF-IDF with the BoW approach
1. TF-IDF vs. BoW:

Importance: TF-IDF considers both term frequency and uniqueness, giving more weight to terms that are both frequent in a document and rare in the corpus.
Context: TF-IDF captures the context and importance of words, whereas BoW only considers word frequencies.
Sparse Representation: TF-IDF tends to result in a sparser representation compared to BoW, as it downweights common terms.
Dimensionality: TF-IDF often leads to a lower-dimensional representation compared to BoW, especially when dealing with large corpora.
Use Cases:

# One-Hot Encoding: Investigate the application of One-Hot Encoding to encode categorical

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import re
import nltk


processed_headlines = []
for text in text:
    headline = text.lower() 
    headline = re.sub(r'\W', ' ', text)  
    headline = re.sub(r'\s+', ' ', text)  
    processed_headlines.append(text)


count_vectorizer = CountVectorizer(binary=True)
x_one_hot = count_vectorizer.fit_transform(processed_headlines[:6])  
feature_names_one_hot = count_vectorizer.get_feature_names_out()

print('Feature Names (One-Hot Encoding):\n', feature_names_one_hot)
print()
print('One-Hot Encoded Matrix:\n', x_one_hot.toarray())

Feature Names (One-Hot Encoding):
 ['aba' 'act' 'affect' 'against' 'air' 'ambitious' 'aust' 'australian'
 'aware' 'be' 'broadcasting' 'calls' 'community' 'decides' 'defamation'
 'fire' 'for' 'in' 'infrastructure' 'jump' 'licence' 'must' 'nz' 'of'
 'olsson' 'pay' 'protection' 'rise' 'staff' 'strike' 'summit' 'to'
 'travellers' 'triple' 'wins' 'witnesses']

One-Hot Encoded Matrix:
 [[1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0]]
