In [1]:
import pandas as pd

**EDA(Know the datset)**:

In [3]:
# Load CSV file into a Pandas DataFrame
df = pd.read_csv('/content/BBC News Train.csv')

# Print the contents of the DataFrame
print(df.head())

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business


In [4]:
#  Count number of documents in each class/category
print(df["Category"].value_counts())

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64


In [5]:
# Count number of documents in each class
category_counts = {}
for category in df["Category"]:
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Print dictionary of counts
print(category_counts)

{'business': 336, 'tech': 261, 'politics': 274, 'sport': 346, 'entertainment': 273}


In [6]:
#no of document in dataset 
print(len(df))

1490


**1.Preprocessing the dataset:**

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
 df1 =df

In [9]:
# Text cleaning
stop_words = set(stopwords.words('english'))
#converting to lowercase
df1['Text'] = df1['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
#tokenize
df1['Text'] = df1['Text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

# Applying Stemming
stemmer = PorterStemmer()
df1['Text'] = df1['Text'].apply(lambda x: ' '.join([stemmer.stem(word) for word in word_tokenize(x)]))

# Print the preprocessed text column
print(df1['Text'].head())

0    worldcom exboss launch defenc lawyer defend fo...
1    german busi confid slide german busi confid fe...
2    bbc poll indic econom gloom citizen major nati...
3    lifestyl govern mobil choic faster better funk...
4    enron boss 168m payout eighteen former enron d...
Name: Text, dtype: object


**2.Split the BBC train dataset into training and testing sets.Use a 70:30 split for the training and testing sets, respectively.**

In [10]:
from sklearn.model_selection import train_test_split

train_df1, test_df1 = train_test_split(df1, test_size=0.3, random_state=42)
print(len(train_df1))
print(len(test_df1))

1043
447


**3. Training the Naive Bayes classifier with TF-ICF:**

In [11]:
import pandas as pd
from collections import defaultdict

# Create dictionary tf_dict1
tf_dict1 = defaultdict(lambda: defaultdict(int))
for i, row in train_df1.iterrows():
    category = row['Category']
    text = row['Text']
    words = (text.split())
    for word in words:
        tf_dict1[category][word] += 1

# Fill in missing terms with 0 count
all_terms = set()
for cat in tf_dict1.values():
    all_terms.update(cat.keys())
for cat in tf_dict1.values():
    for term in all_terms:
        if term not in cat:
            cat[term] = 0

# Print the resulting dictionary
print(tf_dict1)

defaultdict(<function <lambda> at 0x7f23a4da8d30>, {'entertainment': defaultdict(<class 'int'>, {'willi': 10, 'sue': 2, 'movi': 84, 'injuri': 6, 'actor': 161, 'bruce': 8, 'su': 8, 'revolut': 3, 'studio': 35, 'said': 426, 'suffer': 12, 'make': 87, 'tear': 5, 'sun': 3, 'seek': 13, 'medic': 5, 'expens': 4, 'hit': 71, 'head': 16, 'firework': 1, 'film': 439, '2002': 21, 'produc': 79, 'firm': 6, 'lawsuit': 7, 'star': 227, 'endur': 8, 'mental': 2, 'physic': 8, 'result': 10, 'alleg': 12, 'incid': 8, 'abl': 15, 'comment': 9, 'pend': 1, 'litig': 1, 'spokesman': 23, 'sean': 2, 'duda': 1, 'play': 128, 'us': 178, 'militari': 1, 'command': 1, 'disobey': 1, 'order': 9, 'tri': 23, 'help': 35, 'save': 7, 'doctor': 6, 'patient': 1, 'trap': 2, 'nigerian': 1, 'jungl': 7, 'direct': 31, 'antoin': 1, 'fuqua': 1, 'poorli': 2, 'receiv': 42, 'critic': 48, 'perform': 108, 'strongli': 4, 'box': 62, 'offic': 54, 'accord': 20, 'fire': 6, 'explos': 4, 'part': 34, 'special': 28, 'effect': 10, 'known': 16, 'squib': 2,

In [12]:
# create a new dictionary to store the term and class count
cf_dict1 = {}

# iterate through the nested dictionaries and update term_class_count
for class_key, term_dict in tf_dict1.items():
    for term_key, term_count in term_dict.items():
        if term_key in cf_dict1 and term_count!=0:
            cf_dict1[term_key] += 1
        else:
            cf_dict1[term_key] = 1

# print the resulting dictionary
print(cf_dict1)

{'willi': 3, 'sue': 1, 'movi': 1, 'injuri': 5, 'actor': 1, 'bruce': 3, 'su': 1, 'revolut': 1, 'studio': 1, 'said': 5, 'suffer': 5, 'make': 5, 'tear': 4, 'sun': 3, 'seek': 5, 'medic': 5, 'expens': 5, 'hit': 5, 'head': 5, 'firework': 1, 'film': 1, '2002': 5, 'produc': 5, 'firm': 5, 'lawsuit': 3, 'star': 5, 'endur': 3, 'mental': 2, 'physic': 5, 'result': 5, 'alleg': 5, 'incid': 2, 'abl': 5, 'comment': 5, 'pend': 1, 'litig': 1, 'spokesman': 5, 'sean': 3, 'duda': 1, 'play': 5, 'us': 5, 'militari': 1, 'command': 5, 'disobey': 1, 'order': 5, 'tri': 5, 'help': 5, 'save': 5, 'doctor': 4, 'patient': 4, 'trap': 2, 'nigerian': 1, 'jungl': 1, 'direct': 5, 'antoin': 1, 'fuqua': 1, 'poorli': 4, 'receiv': 5, 'critic': 5, 'perform': 5, 'strongli': 5, 'box': 5, 'offic': 5, 'accord': 5, 'fire': 5, 'explos': 2, 'part': 5, 'special': 5, 'effect': 5, 'known': 5, 'squib': 1, 'intend': 5, 'simul': 1, 'appear': 5, 'bullet': 1, 'strike': 5, 'ground': 5, 'extrem': 5, 'emot': 5, 'pain': 4, 'specif': 5, 'detail': 

In [13]:
len(cf_dict1)

16706

In [14]:
import math

In [15]:
# Step 2: Calculate inverse-class frequency (ICF) for each term
N = len(set(train_df1['Category'])) #as there are 5 classes
icf_dict1 = {term: math.log10(N / cf) for term, cf in cf_dict1.items()}

In [16]:
print(icf_dict1)

{'willi': 0.22184874961635637, 'sue': 0.6989700043360189, 'movi': 0.6989700043360189, 'injuri': 0.0, 'actor': 0.6989700043360189, 'bruce': 0.22184874961635637, 'su': 0.6989700043360189, 'revolut': 0.6989700043360189, 'studio': 0.6989700043360189, 'said': 0.0, 'suffer': 0.0, 'make': 0.0, 'tear': 0.09691001300805642, 'sun': 0.22184874961635637, 'seek': 0.0, 'medic': 0.0, 'expens': 0.0, 'hit': 0.0, 'head': 0.0, 'firework': 0.6989700043360189, 'film': 0.6989700043360189, '2002': 0.0, 'produc': 0.0, 'firm': 0.0, 'lawsuit': 0.22184874961635637, 'star': 0.0, 'endur': 0.22184874961635637, 'mental': 0.3979400086720376, 'physic': 0.0, 'result': 0.0, 'alleg': 0.0, 'incid': 0.3979400086720376, 'abl': 0.0, 'comment': 0.0, 'pend': 0.6989700043360189, 'litig': 0.6989700043360189, 'spokesman': 0.0, 'sean': 0.22184874961635637, 'duda': 0.6989700043360189, 'play': 0.0, 'us': 0.0, 'militari': 0.6989700043360189, 'command': 0.0, 'disobey': 0.6989700043360189, 'order': 0.0, 'tri': 0.0, 'help': 0.0, 'save':

In [17]:
classes=set(df1['Category'])
print(classes)

{'business', 'politics', 'entertainment', 'sport', 'tech'}


In [18]:
import numpy as np
import pandas as pd
tf_icf1=np.zeros((len(set(train_df1['Category'])),len(cf_dict1.keys())))
tf_icf1=pd.DataFrame(tf_icf1)
tf_icf1.columns=cf_dict1.keys()
tf_icf1.index=set(train_df1['Category'])

In [19]:
# Step 3: Calculate TF-ICF score for each term in each category
for c,value in tf_dict1.items():
  for term,termf in value.items():
    tf_icf1.loc[c][term]=termf*icf_dict1[term]

Probability of term given category

In [20]:
display(tf_icf1)

Unnamed: 0,willi,sue,movi,injuri,actor,bruce,su,revolut,studio,said,...,hanley,a717,summari,fractur,lebeouf,proven,frederick,phonein,brownit,unflapp
business,0.221849,6.9897,0.0,0.0,0.69897,0.665546,4.19382,0.0,2.09691,0.0,...,0.0,0.69897,0.0,0.0,0.0,0.0,0.69897,0.0,0.0,0.0
politics,0.0,0.0,0.0,0.0,0.69897,0.0,1.39794,0.69897,0.69897,0.0,...,0.0,0.0,8.38764,0.39794,0.0,0.39794,0.0,0.0,0.69897,0.0
entertainment,2.218487,1.39794,58.71348,0.0,112.534171,1.77479,5.59176,2.09691,24.46395,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sport,0.221849,0.0,0.0,0.0,0.0,0.443697,0.0,0.0,0.0,0.0,...,0.39794,0.0,0.0,0.79588,0.39794,0.39794,0.0,0.39794,0.0,0.39794
tech,0.0,2.79588,30.75468,0.0,2.09691,0.221849,3.49485,7.68867,13.9794,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
pred=np.empty(len(test_df1), dtype="str")
pred=pd.DataFrame(pred)
pred.index=test_df1.loc[:,'ArticleId']
pred.columns={'category'}
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,
1360,
302,
864,
2184,
...,...
1118,
2219,
559,
2166,


In [22]:
#Calculate the probability of each category based on the frequency of documents
#in the training set that belong to that category.
# Calculate prior probability
total_docs = len(train_df1)
prior_prob = {}
for category, count in category_counts.items():
    prior_prob[category] = count / total_docs

# Print dictionary of prior probability
print(prior_prob)

{'business': 0.3221476510067114, 'tech': 0.25023969319271333, 'politics': 0.26270373921380635, 'sport': 0.3317353787152445, 'entertainment': 0.26174496644295303}


**4.Testing the Naive Bayes classifier with TF-ICF: Use the testing set to evaluate the performance of the classifier.**

In [26]:
for i in range(len(test_df1)):
      nb=np.zeros(5)
      nb=pd.DataFrame(nb)
      nb.index=tf_icf1.index
      for s in test_df1.iloc[i][1].split(' '):
        if(s in tf_icf1.columns):
          if(tf_icf1.loc['entertainment'].sum(axis=0)>0 and tf_icf1.loc['entertainment'][s]>0):
              nb.loc['entertainment']+=math.log10((1+tf_icf1.loc['entertainment'][s])/(tf_icf1.loc['entertainment'].sum(axis=0)+len(cf_dict1)))
          if(tf_icf1.loc['sport'].sum(axis=0)>0 and tf_icf1.loc['sport'][s]>0):   
              nb.loc['sport']+=math.log10((1+tf_icf1.loc['sport'][s])/(tf_icf1.loc['sport'].sum(axis=0)+len(cf_dict1)))
          if(tf_icf1.loc['tech'].sum(axis=0)>0 and tf_icf1.loc['tech'][s]>0):    
              nb.loc['tech']+=math.log10((1+tf_icf1.loc['tech'][s])/(tf_icf1.loc['tech'].sum(axis=0)+len(cf_dict1)))
          if(tf_icf1.loc['business'].sum(axis=0)>0 and tf_icf1.loc['business'][s]>0):    
              nb.loc['business']+=math.log10((1+tf_icf1.loc['business'][s])/(tf_icf1.loc['business'].sum(axis=0)+len(cf_dict1)))
          if(tf_icf1.loc['politics'].sum(axis=0)>0 and tf_icf1.loc['politics'][s]>0):    
              nb.loc['politics']+=math.log10((1+tf_icf1.loc['politics'][s])/(tf_icf1.loc['politics'].sum(axis=0)+len(cf_dict1)))

      
      nb.loc['entertainment']+=math.log10(prior_prob['entertainment'])
      nb.loc['sport']+=math.log10(prior_prob['sport'])
      nb.loc['tech']+=math.log10(prior_prob['tech'])
      nb.loc['business']+=math.log10(prior_prob['business'])
      nb.loc['politics']+=math.log10(prior_prob['politics'])
      x=nb[[0]].idxmin()[0]
      pred.iloc[i][0]=x

In [28]:
#display the prediction of classes for each document in testset 
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,entertainment
1360,politics
302,politics
864,entertainment
2184,entertainment
...,...
1118,business
2219,sport
559,sport
2166,entertainment


In [29]:
#display the actual value of classes for each document in test
print(df1[2:])

      ArticleId                                               Text  \
2          1101  bbc poll indic econom gloom citizen major nati...   
3          1976  lifestyl govern mobil choic faster better funk...   
4           917  enron boss 168m payout eighteen former enron d...   
5          1582  howard truant play snooker conserv leader mich...   
6           651  wale silent grand slam talk rhi william say wa...   
...         ...                                                ...   
1485        857  doubl evict big brother model capric holbi cit...   
1486        325  dj doubl act revamp chart show dj duo jk joel ...   
1487       1590  weak dollar hit reuter revenu media group reut...   
1488       1587  appl ipod famili expand market appl expand ipo...   
1489        538  santi worm make unwelcom visit thousand websit...   

           Category  
2          business  
3              tech  
4          business  
5          politics  
6             sport  
...             ...  
1485 

**Classification Report on TF-ICF using Stemming in preprocessing step and split the train-test in ratio of 70:30**

In [30]:
from sklearn.metrics import classification_report
report = classification_report(pred,test_df1.loc[:,'Category'] )
print(report)

               precision    recall  f1-score   support

     business       0.81      0.92      0.86        96
entertainment       0.96      0.75      0.84       102
     politics       0.81      0.92      0.86        76
        sport       0.96      0.99      0.97        98
         tech       0.85      0.83      0.84        75

     accuracy                           0.88       447
    macro avg       0.88      0.88      0.88       447
 weighted avg       0.89      0.88      0.88       447



**5. Improving the classifier:**

Different preprocessing techniques such as lemmetization and parameters such as train-test different splits to improve the performance of the classifier.

In [46]:
df2=df

In [47]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd

# Initialize the WordNetLemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Define the function to clean and lemmatize the text
def clean_and_lemmatize(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the lemmatized words back into a single string
    lemmatized_text = ' '.join(lemmatized_words)
    
    return lemmatized_text

# Apply the clean_and_lemmatize function to the "Text" column and create a new column with the cleaned and lemmatized text
df2['Text'] = df2['Text'].apply(clean_and_lemmatize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
df2.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom exboss launch defenc lawyer defend fo...,business
1,154,german busi confid slide german busi confid fe...,business
2,1101,bbc poll indic econom gloom citizen major nati...,business
3,1976,lifestyl govern mobil choic faster better funk...,tech
4,917,enron bos 168m payout eighteen former enron di...,business


In [49]:
#train test split on 80:20 
train_df2, test_df2 = train_test_split(df2, test_size=0.2, random_state=42)
print(len(train_df2))
print(len(test_df2))

1192
298


In [50]:
import pandas as pd
from collections import defaultdict

# Create dictionary tf_dict1
tf_dict2 = defaultdict(lambda: defaultdict(int))
for i, row in train_df2.iterrows():
    category = row['Category']
    text = row['Text']
    words = (text.split())
    for word in words:
        tf_dict2[category][word] += 1

# Fill in missing terms with 0 count
all_terms = set()
for cat in tf_dict2.values():
    all_terms.update(cat.keys())
for cat in tf_dict2.values():
    for term in all_terms:
        if term not in cat:
            cat[term] = 0

# Print the resulting dictionary
print(tf_dict2)

defaultdict(<function <lambda> at 0x7f23aead8af0>, {'sport': defaultdict(<class 'int'>, {'fume': 2, 'robinson': 70, 'blast': 6, 'offici': 29, 'england': 281, 'coach': 143, 'andi': 52, 'said': 511, 'livid': 3, 'side': 173, 'deni': 37, 'two': 219, 'tri': 121, 'sunday': 59, '1913': 7, 'six': 124, 'nation': 121, 'loss': 23, 'ireland': 152, 'dublin': 22, 'mark': 66, 'cueto': 14, 'firsthalf': 4, 'effort': 29, 'rule': 45, 'offsid': 7, 'refere': 59, 'spurn': 2, 'tv': 4, 'replay': 17, 'crash': 15, 'die': 4, 'minut': 146, 'absolut': 10, 'spit': 1, 'cost': 6, 'told': 88, 'bbc': 46, 'sport': 77, 'got': 96, 'go': 181, 'back': 210, 'technolog': 5, 'know': 97, 'south': 43, 'african': 16, 'jonathan': 10, 'kaplan': 13, 'ahead': 43, 'charli': 14, 'hodgson': 36, 'flyhalf': 20, 'hoist': 1, 'crossfield': 1, 'kick': 62, 'sale': 18, 'wing': 17, 'gather': 7, 'declin': 9, 'chanc': 106, 'consult': 4, 'fourth': 35, 'josh': 9, 'lewsey': 16, 'took': 66, 'ball': 79, 'irish': 43, 'line': 52, 'pile': 3, 'bodi': 26, '

In [51]:
# create a new dictionary to store the term and class count
cf_dict2 = {}

# iterate through the nested dictionaries and update term_class_count
for class_key, term_dict in tf_dict2.items():
    for term_key, term_count in term_dict.items():
        if term_key in cf_dict2 and term_count!=0:
            cf_dict2[term_key] += 1
        else:
            cf_dict2[term_key] = 1

# print the resulting dictionary
print(cf_dict2)

{'fume': 1, 'robinson': 5, 'blast': 3, 'offici': 5, 'england': 5, 'coach': 1, 'andi': 1, 'said': 5, 'livid': 1, 'side': 5, 'deni': 5, 'two': 5, 'tri': 5, 'sunday': 5, '1913': 1, 'six': 5, 'nation': 5, 'loss': 2, 'ireland': 5, 'dublin': 3, 'mark': 5, 'cueto': 1, 'firsthalf': 1, 'effort': 5, 'rule': 5, 'offsid': 1, 'refere': 1, 'spurn': 1, 'tv': 5, 'replay': 2, 'crash': 3, 'die': 5, 'minut': 5, 'absolut': 5, 'spit': 2, 'cost': 5, 'told': 5, 'bbc': 5, 'sport': 5, 'got': 5, 'go': 5, 'back': 5, 'technolog': 5, 'know': 5, 'south': 5, 'african': 1, 'jonathan': 5, 'kaplan': 1, 'ahead': 5, 'charli': 1, 'hodgson': 1, 'flyhalf': 1, 'hoist': 1, 'crossfield': 1, 'kick': 1, 'sale': 5, 'wing': 3, 'gather': 5, 'declin': 5, 'chanc': 5, 'consult': 5, 'fourth': 1, 'josh': 1, 'lewsey': 1, 'took': 5, 'ball': 4, 'irish': 5, 'line': 5, 'pile': 5, 'bodi': 5, 'could': 5, 'gamewin': 1, 'think': 5, 'score': 4, 'perfectli': 2, 'legal': 5, 'gone': 5, 'video': 5, 'use': 5, 'still': 5, 'work': 5, 'look': 5, 'disappo

In [52]:
len(cf_dict2)

17769

In [53]:
import math
# Step 2: Calculate inverse-class frequency (ICF) for each term
N = len(set(train_df1['Category'])) #as there are 5 classes
icf_dict2 = {term: math.log10(N / cf) for term, cf in cf_dict2.items()}

In [54]:
print(icf_dict2)

{'fume': 0.6989700043360189, 'robinson': 0.0, 'blast': 0.22184874961635637, 'offici': 0.0, 'england': 0.0, 'coach': 0.6989700043360189, 'andi': 0.6989700043360189, 'said': 0.0, 'livid': 0.6989700043360189, 'side': 0.0, 'deni': 0.0, 'two': 0.0, 'tri': 0.0, 'sunday': 0.0, '1913': 0.6989700043360189, 'six': 0.0, 'nation': 0.0, 'loss': 0.3979400086720376, 'ireland': 0.0, 'dublin': 0.22184874961635637, 'mark': 0.0, 'cueto': 0.6989700043360189, 'firsthalf': 0.6989700043360189, 'effort': 0.0, 'rule': 0.0, 'offsid': 0.6989700043360189, 'refere': 0.6989700043360189, 'spurn': 0.6989700043360189, 'tv': 0.0, 'replay': 0.3979400086720376, 'crash': 0.22184874961635637, 'die': 0.0, 'minut': 0.0, 'absolut': 0.0, 'spit': 0.3979400086720376, 'cost': 0.0, 'told': 0.0, 'bbc': 0.0, 'sport': 0.0, 'got': 0.0, 'go': 0.0, 'back': 0.0, 'technolog': 0.0, 'know': 0.0, 'south': 0.0, 'african': 0.6989700043360189, 'jonathan': 0.0, 'kaplan': 0.6989700043360189, 'ahead': 0.0, 'charli': 0.6989700043360189, 'hodgson': 

In [56]:
classes=set(df2['Category'])
print(classes)

{'business', 'politics', 'entertainment', 'sport', 'tech'}


In [57]:
import numpy as np
import pandas as pd
tf_icf2=np.zeros((len(set(train_df2['Category'])),len(cf_dict2.keys())))
tf_icf2=pd.DataFrame(tf_icf2)
tf_icf2.columns=cf_dict2.keys()
tf_icf2.index=set(train_df2['Category'])

In [58]:
# Step 3: Calculate TF-ICF score for each term in each category
for c,value in tf_dict2.items():
  for term,termf in value.items():
    tf_icf2.loc[c][term]=termf*icf_dict2[term]

In [59]:
display(tf_icf2)
#this is probability of term given category

Unnamed: 0,fume,robinson,blast,offici,england,coach,andi,said,livid,side,...,plani,mcguigan,gmac,cleveland,a717,summari,sao,frederick,stairway,brownit
business,0.69897,0.0,0.0,0.0,0.0,0.0,0.69897,0.0,0.0,0.0,...,0.0,0.69897,1.39794,0.0,0.69897,0.0,0.0,0.69897,0.0,0.0
politics,0.0,0.0,0.0,0.0,0.0,0.69897,0.69897,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.38764,0.0,0.0,0.0,0.69897
entertainment,0.0,0.0,1.331092,0.0,0.0,2.09691,6.29073,0.0,0.0,0.0,...,0.69897,0.0,0.0,0.69897,0.0,0.0,0.69897,0.0,0.69897,0.0
sport,1.39794,0.0,1.331092,0.0,0.0,99.952711,36.34644,0.0,2.09691,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tech,0.0,0.0,1.109244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
pred=np.empty(len(test_df2), dtype="str")
pred=pd.DataFrame(pred)
pred.index=test_df2.loc[:,'ArticleId']
pred.columns={'category'}
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,
1360,
302,
864,
2184,
...,...
126,
1533,
1886,
1701,


In [61]:
# Calculate prior probability
total_docs = len(train_df2)
prior_prob = {}
for category, count in category_counts.items():
    prior_prob[category] = count / total_docs

# Print dictionary of prior probability
print(prior_prob)

{'business': 0.28187919463087246, 'tech': 0.21895973154362416, 'politics': 0.22986577181208054, 'sport': 0.2902684563758389, 'entertainment': 0.22902684563758388}


In [65]:
for i in range(len(test_df2)):
      nb=np.zeros(5)
      nb=pd.DataFrame(nb)
      nb.index=tf_icf2.index
      for s in test_df2.iloc[i][1].split(' '):
        if(s in tf_icf2.columns):
          if(tf_icf2.loc['entertainment'].sum(axis=0)>0 and tf_icf2.loc['entertainment'][s]>0):
              nb.loc['entertainment']+=math.log10((1+tf_icf2.loc['entertainment'][s])/(tf_icf2.loc['entertainment'].sum(axis=0)+len(cf_dict2)))
          if(tf_icf2.loc['sport'].sum(axis=0)>0 and tf_icf2.loc['sport'][s]>0):   
              nb.loc['sport']+=math.log10((1+tf_icf2.loc['sport'][s])/(tf_icf2.loc['sport'].sum(axis=0)+len(cf_dict2)))
          if(tf_icf2.loc['tech'].sum(axis=0)>0 and tf_icf2.loc['tech'][s]>0):    
              nb.loc['tech']+=math.log10((1+tf_icf2.loc['tech'][s])/(tf_icf2.loc['tech'].sum(axis=0)+len(cf_dict2)))
          if(tf_icf2.loc['business'].sum(axis=0)>0 and tf_icf2.loc['business'][s]>0):    
              nb.loc['business']+=math.log10((1+tf_icf2.loc['business'][s])/(tf_icf2.loc['business'].sum(axis=0)+len(cf_dict2)))
          if(tf_icf2.loc['politics'].sum(axis=0)>0 and tf_icf2.loc['politics'][s]>0):    
              nb.loc['politics']+=math.log10((1+tf_icf2.loc['politics'][s])/(tf_icf2.loc['politics'].sum(axis=0)+len(cf_dict2)))


      nb.loc['entertainment']+=math.log10(prior_prob['entertainment'])
      nb.loc['sport']+=math.log10(prior_prob['sport'])
      nb.loc['tech']+=math.log10(prior_prob['tech'])
      nb.loc['business']+=math.log10(prior_prob['business'])
      nb.loc['politics']+=math.log10(prior_prob['politics'])
      x=nb[[0]].idxmin()[0]
      pred.iloc[i][0]=x

In [66]:
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,entertainment
1360,politics
302,politics
864,tech
2184,tech
...,...
126,business
1533,business
1886,entertainment
1701,tech


**Classification report on TF-ICF with 80:20 split of train-test and performing lemmetization in preprocessing.**

In [67]:
from sklearn.metrics import classification_report
report2 = classification_report(pred,test_df2.loc[:,'Category'] )
print(report2)

               precision    recall  f1-score   support

     business       0.93      0.92      0.93        76
entertainment       0.93      0.90      0.91        48
     politics       0.93      0.96      0.95        54
        sport       0.98      0.97      0.98        64
         tech       0.90      0.93      0.91        56

     accuracy                           0.94       298
    macro avg       0.94      0.94      0.94       298
 weighted avg       0.94      0.94      0.94       298



**Try using different types of features such as  TF-IDF weights.**

In [68]:
import pandas as pd
from collections import defaultdict

# Sample data

# Create dictionary of dictionaries
tf_dict = defaultdict(lambda: defaultdict(int))
for i, row in train_df2.iterrows():
    category = row['Category']
    text = row['Text']
    words = (text.split())
    for word in words:
        tf_dict[category][word] += 1

# Fill in missing terms with 0 count
all_terms = set()
for cat in tf_dict.values():
    all_terms.update(cat.keys())
for cat in tf_dict.values():
    for term in all_terms:
        if term not in cat:
            cat[term] = 0

# Print the resulting dictionary
print(tf_dict)

defaultdict(<function <lambda> at 0x7f23a4ec5a60>, {'sport': defaultdict(<class 'int'>, {'fume': 2, 'robinson': 70, 'blast': 6, 'offici': 29, 'england': 281, 'coach': 143, 'andi': 52, 'said': 511, 'livid': 3, 'side': 173, 'deni': 37, 'two': 219, 'tri': 121, 'sunday': 59, '1913': 7, 'six': 124, 'nation': 121, 'loss': 23, 'ireland': 152, 'dublin': 22, 'mark': 66, 'cueto': 14, 'firsthalf': 4, 'effort': 29, 'rule': 45, 'offsid': 7, 'refere': 59, 'spurn': 2, 'tv': 4, 'replay': 17, 'crash': 15, 'die': 4, 'minut': 146, 'absolut': 10, 'spit': 1, 'cost': 6, 'told': 88, 'bbc': 46, 'sport': 77, 'got': 96, 'go': 181, 'back': 210, 'technolog': 5, 'know': 97, 'south': 43, 'african': 16, 'jonathan': 10, 'kaplan': 13, 'ahead': 43, 'charli': 14, 'hodgson': 36, 'flyhalf': 20, 'hoist': 1, 'crossfield': 1, 'kick': 62, 'sale': 18, 'wing': 17, 'gather': 7, 'declin': 9, 'chanc': 106, 'consult': 4, 'fourth': 35, 'josh': 9, 'lewsey': 16, 'took': 66, 'ball': 79, 'irish': 43, 'line': 52, 'pile': 3, 'bodi': 26, '

In [69]:
df_dict = {}

for i, row in train_df2.iterrows():
    # split the text into individual terms
    terms = row['Text'].split()
    # update the term count dictionary with the count of each term in this document
    for term in terms:
      try:
        df_dict[term] += 1
      except:
        df_dict[term]=1  

# print the term count in different doc 
print(df_dict)

{'fume': 3, 'robinson': 81, 'blast': 17, 'offici': 156, 'england': 352, 'coach': 147, 'andi': 63, 'said': 3835, 'livid': 3, 'side': 228, 'deni': 122, 'two': 672, 'tri': 311, 'sunday': 144, '1913': 7, 'six': 237, 'nation': 368, 'loss': 71, 'ireland': 186, 'dublin': 29, 'mark': 143, 'cueto': 14, 'firsthalf': 5, 'effort': 96, 'rule': 231, 'offsid': 7, 'refere': 59, 'spurn': 2, 'tv': 285, 'replay': 18, 'crash': 25, 'die': 75, 'minut': 196, 'absolut': 43, 'spit': 2, 'cost': 249, 'told': 459, 'bbc': 369, 'sport': 138, 'got': 205, 'go': 630, 'back': 478, 'technolog': 336, 'know': 226, 'south': 157, 'african': 40, 'jonathan': 27, 'kaplan': 13, 'ahead': 123, 'charli': 26, 'hodgson': 36, 'flyhalf': 20, 'hoist': 1, 'crossfield': 1, 'kick': 74, 'sale': 394, 'wing': 23, 'gather': 37, 'declin': 62, 'chanc': 168, 'consult': 53, 'fourth': 76, 'josh': 9, 'lewsey': 16, 'took': 207, 'ball': 86, 'irish': 79, 'line': 181, 'pile': 8, 'bodi': 87, 'could': 788, 'gamewin': 1, 'think': 329, 'score': 117, 'perfe

In [70]:
len(df_dict)

17769

In [71]:
# Step 2: Calculate inverse-document frequency (IDF) for each term
N = len(train_df2)
idf_dict = {term: math.log10(N / df) for term, df in df_dict.items()}

In [72]:
print(idf_dict)

{'fume': 2.599155000684555, 'robinson': 1.1677912365255678, 'blast': 1.8458273340259437, 'offici': 0.8831516570497561, 'england': 0.5297335919260866, 'coach': 0.9089589206560416, 'andi': 1.2769357059506359, 'said': -0.5074891128807821, 'livid': 2.599155000684555, 'side': 0.7183414084037638, 'deni': 0.9899164247294694, 'two': 0.24890698235039235, 'tri': 0.5835158663773801, 'sunday': 0.917913763308968, '1913': 2.231178215389961, 'six': 0.7015279093941138, 'nation': 0.5104284367306999, 'loss': 1.2250179066851423, 'ireland': 0.8067633111863013, 'dublin': 1.6138782575052615, 'mark': 0.9209402179391558, 'cueto': 1.9301482197259796, 'firsthalf': 2.377306251068199, 'effort': 1.0940050223646491, 'rule': 0.7126642755120733, 'offsid': 2.231178215389961, 'refere': 1.3054242437620733, 'spurn': 2.7752462597402365, 'tv': 0.6214313953957074, 'replay': 1.8210037503009116, 'crash': 1.67833624673218, 'die': 1.2012149920125175, 'minut': 0.7840201840477415, 'absolut': 1.4428077998246311, 'spit': 2.77524625

In [73]:
import numpy as np
import pandas as pd
tf_idf=np.zeros((len(set(train_df2['Category'])),len(df_dict.keys())))
tf_idf=pd.DataFrame(tf_idf)
tf_idf.columns=df_dict.keys()
tf_idf.index=set(train_df2['Category'])

In [74]:
# Step 3: Calculate TF-IDF score for each term in each category
for c,value in tf_dict.items():
  for term,termf in value.items():
    tf_idf.loc[c][term]=termf*idf_dict[term]

In [75]:
display(tf_idf)

Unnamed: 0,fume,robinson,blast,offici,england,coach,andi,said,livid,side,...,meandzoeg,jolli,masochist,mug,pap,yianni,papadoyiannaki,fivememb,tamper,dopingrel
business,2.599155,1.167791,0.0,42.39128,10.064938,0.0,1.276936,-436.440637,0.0,10.775121,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
politics,0.0,7.006747,0.0,39.741825,22.248811,0.908959,1.276936,-589.702349,0.0,15.803511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
entertainment,0.0,3.503374,11.074964,9.714668,4.767602,2.726877,11.492421,-255.267024,0.0,6.465073,...,3.076276,3.076276,5.550493,3.076276,3.076276,0.0,0.0,0.0,0.0,0.0
sport,5.19831,81.745387,11.074964,25.611398,148.855139,129.981126,66.400657,-259.326937,7.797465,124.273064,...,0.0,0.0,0.0,0.0,0.0,3.076276,5.550493,3.076276,3.076276,3.076276
tech,0.0,1.167791,9.229137,20.312488,0.529734,0.0,0.0,-405.483801,0.0,6.465073,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
pred=np.empty(len(test_df2), dtype="str")
pred=pd.DataFrame(pred)
pred.index=test_df2.loc[:,'ArticleId']
pred.columns={'category'}
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,
1360,
302,
864,
2184,
...,...
126,
1533,
1886,
1701,


In [77]:
# Calculate prior probability
total_docs = len(train_df2)
prior_prob = {}
for category, count in category_counts.items():
    prior_prob[category] = count / total_docs

# Print dictionary of prior probability
print(prior_prob)

{'business': 0.28187919463087246, 'tech': 0.21895973154362416, 'politics': 0.22986577181208054, 'sport': 0.2902684563758389, 'entertainment': 0.22902684563758388}


In [79]:
for i in range(len(test_df2)):
      nb=np.zeros(5)
      nb=pd.DataFrame(nb)
      nb.index=tf_idf.index
      for s in test_df2.iloc[i][1].split(' '):
        if(s in tf_idf.columns):
          if(tf_idf.loc['entertainment'].sum(axis=0)>0 and tf_idf.loc['entertainment'][s]>0):
              nb.loc['entertainment']+=math.log10((1+tf_idf.loc['entertainment'][s])/(tf_idf.loc['entertainment'].sum(axis=0)+len(cf_dict2)))
          if(tf_idf.loc['sport'].sum(axis=0)>0 and tf_idf.loc['sport'][s]>0):   
              nb.loc['sport']+=math.log10((1+tf_idf.loc['sport'][s])/(tf_idf.loc['sport'].sum(axis=0)+len(cf_dict2)))
          if(tf_idf.loc['tech'].sum(axis=0)>0 and tf_idf.loc['tech'][s]>0):    
              nb.loc['tech']+=math.log10((1+tf_idf.loc['tech'][s])/(tf_idf.loc['tech'].sum(axis=0)+len(cf_dict2)))
          if(tf_idf.loc['business'].sum(axis=0)>0 and tf_idf.loc['business'][s]>0):    
              nb.loc['business']+=math.log10((1+tf_idf.loc['business'][s])/(tf_idf.loc['business'].sum(axis=0)+len(cf_dict2)))
          if(tf_idf.loc['politics'].sum(axis=0)>0 and tf_idf.loc['politics'][s]>0):    
              nb.loc['politics']+=math.log10((1+tf_idf.loc['politics'][s])/(tf_idf.loc['politics'].sum(axis=0)+len(cf_dict2)))


      nb.loc['entertainment']+=math.log10(prior_prob['entertainment'])
      nb.loc['sport']+=math.log10(prior_prob['sport'])
      nb.loc['tech']+=math.log10(prior_prob['tech'])
      nb.loc['business']+=math.log10(prior_prob['business'])
      nb.loc['politics']+=math.log10(prior_prob['politics'])
      x=nb[[0]].idxmin()[0]
      pred.iloc[i][0]=x

In [80]:
display(pred)

Unnamed: 0_level_0,category
ArticleId,Unnamed: 1_level_1
2160,entertainment
1360,entertainment
302,politics
864,entertainment
2184,entertainment
...,...
126,entertainment
1533,business
1886,entertainment
1701,tech


**Classification report on TF-IDF:**

In [81]:
from sklearn.metrics import classification_report
report = classification_report(pred,test_df2.loc[:,'Category'] )
print(report)

               precision    recall  f1-score   support

     business       0.64      0.70      0.67        69
entertainment       0.87      0.40      0.55        99
     politics       0.32      0.64      0.43        28
        sport       0.81      0.94      0.87        54
         tech       0.59      0.71      0.64        48

     accuracy                           0.64       298
    macro avg       0.65      0.68      0.63       298
 weighted avg       0.71      0.64      0.64       298

