In [1]:
import pandas as pd
import glob






In [2]:
path = 'Fetched/*.csv'
all_files = glob.glob(path)
df_list = []

for filename in all_files:
    df = pd.read_csv(filename)
    df_list.append(df)

In [3]:
df_list

[   Category                                            Content  \
 0  Business  How the health of the economy is measured, and...   
 1  Business  Simon Clarke tells inquiry when he realised ex...   
 2  Business  Shoplifting fell slightly last year according ...   
 3  Business  Andrew Bailey told the BBC the recession was "...   
 4  Business  The rate at which prices are rising has droppe...   
 5  Business  Bank of England interest rates have an impact ...   
 6  Business  The Bank of England will decide interest rates...   
 7  Business  The move follows Huawei's release of an AI-ena...   
 
                                              Link Source  \
 0  https://www.bbc.com/news/articles/cld0rxlqgggo    BBC   
 1  https://www.bbc.com/news/articles/cld0rxlqgggo    BBC   
 2  https://www.bbc.com/news/articles/cld0rxlqgggo    BBC   
 3  https://www.bbc.com/news/articles/cld0rxlqgggo    BBC   
 4  https://www.bbc.com/news/articles/cld0rxlqgggo    BBC   
 5  https://www.bbc.com/news/

In [4]:
df_all = pd.concat(df_list, ignore_index=True)

In [5]:
df_all

Unnamed: 0,Category,Content,Link,Source,Title,Unnamed: 5
0,Business,"How the health of the economy is measured, and...",https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,What is GDP and how does it affect me?,
1,Business,Simon Clarke tells inquiry when he realised ex...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Post Office 'misled and deceived me' says key ...,
2,Business,Shoplifting fell slightly last year according ...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Record levels of shoplifting show signs of fal...,
3,Business,"Andrew Bailey told the BBC the recession was ""...",https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Bank of England boss weighs in on UK economy row,
4,Business,The rate at which prices are rising has droppe...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,How fast are prices rising in the UK?,
...,...,...,...,...,...,...
200,Sports,Denver Nuggets star Nikola Jokic was named the...,https://www.nbcnews.com/news/sports/nuggets-st...,NBC,Nuggets star Nikola Jokic wins 2024 NBA MVP aw...,
201,Sports,INDIANAPOLIS — Indianapolis police announced W...,https://www.nbcnews.com/news/sports/milwaukee-...,NBC,Police investigating after Milwaukee Bucks pla...,
202,Sports,An Indianapolis sports columnist will not be c...,https://www.nbcnews.com/news/us-news/indystar-...,NBC,IndyStar columnist won't cover Caitlin Clark a...,
203,Sports,"Three-time , 34, is making his way from to ...",https://www.nbcnews.com/pop-culture/pop-cultur...,NBC,Travis Kelce lands first major acting role in ...,


### Preparing Data For Classification


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initializing a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=1000)

# Fitting and transforming the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_all['Category'].values.astype('U'))  


Using KMeans

In [7]:
from sklearn.cluster import KMeans
import pickle

num_clusters = 4

# Initializing K-Means clustering
km = KMeans(n_clusters=num_clusters)

# Fittting K-Means clustering on the TF-IDF matrix
km.fit(tfidf_matrix)

#saving as a pickle 
with open('kmeans.pkl', 'wb') as f:
    pickle.dump(km, f)

### Assigning clusters to Dataframe

In [8]:
km.labels_

array([3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1])

In [9]:
df_all['Cluster'] = km.labels_

In [10]:
df_all.head()

Unnamed: 0,Category,Content,Link,Source,Title,Unnamed: 5,Cluster
0,Business,"How the health of the economy is measured, and...",https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,What is GDP and how does it affect me?,,3
1,Business,Simon Clarke tells inquiry when he realised ex...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Post Office 'misled and deceived me' says key ...,,3
2,Business,Shoplifting fell slightly last year according ...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Record levels of shoplifting show signs of fal...,,3
3,Business,"Andrew Bailey told the BBC the recession was ""...",https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,Bank of England boss weighs in on UK economy row,,3
4,Business,The rate at which prices are rising has droppe...,https://www.bbc.com/news/articles/cld0rxlqgggo,BBC,How fast are prices rising in the UK?,,3


### Saving the dataframe 

In [11]:
df_all.to_csv('data.csv')

In [12]:
cluster_categories = []
for cluster_id in range(num_clusters):
    cluster_articles = df_all[df_all['Cluster'] == cluster_id]
    most_common_category = cluster_articles['Category'].mode().values[0]
    cluster_categories.append(most_common_category)

In [13]:
cluster_categories

['Politics', 'Sports', 'Entertainment', 'Business']