In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the article dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv('Cleaned-Data-Keywords.csv')

In [4]:
# Inspect column names, dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5004 entries, 0 to 5003
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0.1   5004 non-null   int64 
 1   Unnamed: 0     5004 non-null   int64 
 2   Title          5004 non-null   object
 3   Article        5004 non-null   object
 4   Category       5004 non-null   object
 5   Summary        5004 non-null   object
 6   Article_clean  5004 non-null   object
 7   Summary_clean  5004 non-null   object
 8   Keywords       5004 non-null   object
dtypes: int64(2), object(7)
memory usage: 352.0+ KB


In [5]:
# Drop the two leftover index columns written by earlier CSV round-trips.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [6]:
# Preview up to the first 100 rows (the notebook truncates the rendered table).
data.head(100)

Unnamed: 0,Title,Article,Category,Summary,Article_clean,Summary_clean,Keywords
0,Askia Muhammad I,"Askia Muhammad I (b. 1443 – d. 1538), born Muh...",People,"Askia Muhammad I (b. 1443 – d. 1538), born Muh...",askia muhammad 1443 1538 born muhammad ibn abi...,askia muhammad 1443 1538 born muhammad ibn abi...,"['askia muhammad', 'descend', 'dynasti songhai..."
1,Wood,Wood is a structural tissue found in the stems...,Technology,Wood is a structural tissue found in the stems...,wood structur tissu found stem root tree woodi...,wood structural tissue found stem root tree wo...,"['wood structur tissu', 'use', 'growth ring', ..."
2,Horn (instrument),A horn is any of a family of musical instrumen...,Arts,A horn is any of a family of musical instrumen...,horn ani famili music instrument made tube usu...,horn family musical instrument made tube usual...,"['horn ani famili music instrument', 'use', 'p..."
3,Corvus,Corvus is a widely distributed genus of medium...,Biology_and_health_sciences,Corvus is a widely distributed genus of medium...,corvu wide distribut genu medium larg bird fam...,corvus widely distributed genus mediumsized la...,['includ speci commonli known crow raven rook'...
4,Cebu City,"Cebu City, officially the City of Cebu (Cebuan...",Geography,"Cebu City, officially the City of Cebu (Cebuan...",cebu citi offici citi cebu cebuano dakbayan su...,cebu city officially city cebu cebuano dakbaya...,"['cebu citi offici', 'locat', 'barangay', 'sch..."
...,...,...,...,...,...,...,...
95,Terence,Publius Terentius Afer (; c. 195/185 – c. 159?...,People,Publius Terentius Afer (; c. 195/185 – c. 159?...,publiu terentiu afer 195185 159 becaus better ...,publius terentius afer 195185 159 better known...,"['play', 'terenc roman african playwright dure..."
96,Saxophone,The saxophone (often referred to colloquially ...,Arts,The saxophone (often referred to colloquially ...,saxophon often refer colloqui sax type singler...,saxophone often referred colloquially sax type...,"['saxophon', 'jazz', 'sax type singlere woodwi..."
97,Helmut Kohl,Helmut Josef Michael Kohl (German pronunciatio...,People,Helmut Josef Michael Kohl (German pronunciatio...,helmut josef michael kohl german pronunci list...,helmut josef michael kohl german pronunciation...,"['kohl german', 'european', 'state', 'polit', ..."
98,Track and field,Track and field is a sport that includes athle...,Everyday_life,Track and field is a sport that includes athle...,track field sport includ athlet contest base r...,track field sport includes athletic contest ba...,"['event', 'record', 'track field sport includ ..."


In [7]:
# Re-check the schema to confirm the two 'Unnamed' columns are no longer present.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5004 entries, 0 to 5003
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          5004 non-null   object
 1   Article        5004 non-null   object
 2   Category       5004 non-null   object
 3   Summary        5004 non-null   object
 4   Article_clean  5004 non-null   object
 5   Summary_clean  5004 non-null   object
 6   Keywords       5004 non-null   object
dtypes: object(7)
memory usage: 273.8+ KB


In [8]:
# TF-IDF vectorizer created with scikit-learn's default settings
# (no arguments are overridden here).
tf_idf = TfidfVectorizer()

In [9]:
# Fit the vectorizer on the Keywords column of every article (all categories),
# so later transform() calls share one vocabulary learned from the full dataset.
tf_idf.fit(data['Keywords'])

TfidfVectorizer()

In [10]:
# defining a function to recommend articles based on category and keywords
def recommend(category, keyword, top_n=10):
    """Return the articles of one category whose keywords best match a query.

    Relies on two notebook-level names: the ``data`` DataFrame and the
    ``tf_idf`` vectorizer that was fitted on ``data['Keywords']``.

    Parameters
    ----------
    category : str
        Compared for exact equality against ``data['Category']``.
    keyword : str
        Query text; vectorized with the already-fitted ``tf_idf``.
        NOTE(review): the stored keywords look stemmed (e.g. 'citi',
        'includ'), so an unstemmed query word may score zero -- confirm
        against the upstream preprocessing.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``Title`` and ``Article`` columns of at most ``top_n`` rows,
        ordered by descending cosine similarity; rows with equal scores keep
        their original relative order. Rows with a zero score can still
        appear when fewer than ``top_n`` rows match. The frame is empty when
        no row has the requested category or ``top_n`` is not positive.
    """
    # filter the data by the specified category
    category_data = data[data['Category'] == category]

    # cosine_similarity refuses a matrix with zero rows, so an unknown or
    # empty category is answered with an empty result instead of a ValueError
    if category_data.empty or top_n <= 0:
        return category_data[['Title', 'Article']].iloc[:0]

    # vectorize the query and the category's keyword strings in the same space
    query_vector = tf_idf.transform([keyword])
    category_vectors = tf_idf.transform(category_data['Keywords'])
    similarity_scores = cosine_similarity(query_vector, category_vectors)[0]

    # stable sort on the negated scores: best match first, and ties resolved
    # deterministically by original row order
    ranked = (-similarity_scores).argsort(kind='stable')[:top_n]
    return category_data.iloc[ranked][['Title', 'Article']]

In [11]:
# Example query: best matches for "track" within the "Everyday_life" category.
recommended_articles = recommend("Everyday_life", "track")
# Bare last expression so the notebook renders the DataFrame as a rich table.
recommended_articles

                     Title                                            Article
692           Horse racing  Horse racing is an equestrian performance spor...
4053             Long jump  The long jump is a track and field event in wh...
3257              Hurdling  Hurdling is the act of jumping over an obstacl...
1817               Karaoke  Karaoke (; Japanese: [kaɾaoke] (listen); カラオケ,...
4919                  Luge  A luge  is a small one- or two-person sled on ...
529     Sport of athletics  Athletics is a group of sporting events that i...
98         Track and field  Track and field is a sport that includes athle...
1256  Cross-country skiing  Cross-country skiing is a form of skiing where...
2055                Karate  Karate (空手) (; Japanese pronunciation: [kaɾate...
2038         Equestrianism  Equestrianism (from Latin equester, equestr-, ...
