## Import Libraries & Data

In [141]:
pd.set_option('display.max_colwidth', None)

In [211]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly 
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff


from faker import Faker
import random
from datetime import datetime, timedelta

In [213]:
fake = Faker()

# Parameters
num_users = 1000
num_articles = 1000
topics = ['Technology', 'Health', 'Travel', 'Fitness', 'Cooking', 'AI', 'Fashion', 'Education']
cities = ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Houston']

In [215]:
# Generate User Data
users = []
for i in range(1, num_users + 1):
    user_id = f"U{i:04}"
    age = random.randint(18, 60)
    gender = random.choice(['Male', 'Female', 'Non-Binary'])
    location = random.choice(cities)
    user_interests = random.sample(topics, k=random.randint(1, 3))
    user_status = random.choice(['New', 'Old'])
    has_history = random.choice(['Yes', 'No']) if user_status == 'Old' else 'No'
    users.append([user_id, age, gender, location, user_interests, user_status, has_history])

user_df = pd.DataFrame(users, columns=[
    'User_ID', 'Age', 'Gender', 'Location', 'User_Interests', 'User_Status', 'Has_History'])

user_df.head()

Unnamed: 0,User_ID,Age,Gender,Location,User_Interests,User_Status,Has_History
0,U0001,25,Male,San Francisco,"[Technology, Fitness]",New,No
1,U0002,33,Non-Binary,Chicago,"[Education, Cooking]",Old,No
2,U0003,45,Female,New York,[Cooking],New,No
3,U0004,19,Non-Binary,Chicago,[Fitness],Old,Yes
4,U0005,46,Male,Chicago,"[Travel, Fitness, Education]",New,No


In [217]:
# Generate Article Data
articles = []
for i in range(1, num_articles + 1):
    article_id = f"A{i:04}"
    title = fake.sentence(nb_words=6)
    content = fake.paragraph(nb_sentences=5)
    tags = random.sample(topics, k=random.randint(1, 3))
    published_date = fake.date_between(start_date='-2y', end_date='today')
    articles.append([article_id, title, content, tags, published_date])

article_df = pd.DataFrame(articles, columns=[
    'Article_ID', 'Title', 'Content', 'Tags', 'Published_Date'])

article_df.head()

Unnamed: 0,Article_ID,Title,Content,Tags,Published_Date
0,A0001,Best rule official store most themselves billion.,Decision base support tax campaign response. Reason maybe fear popular staff statement indicate. World new interview up friend great. End financial boy early among along. Short will able thousand southern meeting.,"[Health, Fashion]",2023-10-13
1,A0002,Nature foreign stop girl key drug.,Speak drive family hand manager property strategy. Agency after always. Record base live list.,"[AI, Cooking, Health]",2024-04-18
2,A0003,Trade specific example decide sell clearly one.,High study section despite cover company generation. To represent get mother project all movement capital. Ahead democratic source really. Very own their administration.,"[Technology, Education, Health]",2023-06-03
3,A0004,Own black hair.,Because week sell recent. Beyond two PM it process indeed fear. Wish meeting dinner occur past place. See upon yes hot letter.,[Travel],2023-11-05
4,A0005,Information score speak challenge.,Join poor appear rule. Garden its child learn and. Career audience again best direction front. Visit message measure culture.,"[Cooking, Travel]",2022-12-15


In [218]:
# Save Data to CSVs (optional)
user_df.to_csv('users_data.csv', index=False)
article_df.to_csv('articles_data.csv', index=False)

In [220]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User_ID         1000 non-null   object
 1   Age             1000 non-null   int64 
 2   Gender          1000 non-null   object
 3   Location        1000 non-null   object
 4   User_Interests  1000 non-null   object
 5   User_Status     1000 non-null   object
 6   Has_History     1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


In [223]:
# Ensure both DataFrames have the same number of rows
df = pd.concat([user_df.reset_index(drop=True), article_df.reset_index(drop=True)], axis=1)
df.to_csv('combined_user_article_data.csv', index=False)

## Data Cleaning

In [226]:
import neattext.functions as nfx
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [228]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   User_ID         1000 non-null   object
 1   Age             1000 non-null   int64 
 2   Gender          1000 non-null   object
 3   Location        1000 non-null   object
 4   User_Interests  1000 non-null   object
 5   User_Status     1000 non-null   object
 6   Has_History     1000 non-null   object
 7   Article_ID      1000 non-null   object
 8   Title           1000 non-null   object
 9   Content         1000 non-null   object
 10  Tags            1000 non-null   object
 11  Published_Date  1000 non-null   object
dtypes: int64(1), object(11)
memory usage: 93.9+ KB


In [230]:
df['Clean_Title']=df['Title'].apply(nfx.remove_stopwords)
df['Clean_Title']=df['Clean_Title'].apply(nfx.remove_special_characters)
df['Clean_Content']=df['Content'].apply(nfx.remove_stopwords)
df['Clean_Content']=df['Clean_Content'].apply(nfx.remove_special_characters)
df.head(2)

Unnamed: 0,User_ID,Age,Gender,Location,User_Interests,User_Status,Has_History,Article_ID,Title,Content,Tags,Published_Date,Clean_Title,Clean_Content
0,U0001,25,Male,San Francisco,"[Technology, Fitness]",New,No,A0001,Best rule official store most themselves billion.,Decision base support tax campaign response. Reason maybe fear popular staff statement indicate. World new interview up friend great. End financial boy early among along. Short will able thousand southern meeting.,"[Health, Fashion]",2023-10-13,Best rule official store billion,Decision base support tax campaign response Reason maybe fear popular staff statement indicate World new interview friend great End financial boy early along Short able thousand southern meeting
1,U0002,33,Non-Binary,Chicago,"[Education, Cooking]",Old,No,A0002,Nature foreign stop girl key drug.,Speak drive family hand manager property strategy. Agency after always. Record base live list.,"[AI, Cooking, Health]",2024-04-18,Nature foreign stop girl key drug,Speak drive family hand manager property strategy Agency always Record base live list


In [232]:
df[['Content','Clean_Content']]

Unnamed: 0,Content,Clean_Content
0,Decision base support tax campaign response. Reason maybe fear popular staff statement indicate. World new interview up friend great. End financial boy early among along. Short will able thousand southern meeting.,Decision base support tax campaign response Reason maybe fear popular staff statement indicate World new interview friend great End financial boy early along Short able thousand southern meeting
1,Speak drive family hand manager property strategy. Agency after always. Record base live list.,Speak drive family hand manager property strategy Agency always Record base live list
2,High study section despite cover company generation. To represent get mother project all movement capital. Ahead democratic source really. Very own their administration.,High study section despite cover company generation represent mother project movement capital Ahead democratic source really administration
3,Because week sell recent. Beyond two PM it process indeed fear. Wish meeting dinner occur past place. See upon yes hot letter.,week sell recent PM process fear Wish meeting dinner occur past place yes hot letter
4,Join poor appear rule. Garden its child learn and. Career audience again best direction front. Visit message measure culture.,Join poor appear rule Garden child learn and Career audience best direction front Visit message measure culture
...,...,...
995,It line wish spend follow write matter. Total onto relationship cut budget. Imagine reach likely evidence. World reduce role city here series general factor.,line wish spend follow write matter Total relationship cut budget Imagine reach likely evidence World reduce role city series general factor
996,Pm enjoy assume once. Deal smile key attack treat have stand. See buy organization. Player common this spend impact. Writer finish my thus question these under fill.,Pm enjoy assume once Deal smile key attack treat stand buy organization Player common spend impact Writer finish question fill
997,Recent head money end. Sit day edge force thus police. Choice pretty if wonder. Yourself unit front once attention run coach. Play entire carry here.,Recent head money end Sit day edge force police Choice pretty wonder unit attention run coach Play entire carry here
998,Anything back still interesting market return involve section. Tonight interest charge organization father. Attention choice sure song attorney. Foreign newspaper yes worry cost never career bank.,interesting market return involve section Tonight interest charge organization father Attention choice sure song attorney Foreign newspaper yes worry cost career bank


In [234]:
df[['Title','Clean_Title']]

Unnamed: 0,Title,Clean_Title
0,Best rule official store most themselves billion.,Best rule official store billion
1,Nature foreign stop girl key drug.,Nature foreign stop girl key drug
2,Trade specific example decide sell clearly one.,Trade specific example decide sell clearly one
3,Own black hair.,black hair
4,Information score speak challenge.,Information score speak challenge
...,...,...
995,Bit past environment reduce house development rate.,Bit past environment reduce house development rate
996,Hit ahead consider kitchen.,Hit ahead consider kitchen
997,Operation value worker wall music develop at.,Operation value worker wall music develop at
998,Sing board yeah leg million candidate physical.,Sing board yeah leg million candidate physical


## Article - Title Recommendation

In [236]:
# text vectorization
count_vect=CountVectorizer()
cv_mat=count_vect.fit_transform(df['Clean_Title'])
cv_mat

<1000x907 sparse matrix of type '<class 'numpy.int64'>'
	with 4553 stored elements in Compressed Sparse Row format>

In [238]:
cv_mat.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [240]:
df_cv_words = pd.DataFrame(cv_mat.todense(), columns=count_vect.get_feature_names_out())
df_cv_words.head(2)

Unnamed: 0,ability,able,about,above,accept,according,account,act,action,activity,...,wrong,yard,yeah,year,yes,yet,you,young,your,yourself
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [242]:
#Cosine similarity matrix 
cosine_sim_mat=cosine_similarity(cv_mat)
cosine_sim_mat

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [244]:
# Get title id/index
title_indices=pd.Series(df.index,index=df['Title']).drop_duplicates()
title_indices

Title
Best rule official store most themselves billion.        0
Nature foreign stop girl key drug.                       1
Trade specific example decide sell clearly one.          2
Own black hair.                                          3
Information score speak challenge.                       4
                                                      ... 
Bit past environment reduce house development rate.    995
Hit ahead consider kitchen.                            996
Operation value worker wall music develop at.          997
Sing board yeah leg million candidate physical.        998
Once perform rock continue.                            999
Length: 1000, dtype: int64

In [248]:
idx=title_indices['Hit ahead consider kitchen.']
idx

996

In [250]:
scores=list(enumerate(cosine_sim_mat[idx]))
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.20412414523193154),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.0),
 (78, 0.0),
 (79, 0.0),
 (80, 0.0),
 (81, 0.0),
 (82, 0.0)

In [252]:
sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)
sorted_scores[1:]

[(494, 0.35355339059327373),
 (903, 0.35355339059327373),
 (786, 0.2886751345948129),
 (125, 0.25),
 (137, 0.25),
 (409, 0.25),
 (694, 0.25),
 (790, 0.25),
 (925, 0.25),
 (976, 0.25),
 (158, 0.22360679774997896),
 (192, 0.22360679774997896),
 (538, 0.22360679774997896),
 (696, 0.22360679774997896),
 (928, 0.22360679774997896),
 (963, 0.22360679774997896),
 (63, 0.20412414523193154),
 (111, 0.20412414523193154),
 (438, 0.20412414523193154),
 (521, 0.20412414523193154),
 (797, 0.20412414523193154),
 (873, 0.20412414523193154),
 (288, 0.1889822365046136),
 (827, 0.1889822365046136),
 (170, 0.17677669529663687),
 (334, 0.17677669529663687),
 (0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30,

In [254]:
# selected title indices
selected_title_indx=[i[0] for i in sorted_scores[1:]]
selected_title_indx

[494,
 903,
 786,
 125,
 137,
 409,
 694,
 790,
 925,
 976,
 158,
 192,
 538,
 696,
 928,
 963,
 63,
 111,
 438,
 521,
 797,
 873,
 288,
 827,
 170,
 334,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 159,
 160,
 161,
 162,
 163,


In [256]:
# selected title scores
selected_title_scores=[i[1] for i in sorted_scores[1:]]

In [258]:
recommended_result=df['Title'].iloc[selected_title_indx]
rec_df=pd.DataFrame(recommended_result)
rec_df['similarity_scores']=selected_title_scores
rec_df

Unnamed: 0,Title,similarity_scores
494,None visit kitchen.,0.353553
903,Consider under him.,0.353553
786,Though every ahead idea sure.,0.288675
125,Full kitchen law short green.,0.250000
137,Simple what employee consider this full sister.,0.250000
...,...,...
994,My push leader learn happy.,0.000000
995,Bit past environment reduce house development rate.,0.000000
997,Operation value worker wall music develop at.,0.000000
998,Sing board yeah leg million candidate physical.,0.000000


In [260]:
def recommend_title(title, num_of_rec=10):
    # ID for title
    idx = title_indices[title]
    # Course Indice
    # Search inside cosine_sim_mat
    scores = list(enumerate(cosine_sim_mat[idx]))
    # Scores
    # Sort Scores
    sorted_scores = sorted(scores,key=lambda x:x[1],reverse=True)
    # Recommend
    selected_title_indices = [i[0] for i in sorted_scores[1:]]
    selected_title_scores = [i[1] for i in sorted_scores[1:]]
    result = article_df['Title'].iloc[selected_title_indx]
    rec_df = pd.DataFrame(result)
    rec_df['similarity_scores'] = selected_title_scores
    return rec_df.head(num_of_rec)

In [264]:
recommend_title('Hit ahead consider kitchen.')

Unnamed: 0,Title,similarity_scores
494,None visit kitchen.,0.353553
903,Consider under him.,0.353553
786,Though every ahead idea sure.,0.288675
125,Full kitchen law short green.,0.25
137,Simple what employee consider this full sister.,0.25
409,Enjoy consider billion which.,0.25
694,Gas front expect significant hit.,0.25
790,Spend evening already hour hit.,0.25
925,He hit available game amount.,0.25
976,With consider sell everyone sea whatever.,0.25


In [267]:
# text vectorization
count_vect=CountVectorizer()
cv_mat=count_vect.fit_transform(df['Clean_Content'])
cv_mat

<1000x966 sparse matrix of type '<class 'numpy.int64'>'
	with 20438 stored elements in Compressed Sparse Row format>

In [269]:
cv_mat.todense()

matrix([[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [271]:
df_cv_words = pd.DataFrame(cv_mat.todense(), columns=count_vect.get_feature_names_out())
df_cv_words.head(2)

Unnamed: 0,ability,able,about,above,accept,according,account,across,act,action,...,wrong,yard,yeah,year,yes,yet,you,young,your,yourself
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [273]:
#Cosine similarity matrix 
cosine_sim_mat=cosine_similarity(cv_mat)
cosine_sim_mat

array([[1.        , 0.05241424, 0.        , ..., 0.04225771, 0.        ,
        0.05976143],
       [0.05241424, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.05170877,
        0.153393  ],
       ...,
       [0.04225771, 0.        , 0.        , ..., 1.        , 0.09534626,
        0.        ],
       [0.        , 0.        , 0.05170877, ..., 0.09534626, 1.        ,
        0.        ],
       [0.05976143, 0.        , 0.153393  , ..., 0.        , 0.        ,
        1.        ]])

In [275]:
# Get content id/index
content_indices=pd.Series(df.index,index=df['Content']).drop_duplicates()
content_indices

Content
Decision base support tax campaign response. Reason maybe fear popular staff statement indicate. World new interview up friend great. End financial boy early among along. Short will able thousand southern meeting.      0
Speak drive family hand manager property strategy. Agency after always. Record base live list.                                                                                                                             1
High study section despite cover company generation. To represent get mother project all movement capital. Ahead democratic source really. Very own their administration.                                                  2
Because week sell recent. Beyond two PM it process indeed fear. Wish meeting dinner occur past place. See upon yes hot letter.                                                                                             3
Join poor appear rule. Garden its child learn and. Career audience again best direction front. Visit message

In [277]:
idx=title_indices['Join poor appear rule.']
idx

KeyError: 'Join poor appear rule.'

In [250]:
scores=list(enumerate(cosine_sim_mat[idx]))
scores

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.0),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.0),
 (44, 0.0),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.0),
 (49, 0.0),
 (50, 0.0),
 (51, 0.0),
 (52, 0.0),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.0),
 (57, 0.0),
 (58, 0.0),
 (59, 0.0),
 (60, 0.0),
 (61, 0.0),
 (62, 0.0),
 (63, 0.20412414523193154),
 (64, 0.0),
 (65, 0.0),
 (66, 0.0),
 (67, 0.0),
 (68, 0.0),
 (69, 0.0),
 (70, 0.0),
 (71, 0.0),
 (72, 0.0),
 (73, 0.0),
 (74, 0.0),
 (75, 0.0),
 (76, 0.0),
 (77, 0.0),
 (78, 0.0),
 (79, 0.0),
 (80, 0.0),
 (81, 0.0),
 (82, 0.0)

In [252]:
sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)
sorted_scores[1:]

[(494, 0.35355339059327373),
 (903, 0.35355339059327373),
 (786, 0.2886751345948129),
 (125, 0.25),
 (137, 0.25),
 (409, 0.25),
 (694, 0.25),
 (790, 0.25),
 (925, 0.25),
 (976, 0.25),
 (158, 0.22360679774997896),
 (192, 0.22360679774997896),
 (538, 0.22360679774997896),
 (696, 0.22360679774997896),
 (928, 0.22360679774997896),
 (963, 0.22360679774997896),
 (63, 0.20412414523193154),
 (111, 0.20412414523193154),
 (438, 0.20412414523193154),
 (521, 0.20412414523193154),
 (797, 0.20412414523193154),
 (873, 0.20412414523193154),
 (288, 0.1889822365046136),
 (827, 0.1889822365046136),
 (170, 0.17677669529663687),
 (334, 0.17677669529663687),
 (0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30,

In [254]:
# selected title indices
selected_title_indx=[i[0] for i in sorted_scores[1:]]
selected_title_indx

[494,
 903,
 786,
 125,
 137,
 409,
 694,
 790,
 925,
 976,
 158,
 192,
 538,
 696,
 928,
 963,
 63,
 111,
 438,
 521,
 797,
 873,
 288,
 827,
 170,
 334,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 159,
 160,
 161,
 162,
 163,


In [256]:
# selected title scores
selected_title_scores=[i[1] for i in sorted_scores[1:]]

In [258]:
recommended_result=df['Title'].iloc[selected_title_indx]
rec_df=pd.DataFrame(recommended_result)
rec_df['similarity_scores']=selected_title_scores
rec_df

Unnamed: 0,Title,similarity_scores
494,None visit kitchen.,0.353553
903,Consider under him.,0.353553
786,Though every ahead idea sure.,0.288675
125,Full kitchen law short green.,0.250000
137,Simple what employee consider this full sister.,0.250000
...,...,...
994,My push leader learn happy.,0.000000
995,Bit past environment reduce house development rate.,0.000000
997,Operation value worker wall music develop at.,0.000000
998,Sing board yeah leg million candidate physical.,0.000000


In [260]:
def recommend_title(title, num_of_rec=10):
    # ID for title
    idx = title_indices[title]
    # Course Indice
    # Search inside cosine_sim_mat
    scores = list(enumerate(cosine_sim_mat[idx]))
    # Scores
    # Sort Scores
    sorted_scores = sorted(scores,key=lambda x:x[1],reverse=True)
    # Recommend
    selected_title_indices = [i[0] for i in sorted_scores[1:]]
    selected_title_scores = [i[1] for i in sorted_scores[1:]]
    result = article_df['Title'].iloc[selected_title_indx]
    rec_df = pd.DataFrame(result)
    rec_df['similarity_scores'] = selected_title_scores
    return rec_df.head(num_of_rec)

In [264]:
recommend_title('Hit ahead consider kitchen.')

Unnamed: 0,Title,similarity_scores
494,None visit kitchen.,0.353553
903,Consider under him.,0.353553
786,Though every ahead idea sure.,0.288675
125,Full kitchen law short green.,0.25
137,Simple what employee consider this full sister.,0.25
409,Enjoy consider billion which.,0.25
694,Gas front expect significant hit.,0.25
790,Spend evening already hour hit.,0.25
925,He hit available game amount.,0.25
976,With consider sell everyone sea whatever.,0.25
