In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from pymongo.mongo_client import MongoClient
from scipy.spatial.distance import cosine
from pandas import json_normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from scipy import sparse
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix

In [4]:
from dotenv import load_dotenv
import requests
import os

In [7]:
# Load variables from a local .env file into the environment, then read the MongoDB
# connection string from it. os.getenv returns None when the variable is not set.
load_dotenv()
MONGO_CONNECTION_STRING = os.getenv("MONGO_CONNECTION_STRING")

## Pull data from MongoDB

In [6]:
def connect_to_mongodb():
    """Create a MongoDB client and return its ``medium_database`` handle.

    The client is built from the notebook-level ``MONGO_CONNECTION_STRING``.
    A ``ping`` admin command is issued as a connectivity check. If the ping raises,
    the exception is printed rather than re-raised, so the database handle is
    returned either way.
    """
    client = MongoClient(MONGO_CONNECTION_STRING)

    # Best-effort connectivity probe: report the outcome but never abort.
    try:
        client.admin.command('ping')
        print("Pinged your deployment. Connection successful!")
    except Exception as ping_error:
        print(ping_error)

    return client.medium_database

In [4]:
# Open the connection once; `db` is the handle passed to get_data() below.
db = connect_to_mongodb()

Pinged your deployment. Connection successful!


In [52]:
def get_data(db):
    """Load the writer and follower collections into DataFrames.

    Parameters:
    - db: database handle exposing ``writer_information`` and
      ``follower_information`` collections, each with a ``find`` method.

    Returns:
    - (writer_data, follower_data): one DataFrame per collection, in that order.
    """
    def _collection_to_frame(collection):
        # An empty filter selects every document in the collection.
        return pd.DataFrame(list(collection.find({})))

    writer_frame = _collection_to_frame(db.writer_information)
    follower_frame = _collection_to_frame(db.follower_information)
    print(f'fetched {len(writer_frame)}+{len(follower_frame)} records')
    return writer_frame, follower_frame

In [53]:
# Fetch both collections, then discard the `_id` column from the follower frame
# because it is not needed for analysis.
writers_raw, followers = get_data(db)
followers = followers.drop(columns='_id')

fetched 16+1 records


In [7]:
# Preview the first three writer records, transposed so each field reads as a row.
writers_raw.head(3).T

Unnamed: 0,0,1,2
_id,65d97b8fd307609d56dca275,65d97b8fd307609d56dca276,65d97b8fd307609d56dca277
id,fca9db1c7da0,e10ad955760c,76398be9016
username,sheilateozy,nikhiladithyan,machine-learning-made-simple
fullname,Sheila Teo,Nikhil Adithyan,Devansh
bio,"Data Scientist, https://www.linkedin.com/in/sh...",Founder @BacktestZone (https://www.backtestzon...,"Writing about AI, Math, the Tech Industry and ..."
followers_count,2111,7044,13172
following_count,16,41,21
publication_following_count,2,11,2
image_url,https://miro.medium.com/1*UmlZGQsNhuv9kgQL6pFs...,https://miro.medium.com/1*fiFn4AhPBi-CG-cKxHk2...,https://miro.medium.com/1*xiFRgHfgfMR7S111UB2h...
twitter_username,,,Machine01776819


In [8]:
# Explode 'top_articles' so that every article gets its own row while the
# writer-level columns are repeated alongside it.
writers_exploded = writers_raw.explode('top_articles')

# Flatten each article dict into columns (nested keys become dotted names such as 'content.id').
articles_normalized = json_normalize(writers_exploded['top_articles'])

# Put the flattened article columns next to the writer columns (minus 'top_articles').
# The exploded frame's index is reset first so both frames carry the same 0..n-1 row
# labels and concat lines them up row by row.
writers_exploded = writers_exploded.drop('top_articles', axis=1).reset_index(drop=True)
writers_combined = pd.concat([writers_exploded, articles_normalized], axis=1)

# Columns that are not used for computing features and similarity.
columns_to_drop = ['_id', 'image_url', 'twitter_username', 'medium_member_at', 'is_suspended',
                   'top_writer_in', 'has_list', 'is_book_author', 'tipping_link', 'bg_image_url',
                   'logo_image_url', 'url', 'unique_slug', 'lang',
                   'is_locked', 'top_highlight', 'content.id']

top_articles = writers_combined.drop(columns=columns_to_drop)

# Rename two columns by position rather than by label.
# NOTE(review): presumably positional because article-level names collide with
# writer-level ones (e.g. both carry an 'id') -- confirm positions 10 and 26 if the schema changes.
# A pandas Index is immutable: writing into `columns.values` mutates its backing array
# behind pandas' back and can leave cached label lookups stale, so build a new list of
# labels and assign it instead.
renamed_columns = top_articles.columns.tolist()
renamed_columns[10] = 'article_id'
renamed_columns[26] = 'content'
top_articles.columns = renamed_columns

In [9]:
# Preview the first three article rows, transposed so each field reads as a row.
top_articles.head(3).T

Unnamed: 0,0,1,2
id,fca9db1c7da0,fca9db1c7da0,e10ad955760c
username,sheilateozy,sheilateozy,nikhiladithyan
fullname,Sheila Teo,Sheila Teo,Nikhil Adithyan
bio,"Data Scientist, https://www.linkedin.com/in/sh...","Data Scientist, https://www.linkedin.com/in/sh...",Founder @BacktestZone (https://www.backtestzon...
followers_count,2111,2111,7044
following_count,16,16,41
publication_following_count,2,2,11
is_writer_program_enrolled,True,True,True
allow_notes,True,True,True
followers,"[0037690620ed, 01545acdd5dc, 02533e82cf32, 037...","[0037690620ed, 01545acdd5dc, 02533e82cf32, 037...","[00149b7421b2, 00183ef79cce, 00418aadfc4b, 005..."


## compute articles similar to a top article

In [10]:
# Step 1: Vectorize 'article content' using TF-IDF
# English stop words are removed; min_df / max_df discard terms that appear in fewer than
# 1% or more than 80% of the articles. Missing content is treated as an empty document.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01, max_df=0.8)
tfidf_matrix = tfidf_vectorizer.fit_transform(top_articles['content'].fillna(''))
tfidf_matrix

<152x6475 sparse matrix of type '<class 'numpy.float64'>'
	with 58234 stored elements in Compressed Sparse Row format>

In [11]:
# Step 2: compute pairwise cosine similarity scores for all articles
# (entry [i, j] compares article row i with article row j)
top_article_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
top_article_similarity.round(3)

array([[1.   , 0.076, 0.181, ..., 0.089, 0.089, 0.123],
       [0.076, 1.   , 0.087, ..., 0.053, 0.04 , 0.023],
       [0.181, 0.087, 1.   , ..., 0.067, 0.051, 0.101],
       ...,
       [0.089, 0.053, 0.067, ..., 1.   , 0.491, 0.217],
       [0.089, 0.04 , 0.051, ..., 0.491, 1.   , 0.094],
       [0.123, 0.023, 0.101, ..., 0.217, 0.094, 1.   ]])

In [12]:
# Step 3: Recommend Similar articles
def get_similar_articles(article_id, top_n=3):
    article_idx = top_articles.index[top_articles['article_id'] == article_id].tolist()[0]
    sim_scores = list(enumerate(top_article_similarity[article_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]  # Exclude the writer itself
    article_indices = [i[0] for i in sim_scores]
    return top_articles['article_id'].iloc[article_indices].tolist()

# Example usage
article_id = '34c195a93d41'  # Replace with actual article ID
similar_articles = get_similar_articles(article_id)
print(f"Top {len(similar_articles)} similar articles for article ID {article_id}: {similar_articles}")

Top 3 similar articles for article ID 34c195a93d41: ['6ad21c4cfa99', 'e3b3e99e4fca', '2675c73080ff']


In [13]:
# Sanity check: look at the content of one of the article IDs returned above.
top_articles[top_articles['article_id']=='2675c73080ff']['content']

68    Making LLM Knowledge-Aware (From Fareed Khan)\...
Name: content, dtype: object

## compute similar writers

In [14]:
# Work on a deep copy so the feature engineering below leaves `writers_raw` untouched.
writers = writers_raw.copy(deep=True)

In [15]:
def concatenate_article_info(top_articles):
    """
    Concatenate information from top_articles for TF-IDF vectorization.

    For each article the title, subtitle, tags, topics and top highlight are joined with
    single spaces; the per-article strings are then joined with single spaces as well.
    A field that is absent or ``None`` contributes an empty string, so the literal token
    "None" never ends up in the text.

    Parameters:
    - top_articles: List of dictionaries, each representing an article. ``None`` or a
      float placeholder (NaN) is treated as "no articles".

    Returns:
    - concatenated_info: String, concatenated information of all articles, with
      leading/trailing whitespace removed.
    """
    # A writer without articles may carry None/NaN instead of a list.
    if top_articles is None or isinstance(top_articles, float):
        return ""

    def _as_text(value):
        # None would otherwise be formatted as the word "None".
        return "" if value is None else str(value)

    def _join_labels(labels):
        # Tags/topics are lists of strings; a missing or None list means no labels.
        return ' '.join(_as_text(label) for label in (labels or []))

    article_infos = []
    for article in top_articles:
        title = _as_text(article.get('title'))
        subtitle = _as_text(article.get('subtitle'))
        tags = _join_labels(article.get('tags'))
        topics = _join_labels(article.get('topics'))
        top_highlight = _as_text(article.get('top_highlight'))

        # Same field order and separators as before, so well-formed input yields the same text.
        article_infos.append(f"{title} {subtitle} {tags} {topics} {top_highlight}")

    return " ".join(article_infos).strip()

# Build one text document per writer: the concatenated article information followed by the bio.
writers['concatenated_info'] = writers['top_articles'].apply(concatenate_article_info)
# A missing bio is NaN, and `str + NaN` would turn the whole document into NaN,
# so fall back to an empty string.
writers['concatenated_info'] = writers['concatenated_info'] + " " + writers['bio'].fillna('')

In [16]:
# Inspect the combined text document built for the first writer.
writers['concatenated_info'].iloc[0]

'How I Won Singapore’s GPT-4 Prompt Engineering Competition A deep dive into the strategies I learned for harnessing the power of Large Language Models (LLMs) data-science artificial-intelligence prompt-engineering editors-pick technology artificial-intelligence data-science Use System Prompts to provide instructions that you want the LLM to remember when responding throughout the entire chat. Stacked Ensembles for Advanced Predictive Modeling With H2O.ai and Optuna And how I placed top 10% in Europe’s largest machine learning competition with them! machine-learning data-science deep-learning ensemble-learning python machine-learning data-science Data Scientist, https://www.linkedin.com/in/sheila-teo/'

In [17]:
def compute_averages_and_proportions(top_articles):
    """Average the per-article metrics over a writer's articles.

    Despite the name, only averages are produced; no proportions are computed.

    Parameters:
    - top_articles: List of dictionaries, each representing an article. ``None`` or a
      float placeholder (NaN) is treated as an empty list.

    Returns:
    - Dict with the keys 'avg_claps', 'avg_voters', 'avg_word_count',
      'avg_responses_count' and 'avg_reading_time'. Every value is 0 when there are no
      articles. A metric that is absent or ``None`` on an article counts as 0, while
      the article still counts towards the denominator.
    """
    metrics = ('claps', 'voters', 'word_count', 'responses_count', 'reading_time')

    # A writer without articles may carry None/NaN instead of a list.
    if top_articles is None or isinstance(top_articles, float):
        top_articles = []

    count = len(top_articles)  # Number of articles to average over

    averages_and_proportions = {}
    for metric in metrics:
        # `or 0` also covers an explicit None, which `.get(metric, 0)` would let through.
        total = sum((article.get(metric) or 0) for article in top_articles)
        averages_and_proportions[f'avg_{metric}'] = total / count if count else 0

    return averages_and_proportions

# Compute every writer's averages once and expand the resulting dicts into new columns.
# (Calling the helper inside a per-column loop would redo the full computation for each column.)
average_columns = ['avg_claps', 'avg_voters', 'avg_word_count',
                   'avg_responses_count', 'avg_reading_time']
article_averages = pd.DataFrame(
    [compute_averages_and_proportions(articles) for articles in writers['top_articles']],
    index=writers.index,      # keep row alignment with `writers`
    columns=average_columns,  # fixes the column set even when there are no writers
)
for column in average_columns:
    writers[column] = article_averages[column]

In [20]:
# Keep only the columns used for writer similarity by dropping everything else.
# NOTE: this rebinds `writers` to the reduced frame, so re-running the cell on its own
# fails (the listed columns are already gone); re-run from the `writers` copy cell instead.
writers = writers.drop(columns=['_id', 'bio', 'following_count',
                      'publication_following_count', 'image_url', 'twitter_username', 'is_writer_program_enrolled',
                      'allow_notes', 'medium_member_at', 'is_suspended', 'top_writer_in', 'has_list',
                      'tipping_link', 'bg_image_url', 'logo_image_url', 'top_articles'])
writers.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
id,fca9db1c7da0,e10ad955760c,76398be9016,d80580992695,8a910484fe84,4beacba7dc8a,630ab5ffdf27,b856005e5ecd,b0fbe613be9d,14176fcb5743,9b351e8113e9,5d33decdf4c4,8c8e5b7182ef,37a2cbe8bd15,15a29a4fc6ad,fb44e21903f3
username,sheilateozy,nikhiladithyan,machine-learning-made-simple,anmol3015,moneytent,jordan_gibbs,jacobistyping,fareedkhandev,cobusgreyling,inchristiely,ignacio.de.gregorio.noblejas,avi_chawla,iampaulrose,pareto_investor,miptgirl,frank-andrade
fullname,Sheila Teo,Nikhil Adithyan,Devansh,Anmol Tomar,Money Tent,Jordan Gibbs,Jacob Bennett,Fareed Khan,Cobus Greyling,Christie C.,Ignacio de Gregorio,Avi Chawla,Paul Rose,The Pareto Investor,Mariya Mansurova,The PyCoach
followers_count,2111,7044,13172,19854,3516,2825,25716,20638,16025,56048,71934,19823,40019,38872,7465,135695
is_book_author,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
followers,"[0037690620ed, 01545acdd5dc, 02533e82cf32, 037...","[00149b7421b2, 00183ef79cce, 00418aadfc4b, 005...","[00245ad345da, 0048135eab64, 0078a8c1aefd, 008...","[0008a90f7d7f, 00117ef3e7b2, 001479440872, 001...","[00033482febc, 0037f86827fc, 0058b5cae97b, 008...","[0019523196a0, 00245ad345da, 00b8eada8b79, 00d...","[0008a90f7d7f, 00149b7421b2, 0016c6d6a984, 001...","[000d5dcca96e, 001479440872, 001564222f42, 001...","[00245ad345da, 00388d34235e, 004bab759a12, 005...","[00008456a9f0, 0000a6571897, 00032ca4b88c, 000...","[00017297cca0, 0002df3940d5, 00032ca4b88c, 000...","[001479440872, 002125e32c14, 002b3b17db23, 004...","[0000a6571897, 0008a90f7d7f, 0009dee2120f, 001...","[0000a6571897, 0000d489eba9, 0002df3940d5, 000...","[00245ad345da, 002a25275643, 0041541f90e4, 004...","[0008d1d32e50, 00117ef3e7b2, 001479440872, 001..."
concatenated_info,How I Won Singapore’s GPT-4 Prompt Engineering...,Create a Stock Chatbot with your own CSV Data ...,Understanding Google’s GPT Killer- The Pathway...,"Don’t use loc/iloc with Loops In Python, Inste...",Want to be Rich? DON’T Start a Side Hustle. Wh...,"Forget Prompt Engineering, ChatGPT Can Write P...",My magical first job as a self-taught software...,100x Faster — Scaling Your RAG App for Billion...,"Demonstrate, Search, Predict (DSP) for LLMs Th...",Midjourney V6 New Prompting Technique — Introd...,Is Mamba the End of ChatGPT As We Know It? The...,320+ Python and Data Science Tips — Covering P...,I Built The 5 Income Streams Every Writer Shou...,Why You Should Pay Attention to Perplexity AI ...,Text Embeddings: Comprehensive Guide Evolution...,You’re Not The Only One Feeling AI Fatigue (Or...
avg_claps,5934.0,680.5,256.3,998.6,43.6,1201.8,1732.2,715.0,180.1,856.6,3523.5,675.2,1266.9,890.3,778.6,825.6
avg_voters,1202.0,161.9,48.8,276.9,5.9,213.7,462.0,143.5,19.4,108.7,689.9,181.1,148.6,147.4,168.1,149.2
avg_word_count,4352.5,2127.1,2117.3,891.9,787.8,1247.1,969.9,3552.4,1018.5,1393.1,1477.2,1256.9,1878.6,829.4,4636.3,1007.3


In [37]:
# TF-IDF over each writer's combined text, with English stop words removed and the
# vocabulary capped at 1000 terms.
# NOTE: this rebinds `tfidf_vectorizer`, replacing the article-level vectorizer fitted earlier.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# Fit and transform the 'concatenated_info' column
tfidf_writer_matrix = tfidf_vectorizer.fit_transform(writers['concatenated_info'])

#### now encode the followers

In [38]:
# `writers['followers']` holds one list of follower IDs per writer.
# Collect the distinct IDs across all writers into a list used for column indexing.
# Sorting gives every follower the same column position on every run; iterating a set
# of strings has no guaranteed order from one kernel session to the next.
all_followers = sorted(set().union(*writers['followers']))

In [39]:
def encode_followers(followers, follower_universe=None):
    """Encode one writer's followers as a 0/1 indicator row.

    Parameters:
    - followers: iterable of follower IDs for a single writer.
    - follower_universe: ordered sequence of all known follower IDs, one per output
      column; defaults to the notebook-level ``all_followers`` list.

    Returns:
    - 1-D numpy float array of length ``len(follower_universe)`` holding 1 at the
      position of every follower of the writer and 0 elsewhere. IDs that are not in
      the universe are ignored.
    """
    if follower_universe is None:
        follower_universe = all_followers

    # Map each ID to its column once instead of scanning the list for every follower
    # (`in` plus `list.index` are two linear scans each).
    # setdefault keeps the first position of a repeated ID, like list.index does.
    position_of = {}
    for position, follower_id in enumerate(follower_universe):
        position_of.setdefault(follower_id, position)

    # Initialize a row with zeros
    row = np.zeros(len(follower_universe))
    # For each follower of the writer, set the corresponding column to 1
    for follower in followers:
        position = position_of.get(follower)
        if position is not None:
            row[position] = 1
    return row

# Encode every writer's follower list as one dense 0/1 row and stack the rows into a
# 2-D array (one row per writer, one column per known follower).
follower_matrix = np.array([encode_followers(follower_ids) for follower_ids in writers['followers']])

# Keep a CSR copy of the array to save memory.
follower_sparse_matrix = sparse.csr_matrix(follower_matrix)
follower_sparse_matrix

<16x4991 sparse matrix of type '<class 'numpy.float64'>'
	with 7594 stored elements in Compressed Sparse Row format>

In [41]:
# Convert 'is_book_author' from boolean to int so it can sit alongside the numeric features
writers['is_book_author'] = writers['is_book_author'].astype(int)

# Select numerical features
numerical_features = writers[['avg_claps', 'avg_voters', 'avg_word_count', 'avg_responses_count', 
                              'avg_reading_time', 'followers_count', 'is_book_author']].values

# Normalize numerical features; axis=0 rescales each feature column (not each writer row)
normalized_numerical_features = normalize(numerical_features, axis=0)

# Convert to sparse matrix
numerical_features_sparse = csr_matrix(normalized_numerical_features)

# Stack the three feature groups side by side, one row per writer:
# writer-text TF-IDF | follower indicator columns | normalized numeric features
final_matrix = hstack([tfidf_writer_matrix, follower_sparse_matrix, numerical_features_sparse])

In [45]:
# Compute similarity scores: pairwise cosine similarity between the writers' combined
# feature rows (entry [i, j] compares writer row i with writer row j)
writer_similarity_scores = cosine_similarity(final_matrix, final_matrix)

writer_similarity_scores.round(3)

array([[1.   , 0.014, 0.012, 0.006, 0.027, 0.082, 0.01 , 0.006, 0.007,
        0.003, 0.007, 0.01 , 0.003, 0.005, 0.025, 0.005],
       [0.014, 1.   , 0.138, 0.155, 0.021, 0.055, 0.212, 0.062, 0.065,
        0.038, 0.055, 0.109, 0.03 , 0.062, 0.127, 0.055],
       [0.012, 0.138, 1.   , 0.146, 0.015, 0.046, 0.095, 0.119, 0.19 ,
        0.038, 0.065, 0.235, 0.06 , 0.056, 0.128, 0.096],
       [0.006, 0.155, 0.146, 1.   , 0.034, 0.042, 0.236, 0.381, 0.131,
        0.109, 0.178, 0.179, 0.1  , 0.163, 0.152, 0.115],
       [0.027, 0.021, 0.015, 0.034, 1.   , 0.057, 0.008, 0.05 , 0.034,
        0.017, 0.015, 0.006, 0.016, 0.017, 0.05 , 0.017],
       [0.082, 0.055, 0.046, 0.042, 0.057, 1.   , 0.029, 0.04 , 0.052,
        0.033, 0.028, 0.035, 0.023, 0.026, 0.04 , 0.018],
       [0.01 , 0.212, 0.095, 0.236, 0.008, 0.029, 1.   , 0.079, 0.049,
        0.057, 0.082, 0.118, 0.045, 0.096, 0.113, 0.045],
       [0.006, 0.062, 0.119, 0.381, 0.05 , 0.04 , 0.079, 1.   , 0.18 ,
        0.141, 0.147, 0.13

In [46]:
def find_top_similar_writers(writer_id, writers_df, similarity_scores, top_n=3):
    """Return the IDs of the writers most similar to ``writer_id``.

    Parameters:
    - writer_id: value to look up in ``writers_df['id']``.
    - writers_df: DataFrame with an 'id' column; row i must correspond to row/column i
      of ``similarity_scores``.
    - similarity_scores: square matrix of pairwise writer similarities.
    - top_n: maximum number of similar writers to return (default 3).

    Returns:
    - numpy array of writer IDs ordered from most to least similar, excluding the
      queried writer.

    Raises:
    - IndexError: if ``writer_id`` does not occur in ``writers_df['id']``.
    """
    # Locate the writer by row position (not index label): the similarity matrix is
    # positional, so a label would select the wrong row on a frame without a 0..n-1 index.
    matches = np.flatnonzero((writers_df['id'] == writer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"writer id {writer_id!r} not found in writers_df['id']")
    writer_pos = int(matches[0])

    # Get the similarity scores for this writer against all others
    writer_similarity_scores = np.asarray(similarity_scores[writer_pos])

    top_n = max(int(top_n), 0)
    # Take top_n + 1 candidates (highest first) so that top_n remain once the writer's
    # own entry is removed.
    top_indices = np.argsort(writer_similarity_scores)[-(top_n + 1):][::-1]

    # Exclude the writer itself from the top indices
    top_similar_indices = [idx for idx in top_indices if idx != writer_pos][:top_n]

    # Get the IDs of the most similar writers
    return writers_df.iloc[top_similar_indices]['id'].values

# Example: the writers most similar to writer 'fca9db1c7da0'.
find_top_similar_writers('fca9db1c7da0', writers, writer_similarity_scores)

array(['4beacba7dc8a', '8a910484fe84', '15a29a4fc6ad'], dtype=object)

## Recommend writers to followers based on the behavior of similar followers

In [64]:
# Turn the follower frame into (follower, followed writers) pairs: each column of
# `followers` is one follower, and each value in that column is paired with the column label.
data_list = [
    (follower_label, followed_entry)
    for follower_label, follower_column in followers.items()
    for followed_entry in follower_column
]

# Long-format table with one row per (follower, writer) pair; explode unpacks the
# list-valued 'Writer' entries into separate rows.
df = (
    pd.DataFrame(data_list, columns=['Follower', 'Writer'])
    .explode('Writer')
    .reset_index(drop=True)
)

# Follower x writer matrix: each cell counts the (follower, writer) rows, 0 where there are none.
binary_matrix = df.pivot_table(index='Follower', columns='Writer', aggfunc='size', fill_value=0)

# Cosine similarity between followers (rows of the matrix).
similarity_matrix = cosine_similarity(binary_matrix)

# Label rows and columns with follower IDs for easier lookup.
follower_labels = binary_matrix.index
similarity_df = pd.DataFrame(similarity_matrix, index=follower_labels, columns=follower_labels)

In [65]:
def recommend_writers_for_follower(follower_id, binary_matrix, similarity_df, top_n=3):
    """Recommend writers to a follower based on what the most similar followers follow.

    Parameters:
    - follower_id: label of the target follower; must be a row label of
      ``binary_matrix`` and a column label of ``similarity_df``.
    - binary_matrix: follower x writer DataFrame, non-zero where the follower follows
      the writer.
    - similarity_df: follower x follower similarity DataFrame.
    - top_n: number of similar followers to consult and the maximum number of writers
      to return.

    Returns:
    - List of up to ``top_n`` writer labels, ordered by how many of the similar
      followers follow them. Writers the target already follows and writers that none
      of the similar followers follow are left out, so the list may be shorter than
      ``top_n`` or empty.
    """
    # Remove the target explicitly instead of assuming it sorts first: another follower
    # with an identical follow set ties at similarity 1.0 and may be ordered ahead of it.
    neighbour_scores = similarity_df[follower_id].drop(labels=follower_id, errors='ignore')

    # Find the top N most similar followers (mergesort is stable, so ties keep a
    # reproducible order)
    most_similar_followers = (
        neighbour_scores.sort_values(ascending=False, kind='mergesort').index[:top_n]
    )

    # Aggregate the writers followed by these similar followers
    writer_support = binary_matrix.loc[most_similar_followers].sum()

    # Exclude writers the target follower already follows, and writers with no support:
    # a writer none of the similar followers follow is not a recommendation.
    already_followed = binary_matrix.loc[follower_id]
    candidates = writer_support[(already_followed == 0) & (writer_support > 0)]

    ranked = candidates.sort_values(ascending=False, kind='mergesort')
    return ranked.head(top_n).index.tolist()

# Example usage:
# The ID must be a row label of `binary_matrix` (and a column of `similarity_df`),
# because the function looks it up in both.
follower_id = '0037690620ed'  # Replace with the actual follower ID
recommendations = recommend_writers_for_follower(follower_id, binary_matrix, similarity_df)
print(f"Recommended writers for {follower_id}: {recommendations}")

Recommended writers for 0037690620ed: ['14176fcb5743', '15a29a4fc6ad', '37a2cbe8bd15']
