In [1]:
import numpy as np
import pandas as pd
import re
import spacy
import string

from datetime import datetime
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [1]:
from dotenv import load_dotenv
import requests
import os

In [2]:
load_dotenv()
MONGO_CONNECTION_STRING = os.getenv("MONGO_CONNECTION_STRING")

In [2]:
# Constants
URI = MONGO_CONNECTION_STRING
DB_NAME = "medium_database"
COLLECTION_NAME = "writer_information"

In [3]:
# Get the desired collection.
client = MongoClient(URI, server_api=ServerApi('1'))
db = client[DB_NAME]
writer_info = db[COLLECTION_NAME]

# Prepare the raw dataset.
dataset = {"followers_count": [],
            "publication_following_count": [],
            "has_twitter_username": [],  # convert from string to bool
            "is_writer_program_enrolled": [],
            "allow_notes": [],
            "medium_member_at": [],  # need to calculate the time later
            "is_book_author": [],
            "title": [],  # need to do NLP later
            "subtitle": [],  # need to do NLP later
            #"tags": [],
            #"topics": [],
            "word_count": [],
            "reading_time": [],
            "is_series": [],
            "is_shortform": [],
            "top_highlight": [],  # need to do NLP later
            "content": [],  # need to do NLP later
            "claps": [],
            "voters": [],
            "responses_count": []}

for writer in writer_info.find():
    for article in writer["top_articles"]:
        dataset["followers_count"].append(writer.get("followers_count", 0))
        dataset["publication_following_count"].append(writer.get("publication_following_count", 0))
        dataset["has_twitter_username"].append(writer.get("twitter_username", "").strip() != "")
        dataset["is_writer_program_enrolled"].append(writer.get("is_writer_program_enrolled", False))
        dataset["allow_notes"].append(writer.get("allow_notes", False))
        dataset["medium_member_at"].append(writer.get("medium_member_at", "").strip())
        dataset["is_book_author"].append(writer.get("is_book_author", False))
        dataset["title"].append(article.get("title", "").strip())
        dataset["subtitle"].append(article.get("subtitle", "").strip())
        dataset["word_count"].append(article.get("word_count", 0))
        dataset["reading_time"].append(article.get("reading_time", 0))
        dataset["is_series"].append(article.get("is_series", False))
        dataset["is_shortform"].append(article.get("is_shortform", False))
        dataset["top_highlight"].append(article.get("top_highlight", "").strip())
        dataset["claps"].append(article.get("claps", 0))
        dataset["voters"].append(article.get("voters", 0))
        dataset["responses_count"].append(article.get("responses_count", 0))

        if "content" in article:
            dataset["content"].append(article["content"].get("content", "").strip())
        else:
            dataset["content"].append("")

dataset = pd.DataFrame(dataset)
dataset.shape

(152, 18)

In [4]:
# Convert medium_member_at to medium_usage_time in days.

medium_usage_time = []
current_date = datetime.strptime("2024-02-24 16:45:00", "%Y-%m-%d %H:%M:%S")
for medium_member_at in dataset["medium_member_at"]:
    if medium_member_at == "":
        medium_usage_time.append(0)
        continue

    date = datetime.strptime(medium_member_at, "%Y-%m-%d %H:%M:%S")
    medium_usage_time.append((current_date - date).days)
dataset["medium_usage_time"] = medium_usage_time
dataset.shape

(152, 19)

In [5]:
# Do tfidf to all text fields.
nlp = spacy.load("en_core_web_sm")


def sanitize(texts):
    result = []
    for t in texts:
        text = t.lower()
        text = re.sub('[' + string.punctuation + '0-9\\r\\t\\n]', ' ', text)
        text = re.sub('\\s+', ' ', text)
        text = nlp(text)
        tokens = [words.lemma_ for words in text if not words.is_stop and len(words) >= 2]
        result.append(" ".join(tokens))
    return result


def vectorize(col_name, max_num):
    vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\b[a-zA-Z]+\b')
    contents = sanitize(dataset[col_name].values)
    tfidf_matrix = vectorizer.fit_transform(contents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_sum = np.sum(tfidf_matrix, axis=0)
    tfidf_sum_array = np.squeeze(np.asarray(tfidf_sum))
    tfidf_mapping = dict(zip(feature_names, tfidf_sum_array))
    sorted_tfidf = sorted(tfidf_mapping.items(), key=lambda x: x[1], reverse=True)
    top_words = sorted_tfidf[:max_num]
    indices = [np.where(feature_names == word)[0][0] if word in feature_names else None for word, _ in top_words]
    return tfidf_matrix.toarray()[:, indices]

In [6]:
def update_dataset_with_vector(dataset, col_name, max_num):
    dataset_copy = dataset.copy(deep=True)
    vector = vectorize(col_name, max_num)
    n_cols = vector.shape[1]
    for i in range(n_cols):
        dataset_copy[f"{col_name}_{i}"] = vector[:, i]
    return dataset_copy


dataset_final = update_dataset_with_vector(dataset, "title", 50)
dataset_final = update_dataset_with_vector(dataset_final, "subtitle", 50)
dataset_final = update_dataset_with_vector(dataset_final, "top_highlight", 50)
dataset_final = update_dataset_with_vector(dataset_final, "content", 50)
print(dataset_final.shape)

(152, 219)


In [17]:
from enum import Enum


class DataType(Enum):
    TEXT_ONLY = 0
    TEXT_EXCLUSIVE = 1
    ALL_DATA = 2


def get_X(X, data_type=DataType.ALL_DATA):
    non_text_cols = ["followers_count",
                    "publication_following_count",
                    "has_twitter_username",
                    "is_writer_program_enrolled",
                    "allow_notes",
                    "medium_usage_time",
                    "is_book_author",
                    "word_count",
                    "reading_time",
                    "is_series",
                    "is_shortform"]
    X_new = X.copy(deep=True)
    if data_type == DataType.TEXT_ONLY:
        X_new = X_new.drop(columns=non_text_cols)
    elif data_type == DataType.TEXT_EXCLUSIVE:
        X_new = X_new[non_text_cols]
    return X_new


# Try linear regression
def fit_linear_regression(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = LinearRegression()
    model.fit(X_train, y_train)
    #y_pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    print(f"Linear Regression R2: {score}")


def fit_random_forest(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    model = RandomForestRegressor(n_estimators=50, max_depth=10,ccp_alpha = 0.1, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print(f"Random Forest R2: {score}")


col_to_exclude = ["medium_member_at", "title", "subtitle", "top_highlight", "content", "claps", "voters", "responses_count"]
X = dataset_final.drop(columns=col_to_exclude)
y_claps = dataset_final["claps"]
y_voters = dataset_final["voters"]
y_responses_count = dataset_final["responses_count"]
fit_linear_regression(X, y_claps)
fit_random_forest(X, y_claps)
fit_linear_regression(X, y_voters)
fit_random_forest(X, y_voters)
fit_linear_regression(X, y_responses_count)
fit_random_forest(X, y_responses_count)

Linear Regression R2: 0.07105536043775662
Random Forest R2: 0.07725905911298092
Linear Regression R2: -0.07663117680937925
Random Forest R2: -0.17260835995974344
Linear Regression R2: -0.26205881868656955
Random Forest R2: -2.068504375642978


In [20]:
fit_linear_regression(get_X(X, DataType.TEXT_EXCLUSIVE), y_claps)
fit_linear_regression(get_X(X, DataType.TEXT_ONLY), y_claps)
fit_linear_regression(get_X(X, DataType.ALL_DATA), y_claps)
fit_random_forest(get_X(X, DataType.TEXT_EXCLUSIVE), y_claps)
fit_random_forest(get_X(X, DataType.TEXT_ONLY), y_claps)
fit_random_forest(get_X(X, DataType.ALL_DATA), y_claps)

fit_linear_regression(get_X(X, DataType.TEXT_EXCLUSIVE),y_voters)
fit_linear_regression(get_X(X, DataType.TEXT_ONLY), y_voters)
fit_linear_regression(get_X(X, DataType.ALL_DATA), y_voters)
fit_random_forest(get_X(X, DataType.TEXT_EXCLUSIVE), y_voters)
fit_random_forest(get_X(X, DataType.TEXT_ONLY), y_voters)
fit_random_forest(get_X(X, DataType.ALL_DATA), y_voters)

fit_linear_regression(get_X(X, DataType.TEXT_EXCLUSIVE),y_responses_count)
fit_linear_regression(get_X(X, DataType.TEXT_ONLY), y_responses_count)
fit_linear_regression(get_X(X, DataType.ALL_DATA), y_responses_count)
fit_random_forest(get_X(X, DataType.TEXT_EXCLUSIVE), y_responses_count)
fit_random_forest(get_X(X, DataType.TEXT_ONLY), y_responses_count)
fit_random_forest(get_X(X, DataType.ALL_DATA), y_responses_count)

Linear Regression R2: -0.09344272348701121
Linear Regression R2: -1.3020934484506022
Linear Regression R2: -1.0231194847982348
Random Forest R2: -2.09958169609855
Random Forest R2: 0.022014609427925214
Random Forest R2: 0.16549284337926506
Linear Regression R2: -0.18901840672510417
Linear Regression R2: -9.325537583965108
Linear Regression R2: -0.6500885508936134
Random Forest R2: -0.7132404489272217
Random Forest R2: 0.29584197179880234
Random Forest R2: 0.016305983264450208
Linear Regression R2: -0.022927453453716273
Linear Regression R2: -2.0220158583869976
Linear Regression R2: -0.5086904654329325
Random Forest R2: -0.8288271485451644
Random Forest R2: 0.011168421785156424
Random Forest R2: -0.7365404262372661
