### Set up

In [1]:
# file system navigation
from pathlib import Path

# data transformation
import pandas as pd

# nlp
import spacy
from spacy_cld import LanguageDetector

### Data preparation

In [2]:
# Read the raw article data from <working directory>/data/articles.csv.
articles = pd.read_csv(Path.cwd().joinpath("data", "articles.csv"))

In [3]:
# Size of the raw data set as (rows, columns).
articles.shape

(337, 6)

#### Find and remove duplicates based on author and title

In [4]:
# An article counts as a duplicate when both author and title repeat;
# the first occurrence of each (author, title) pair is kept.
articles = articles.drop_duplicates(
    keep="first",
    subset=["author", "title"],
)

In [5]:
# Size of the data set after removing duplicates, as (rows, columns).
articles.shape

(230, 6)

#### Convert claps to integer

In [6]:
def parse_claps(value):
    """Convert a clap count such as ``"935"`` or ``"8.3K"`` into an integer.

    Parameters
    ----------
    value : str or number
        Clap count. Strings may end in ``K`` (thousands) or ``M`` (millions),
        in either case. Values that are not strings are treated as already
        converted, so the cell can safely be executed more than once.

    Returns
    -------
    int
        The absolute number of claps.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as a number.
    """
    if not isinstance(value, str):
        # Already numeric (e.g. this cell ran before): nothing left to parse.
        return int(value)

    multipliers = {"K": 1000, "M": 1000000}
    text = value.strip()
    factor = multipliers.get(text[-1:].upper())
    if factor is None:
        return int(text)

    # round() rather than int(): a float product can land marginally below the
    # exact value (1.005 * 1000 == 1004.9999999999999) and int() would
    # truncate it to one clap too few.
    return round(float(text[:-1]) * factor)


articles["claps"] = articles["claps"].apply(parse_claps)

In [7]:
# Preview the first rows, including the converted claps column.
articles.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8300,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1400,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2800,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1300,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


#### Remove articles that are not written in English

In [8]:
# Create the language detector, load the spaCy model registered as "en" and
# append the detector to the model's pipeline. The per-language scores are
# read further down through `doc._.language_scores`.
language_detector = LanguageDetector()
nlp = spacy.load("en")
nlp.add_pipe(language_detector)

In [9]:
def get_en_score(df, col_name):
    """Return a copy of ``df`` with an additional ``en_score`` column.

    Every value of ``df[col_name]`` is run through the module-level ``nlp``
    pipeline; the score stored under ``"en"`` in the document's
    ``language_scores`` is used, or ``0.0`` when that key is absent.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col_name : str
        Name of the column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows plus the ``en_score`` column.
    """

    def _english_score(text):
        # A missing "en" entry is reported as a score of 0.0.
        return nlp(text)._.language_scores.get("en", 0.0)

    return df.assign(en_score=df[col_name].apply(_english_score))

In [10]:
# Score every article text, keep the rows whose English score exceeds 0.9
# and remove the helper column again.
unique_articles_en = (
    get_en_score(articles, "text")
    .query("en_score > 0.9")
    .drop(columns="en_score")
)

In [11]:
# Size of the data set after the English-language filter, as (rows, columns).
unique_articles_en.shape

(224, 6)

#### Prepare and save data for labeling (author, claps and link only) as CSV and for further processing as Parquet

In [12]:
# Folder for the processed data, relative to the current working directory.
LABEL_PATH = Path.cwd().joinpath("data", "processed")

In [13]:
# Write the labeling sheet (author, claps and link, sorted by claps in
# descending order) unless it already exists, so an existing file is never
# overwritten.
labeling_csv = LABEL_PATH / "articles_link_only.csv"

# is_file must be *called*: the bare bound method is always truthy, which made
# the guard permanently False and skipped the export.
if not labeling_csv.is_file():
    # Create missing parent folders and tolerate an already existing folder.
    LABEL_PATH.mkdir(parents=True, exist_ok=True)

    (
        unique_articles_en
        .sort_values(by="claps", ascending=False)
        .loc[:, ["author", "claps", "link"]]
        .to_csv(labeling_csv, sep=";")
    )

In [14]:
# Persist the de-duplicated English articles as parquet unless the file
# already exists.
unique_articles_parquet = LABEL_PATH / "unique_articles.parquet"

# is_file must be *called*: the bare bound method is always truthy, which made
# the guard permanently False and skipped the export.
if not unique_articles_parquet.is_file():
    # Make sure the target folder exists; this cell does not rely on another
    # cell having created it.
    LABEL_PATH.mkdir(parents=True, exist_ok=True)

    unique_articles_en.to_parquet(unique_articles_parquet, engine="pyarrow")

#### Add labels

In [15]:
# Read the semicolon-separated labels from <working directory>/data/labels.csv.
labels = pd.read_csv(Path.cwd().joinpath("data", "labels.csv"), sep=";")

In [16]:
# Inner join of the articles with their labels: the index of the articles is
# matched against the `id` column of the labels, and `id` is dropped afterwards.
labeled_data = pd.merge(
    unique_articles_en,
    labels,
    how="inner",
    left_index=True,
    right_on="id",
).drop(columns="id")

#### Save for further use

In [17]:
# Folder for the shared data, relative to the current working directory.
SHARING_PATH = Path.cwd().joinpath("data", "shared")

In [18]:
# Save the labeled data as parquet in the sharing folder unless the file
# already exists.
train_data_parquet = SHARING_PATH / "train_data.parquet"

# is_file must be *called*: the bare bound method is always truthy, which made
# the guard permanently False and skipped the export.
if not train_data_parquet.is_file():
    # Create missing parent folders and tolerate an already existing folder.
    SHARING_PATH.mkdir(parents=True, exist_ok=True)

    # Write to the same path the guard checks. Previously the guard looked in
    # data/shared while the file was written to data/processed.
    labeled_data.to_parquet(train_data_parquet, engine="pyarrow")