### Set up

In [21]:
# file system navigation
import os

# data transformation
import pandas as pd

# nlp
import spacy
from spacy_cld import LanguageDetector

In [2]:
# Base directory for every data path in this notebook: the kernel's current
# working directory. The paths below are joined onto it, so the notebook is
# expected to be started from the project root.
wd = os.getcwd()

In [36]:
# Display the resolved base directory as a sanity check.
wd

'/home/steffen/projects/medium-classifier'

### Data preparation - do not run

In [3]:
# Load the raw articles table from data/articles.csv under the working directory.
articles_csv = os.path.join(wd, "data", "articles.csv")
articles = pd.read_csv(articles_csv)

In [4]:
# Preview the first rows of the raw table.
articles.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [5]:
# (rows, columns) of the raw table, for comparison with the de-duplicated frame.
articles.shape

(337, 6)

#### Find and remove duplicates based on author and title

In [6]:
# Keep only the first occurrence of each (author, title) pair. The explicit
# copy yields an independent frame before columns are reassigned on it below.
dedup_keys = ["author", "title"]
unique_articles = (
    articles
    .drop_duplicates(subset=dedup_keys, keep="first")
    .copy()
)

In [7]:
# (rows, columns) after de-duplication.
unique_articles.shape

(230, 6)

#### Convert claps to integer and save de-duplicated data to an intermediate file

In [8]:
def _parse_claps(value):
    """Convert a clap count such as "8.3K" or "935" to an integer.

    Parameters
    ----------
    value : str or number
        Either the raw string from the CSV (optionally ending in "K" for
        thousands) or an already-converted numeric value.

    Returns
    -------
    int
        The clap count as a plain integer.
    """
    # Already numeric: pass through so that re-running this cell on the
    # converted column does not fail on indexing into an int.
    if not isinstance(value, str):
        return int(value)

    cleaned = value.strip()
    if cleaned.endswith("K"):
        # round() instead of bare int(): the float product may land a hair
        # below the intended integer, and int() would truncate it downwards.
        return int(round(float(cleaned[:-1]) * 1000))
    return int(cleaned)


unique_articles["claps"] = unique_articles["claps"].apply(_parse_claps)

In [9]:
# Preview the frame to confirm that `claps` now holds plain integers.
unique_articles.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8300,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1400,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2800,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1300,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


#### Prepare and save data for labeling (author, claps and link only)

In [None]:
# Export author, claps and link, most-clapped first, as a ";"-separated file
# for manual labeling. The row index is written as well (no index=False);
# presumably it is the `id` that the labels file is later joined back on.
label_columns = ["author", "claps", "link"]
labeling_csv = os.path.join(wd, "data", "processed", "articles_link_only.csv")
(
    unique_articles
    .sort_values(by="claps", ascending=False)
    .loc[:, label_columns]
    .to_csv(labeling_csv, sep=";")
)

#### Remove articles that are not written in English

In [10]:
# Load the spaCy pipeline registered under the "en" shortcut name.
nlp = spacy.load("en")
# Add spacy_cld's language detector to the pipeline. get_en_score below reads
# `doc._.language_scores`, which this component is expected to populate.
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [11]:
def get_en_score(df, col_name):
    """Return a copy of ``df`` with an added ``en_score`` column.

    Each value of ``df[col_name]`` is passed through the module-level ``nlp``
    pipeline, and the "en" entry of the resulting ``doc._.language_scores``
    mapping is stored as the row's score. Rows whose mapping has no "en"
    entry receive 0.0. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column to score.
    col_name : str
        Name of the column holding the text.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus ``en_score``.
    """
    def english_score(raw_text):
        return nlp(raw_text)._.language_scores.get("en", 0.0)

    return df.assign(en_score=df[col_name].apply(english_score))

In [13]:
# Score every article text, keep the rows whose English score exceeds 0.9,
# then drop the helper column again.
scored_articles = get_en_score(unique_articles, "text")
is_english = scored_articles["en_score"] > 0.9
unique_articles_en = scored_articles.loc[is_english].drop(columns="en_score")

In [15]:
# (rows, columns) after removing non-English articles.
unique_articles_en.shape

(224, 6)

In [16]:
# Persist the de-duplicated, English-only articles as an intermediate file.
# NOTE: index=False drops the row index, which the label merge further down
# uses as the article id; a frame reloaded from this file no longer carries it.
unique_articles_csv = os.path.join(wd, "data", "processed", "unique_articles.csv")
unique_articles_en.to_csv(unique_articles_csv, index=False)

#### Add labels

In [37]:
# Load the ";"-separated labels table from data/labels.csv.
labels_csv = os.path.join(wd, "data", "labels.csv")
labels = pd.read_csv(labels_csv, sep=";")

In [54]:
# Attach the labels: a label row matches an article when its `id` equals the
# article's index value. Inner join, so articles without a label are dropped;
# the join key itself is removed afterwards.
labeled_data = (
    unique_articles_en
    .merge(labels, how="inner", left_index=True, right_on="id")
    .drop(columns="id")
)

In [56]:
# Preview the merged frame; the label columns from `labels` are appended on the right.
labeled_data.head()

Unnamed: 0,author,claps,reading_time,link,title,text,interesting
0,Justin Lee,8300,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T...",0
1,Conor Dewey,1400,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...,0
2,William Koehrsen,2800,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...,1
3,Irhum Shafkat,2000,15,https://towardsdatascience.com/intuitively-und...,Intuitively Understanding Convolutions for Dee...,The advent of powerful and versatile deep lear...,1
4,Sam Drozdov,2300,6,https://uxdesign.cc/an-intro-to-machine-learni...,An intro to Machine Learning for designers – U...,There is an ongoing debate about whether or no...,0


In [57]:
# Write the labeled articles to data/processed/train_data.csv, without the index.
train_data_csv = os.path.join(wd, "data", "processed", "train_data.csv")
labeled_data.to_csv(train_data_csv, index=False)

### Remove special characters

In [None]:
# Reload the intermediate file written during data preparation, presumably so
# this section can be run without re-executing the preparation cells above.
unique_articles_csv = os.path.join(wd, "data", "processed", "unique_articles.csv")
unique_articles_en = pd.read_csv(unique_articles_csv)

In [17]:
def get_raw_text(df, col_name, sep="|"):
    """Concatenate one string column into a single lower-cased string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to concatenate.
    col_name : str
        Name of a string column in ``df``.
    sep : str, default "|"
        Separator placed between consecutive values. It becomes part of the
        returned string, so pass ``sep=""`` when the result is used to take
        inventory of the characters that occur in the data.

    Returns
    -------
    str
        All non-missing values of the column joined by ``sep`` and
        lower-cased; missing values are omitted by ``Series.str.cat``.
    """
    joined = df.loc[:, col_name].str.cat(sep=sep)
    return joined.lower()

In [24]:
# Build the character inventory from the lower-cased titles joined WITHOUT a
# separator: joining with "|" would count "|" as a character of the data even
# if no title contains it.
titles_lower = unique_articles_en["title"].str.lower().dropna()
unique_chars_title = set("".join(titles_lower))
print(f"There are a total of {len(unique_chars_title)} unique characters in column title:")
print(unique_chars_title)

There are a total of 69 unique characters in column title:
{']', '9', 'r', 'l', '-', 'm', '̌', 'z', '“', '4', "'", '.', '5', 'i', '8', '&', 'e', 'f', '🔨', 't', '(', '—', '%', '$', 'w', '0', '7', 'p', 'c', '🔮', ' ', '’', '–', '?', ',', '‘', '👶', 'k', '̧', 'h', 'y', '!', '3', 'u', '[', 'q', ':', '2', 'g', '/', ')', '+', 's', 'a', 'j', '*', 'o', 'n', '6', '́', '#', 'x', '🤖', '”', '|', '1', 'b', 'v', 'd'}


In [25]:
# Build the character inventory from the lower-cased article texts joined
# WITHOUT a separator: joining with "|" would count "|" as a character of the
# data even if no article contains it.
texts_lower = unique_articles_en["text"].str.lower().dropna()
unique_chars_text = set("".join(texts_lower))
print(f"There are a total of {len(unique_chars_text)} unique characters in column text:")
print(unique_chars_text)

There are a total of 195 unique characters in column text:
{'🎶', ']', '🇦', 'r', '-', 'm', '±', '4', 'ع', '̃', '5', '🇺', '久', '=', '`', '_', '✖', '"', '通', 'र', 'ب', 'ð', '📕', 'p', ' ', '@', 'ᄀ', '😅', '−', '🚀', '🇮', 'ा', '👶', '̣', 'ᅥ', 'y', '}', 'с', '日', 'आ', '+', 'ل', 'ی', 'к', 'a', '*', 'ε', 'x', '|', '1', '‽', '„', 'ا', 'ᄒ', '文', '.', '普', '🤷', 'i', '️', '&', 'f', '∂', 't', '(', 'ِ', '習', '̆', 'ण', '💚', '?', 'ᅮ', ',', '‘', 'ᅡ', '̧', ';', '⁄', '👍', '3', '☞', 'g', '🏻', '≥', 's', 'j', '>', '高', 'o', 'n', '力', '♂', 'b', 'v', 'd', '🤖', '^', 'l', '\\', 'z', '😄', '\u200e', 'क', "'", '»', '🇹', 'δ', '≈', '👏', '語', 'ब', '—', '👈', '$', '0', '7', 'च', '中', 'λ', '≤', '’', 'і', 'æ', 'k', 'р', 'h', '!', 'u', '̀', 'q', ':', '~', 'ᄋ', ')', '⋆', '«', '👨', '6', '😉', '#', 'ة', '”', '😃', '9', 'у', '/', 'ي', '<', 'β', 'ّ', 'ᆨ', '©', '̂', '“', '😀', '\u200d', 'س', '8', 'e', '话', 'ł', '😆', 'σ', '%', 'ف', 'w', 'и', '☺', 'c', '👉', '–', 'π', 'γ', '💻', '學', '🎨', '[', 'ᆫ', '̈', '2', 'ह', '×', '•', '\n', 'َ', '́'

In [29]:
# Sample input for the spaCy walkthrough below: the first article's *title*.
# The column is selected by label rather than by position so that a change in
# column order cannot silently pick a different field. The variable name
# `text` is kept because the following cells refer to it.
text = unique_articles_en["title"].iloc[0]

In [31]:
# Run the sample string through the spaCy pipeline.
doc = nlp(text)

In [33]:
# Display the parsed document.
doc

Chatbots were the next big thing: what happened? – The Startup – Medium

In [32]:
# One line per token: surface text, part-of-speech tag, dependency label.
for tok in doc:
    print(tok.text, tok.pos_, tok.dep_)

Chatbots NOUN nsubj
were VERB ROOT
the DET det
next ADJ amod
big ADJ amod
thing NOUN attr
: PUNCT punct
what NOUN nsubj
happened VERB relcl
? PUNCT punct
– PUNCT ROOT
The DET det
Startup PROPN nmod
– PUNCT punct
Medium NOUN ROOT


In [34]:
# Show how the pipeline tokenised the sample: one token per line.
for tok in doc:
    print(tok.text)

Chatbots
were
the
next
big
thing
:
what
happened
?
–
The
Startup
–
Medium


In [35]:
# List the named entities found in the sample: text, character span, label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)