### Set up

In [None]:
# file system navigation
from pathlib import Path

# data transformation
import pandas as pd

# nlp
import spacy
from spacy_cld import LanguageDetector

### Data preparation

In [None]:
# Load the raw articles from data/articles.csv below the current working directory.
articles = pd.read_csv(Path.cwd().joinpath("data", "articles.csv"))

In [None]:
# Show (rows, columns) of the freshly loaded data.
articles.shape

#### Find and remove duplicates based on author and title

In [None]:
# Rows sharing the same (author, title) pair count as duplicates;
# only the first occurrence of each pair is kept.
articles = articles.drop_duplicates(subset=["author", "title"], keep="first")

In [None]:
# Show (rows, columns) again to see how many rows remain.
articles.shape

#### Convert claps to integer

In [None]:
# Multipliers for the abbreviated clap counts ("2.1K" -> 2100, "1.2M" -> 1200000).
_CLAP_SUFFIXES = {"K": 1_000, "M": 1_000_000}


def _parse_claps(value):
    """Convert one raw clap value to an integer.

    Accepts plain numbers ("312"), abbreviated strings ("2.1K", "1.2m",
    surrounding whitespace allowed) and values pandas has already parsed as
    numbers. Anything that cannot be interpreted (empty string, NaN, ...)
    raises ``ValueError`` so bad rows are not silently turned into a count.
    """
    if not isinstance(value, str):
        # pandas yields ints/floats when the column holds no suffixed values.
        return int(value)

    text = value.strip()
    multiplier = _CLAP_SUFFIXES.get(text[-1:].upper())
    if multiplier is None:
        return int(text)

    # round() instead of int(): the float product can land just below the
    # intended integer (e.g. 2009.9999999999998) and int() would truncate it.
    return round(float(text[:-1]) * multiplier)


articles["claps"] = articles["claps"].apply(_parse_claps)

In [None]:
# Preview the first rows to check the converted claps column.
articles.head()

#### Remove articles that are not written in English

In [None]:
# Load the spaCy English pipeline and append the spacy_cld language detector.
# get_en_score reads `doc._.language_scores`, which is presumably the
# extension registered by LanguageDetector -- confirm against spacy_cld.
# NOTE(review): the "en" shortcut and passing a component instance to
# add_pipe look like spaCy 2.x usage -- confirm the pinned spaCy version.
nlp = spacy.load("en")
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

In [None]:
def get_en_score(df, col_name):
    """Return a copy of ``df`` with an additional ``en_score`` column.

    Every value of ``df[col_name]`` is run through the module-level ``nlp``
    pipeline; ``en_score`` is the ``"en"`` entry of the resulting
    ``doc._.language_scores`` mapping, or ``0.0`` when that key is absent.
    The input frame itself is left unmodified.
    """

    def english_score(value):
        scores = nlp(value)._.language_scores
        return scores.get("en", 0.0)

    scored = df.copy()
    scored["en_score"] = scored[col_name].apply(english_score)
    return scored

In [None]:
# Score each article text, keep the rows whose English score is above 0.9
# and remove the helper column again.
unique_articles_en = (
    articles
    .pipe(get_en_score, "text")
    .query("en_score > 0.9")
    .drop("en_score", axis=1)
)

In [None]:
# Show (rows, columns) of the English-only data.
unique_articles_en.shape

In [None]:
# Display the 100 articles with the most claps.
unique_articles_en.sort_values("claps", ascending=False).head(100)

#### Prepare and save data for labeling (author, claps and link only) as CSV, and for further processing as Parquet

In [None]:
# Labeling export: author, claps and link only, most-clapped first, written
# as a semicolon-separated CSV. The DataFrame index is written too (to_csv
# default), so each row keeps its original index value in the file.
(
    unique_articles_en
    .sort_values(by="claps", ascending=False)
    .loc[:, ["author", "claps", "link"]]
    .to_csv(
        Path.cwd().joinpath("data", "processed", "articles_link_only.csv"),
        sep=";",
    )
)

In [None]:
# Persist the full English-only data set as Parquet for later processing.
unique_articles_en.to_parquet(
    Path.cwd().joinpath("data", "processed", "unique_articles.parquet"),
    engine="pyarrow",
)

#### Add labels

In [None]:
# Read the semicolon-separated label file from data/labels.csv.
labels = pd.read_csv(Path.cwd().joinpath("data", "labels.csv"), sep=";")

In [None]:
# Inner-join the articles with their labels: the article index is matched
# against the `id` column of the label file, which is dropped afterwards.
# Articles without a label (and labels without an article) fall away.
labeled_data = (
    unique_articles_en
    .merge(labels, how="inner", left_index=True, right_on="id")
    .drop(columns="id")
)

In [None]:
# Preview author, claps and label of the most-clapped labeled articles.
labeled_data.sort_values("claps", ascending=False)[["author", "claps", "interesting"]].head()

In [None]:
# Store the labeled articles (all columns) as the training set.
labeled_data.to_parquet(
    Path.cwd().joinpath("data", "processed", "train_data.parquet"),
    engine="pyarrow",
)

In [None]:
# Reduced training set: article text plus label, written as CSV without
# the index column.
labeled_data_text_only = labeled_data.loc[:, ["text", "interesting"]]
labeled_data_text_only.to_csv(
    Path.cwd().joinpath("data", "processed", "train_data_text_only.csv"),
    index=False,
)

### Experimentation

#### Remove special characters

In [None]:
# Reload the English-only articles for experimentation. The data-preparation
# section stores them as data/processed/unique_articles.parquet, so read that
# Parquet file via pathlib (neither `os` nor a `wd` variable is available in
# the visible notebook).
unique_articles_en = pd.read_parquet(
    Path.cwd() / "data" / "processed" / "unique_articles.parquet",
    engine="pyarrow",
)

In [None]:
def get_raw_text(df, col_name):
    """Join all values of one text column into a single lower-cased string.

    The entries of ``df[col_name]`` are concatenated with ``"|"`` as the
    separator (missing values are left out by ``str.cat``) and the result
    is converted to lower case.
    """
    column = df.loc[:, col_name]
    joined = column.str.cat(sep="|")
    return joined.lower()

In [None]:
# Collect the distinct characters that occur in the title column.
# NOTE: get_raw_text lower-cases the text and joins rows with "|", so
# upper-case letters never appear here and "|" is part of the set (and the
# count) whenever there is more than one title, even if no title contains it.
unique_chars_title = set(get_raw_text(unique_articles_en, "title"))
print(f"There are a total of {len(unique_chars_title)} unique characters in column title:")
print(unique_chars_title)

In [None]:
# Collect the distinct characters that occur in the text column.
# NOTE: get_raw_text lower-cases the text and joins rows with "|", so
# upper-case letters never appear here and "|" is part of the set (and the
# count) whenever there is more than one text, even if no text contains it.
unique_chars_text = set(get_raw_text(unique_articles_en, "text"))
print(f"There are a total of {len(unique_chars_text)} unique characters in column text:")
print(unique_chars_text)

In [None]:
# Pick one sample value: first row, fifth column (both positional).
# NOTE(review): the variable name suggests the "text" column is intended, but
# position 4 depends on the column order of the loaded file -- confirm, or
# select the column by name instead.
text = unique_articles_en.iloc[0, 4]

In [None]:
# Run the sample through the spaCy pipeline to get an annotated Doc.
doc = nlp(text)

In [None]:
# Display the processed document.
doc

In [None]:
# Print every token with its part-of-speech tag and dependency label.
for token in doc:
    print(token.text, token.pos_, token.dep_)

In [None]:
# Print the plain token texts, one per line, to inspect the tokenization.
for token in doc:
    print(token.text)

In [None]:
# Print each named entity with its character offsets and entity label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)