In [None]:
import numpy as np
import pandas as pd
import os
from joblib import Parallel, delayed
from tqdm import tqdm
import spacy
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
# Similarity backend switch: the cell guarded by `if use_cosine:` scores label pairs
# with cosim(), the cell guarded by `if not use_cosine:` uses lemma substring containment.
use_cosine = False

In [None]:
# Training annotations; `labels` is the sorted list of distinct cleaned labels.
df = pd.read_csv("../input/coleridgeinitiative-show-us-the-data/train.csv")
labels = sorted(df["cleaned_label"].unique())

***
# Cosine Similarity Operation

In [None]:
# Loaded spaCy pipelines keyed by model name, so repeated cosim() calls reuse one
# pipeline instead of re-reading the model on every call.
# NOTE(review): under joblib's process-based workers this cache is not shared with
# the notebook process -- confirm how much reuse the workers actually get.
_COSIM_MODELS = {}


def cosim(doc1, doc2, nlp=None):
    """
    Similarity of two texts as reported by ``similarity`` on the processed docs.

    Args:
        doc1: str
        doc2: str
        nlp: optional callable that turns a str into an object exposing
            ``.similarity(other)`` (e.g. an already loaded spaCy pipeline).
            When omitted, "en_core_web_lg" is loaded on first use and cached.
    Returns:
        sim: float
    """
    if nlp is None:
        model_name = "en_core_web_lg"  # make sure to use larger package!
        # reference: https://github.com/explosion/spaCy/issues/3552
        nlp = _COSIM_MODELS.get(model_name)
        if nlp is None:
            # Loading is the expensive step; do it once and keep the pipeline.
            nlp = _COSIM_MODELS[model_name] = spacy.load(model_name)

    return nlp(doc1).similarity(nlp(doc2))

In [None]:
if use_cosine:
    is_single = False
    # Minimum similarity for two labels to be linked in the graph.
    # TODO(review): 0.9 is a placeholder -- tune it on the actual label set.
    cosine_threshold = 0.9

    # Both branches enumerate pairs row-major (doc1 outer, doc2 inner), so that
    # row i of the reshaped matrix belongs to labels[i]; tqdm wraps the outer
    # iterable only, giving a single progress bar.
    if is_single:
        outs = [cosim(doc1, doc2) for doc1 in tqdm(labels) for doc2 in labels]
    else:
        outs = Parallel(n_jobs=-1)(
            delayed(cosim)(doc1, doc2) for doc1 in tqdm(labels) for doc2 in labels
        )

    sims = np.array(outs).reshape((len(labels), len(labels)))
    # Later cells index `labels` with rows of `outs`, which requires a boolean
    # mask; a raw float similarity matrix cannot be used as an index.
    outs = sims >= cosine_threshold
    np.fill_diagonal(outs, False)  # a label is never its own neighbour

***
# Simple String Operation

In [None]:
if not use_cosine:
    # The parser and ner components are switched off at load time.
    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

    # Lemmatization: each label becomes the space-joined lemmas of its tokens.
    # reference: https://stackoverflow.com/questions/38763007/how-to-use-spacy-lemmatizer-to-get-a-word-into-basic-form/48948642
    lab2lem = {lab: " ".join(token.lemma_ for token in nlp(lab)) for lab in labels}
    lemmas = [lab2lem[label] for label in labels]

    # outs[i, j] is True when lemma i occurs as a substring of lemma j.
    outs = np.array(
        [[needle in haystack for haystack in lemmas] for needle in lemmas]
    ).reshape((len(lemmas), len(lemmas)))
    np.fill_diagonal(outs, False)  # ignore the trivial self-match

***
# Graph

In [None]:
# Adjacency lists of the label graph: node -> list of neighbour nodes.
graph = defaultdict(list)


def addEdge(graph, u, v):
    """Append v to the neighbour list stored under u in `graph` (mutated in place)."""
    neighbours = graph[u]
    neighbours.append(v)

# Flatten an adjacency mapping into a list of (node, neighbour) pairs.
def generate_edges(graph):
    """
    Args:
        graph: mapping from node to an iterable of its neighbours
    Returns:
        edges: list of (node, neighbour) tuples, in the mapping's iteration order
    """
    return [(node, neighbour) for node in graph for neighbour in graph[node]]

In [None]:
# Indexing `labels` with a row of `outs` below needs an ndarray rather than a list.
labels = np.array(labels)

# Link every label to the labels selected by its row of `outs`.
for i, u in enumerate(labels):
    for v in labels[outs[i]]:
        addEdge(graph, u, v)

edges = generate_edges(graph)

# Undirected graph over the collected (label, related label) pairs.
G = nx.Graph()
G.add_edges_from(edges)

In [None]:
#nx.draw(G, with_labels=True)

In [None]:
# for cc in nx.connected_components(G):
#     print(cc)

In [None]:
# Map every label to a category name shared by its whole connected component.
mycat = dict()
for label in labels:
    if label in G:
        # The component comes back as a set. Sorting its members gives every
        # label of the component exactly the same name (and the same name on
        # every run); joining the raw set could order the members differently
        # per label and silently split one component into several categories.
        cat = "/".join(sorted(nx.node_connected_component(G, label)))
    else:
        # Labels that never received an edge are not nodes of G; each one is
        # its own category.
        cat = label
    mycat[label] = cat

***
# CV

In [None]:
# Look up each row's category from its cleaned label (KeyError on an unknown label).
df["pub_category"] = df["cleaned_label"].apply(mycat.__getitem__)

In [None]:
def get_cv(dataset, num_splits=None, col_target=None, col_group=None):
    """
    Assign every row of `dataset` to one GroupKFold test fold.

    Args:
        dataset: pd.DataFrame
        num_splits: int, forwarded to GroupKFold(n_splits=...)
        col_target: str, column passed to the splitter as y
        col_group: str, column passed to the splitter as groups
    Returns:
        folds: pd.DataFrame with the rows of `dataset` plus a "fold" column
            numbered from 1, concatenated fold by fold with a fresh index.
            `dataset` itself is not modified.
    """
    X = dataset.index.values
    y = dataset[col_target].values
    groups = dataset[col_group].values

    group_kfold = GroupKFold(n_splits=num_splits)

    # The splitter yields row positions, so select with .iloc: matching on index
    # values would pick up extra rows whenever the index contains duplicates.
    # .assign returns a new frame, so no column is written onto a slice.
    parts = [
        dataset.iloc[test_index].assign(fold=fold_no)
        for fold_no, (_, test_index) in enumerate(
            group_kfold.split(X, y, groups), start=1
        )
    ]
    if not parts:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(parts, ignore_index=True)

In [None]:
# Five folds, grouped by pub_category.
folds = get_cv(
    df,
    num_splits=5,
    col_target="cleaned_label",
    col_group="pub_category",
)

In [None]:
# For each fold, list the categories present in the train and dev parts.
for fold_no in range(1, 6):
    in_dev = folds["fold"] == fold_no
    train = folds[~in_dev]
    dev = folds[in_dev]
    print(f"CV: {fold_no} -------------------------------------------------------------------")
    print()
    print("#### train pub_category:\n\n", train["pub_category"].unique())
    print()
    print("#### dev pub_category:\n\n", dev["pub_category"].unique())
    print()
    print()

In [None]:
# Persist the fold assignment as a pickle file in the current working directory.
folds.to_pickle("./folds_pubcat.pkl")