In [None]:
import pandas as pd
import pathlib
from collections import Counter, OrderedDict
from difflib import SequenceMatcher as SM
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
from pywaffle import Waffle



In [None]:
# Initialize directories
# in_dir holds the input CSV files read by load_data; out_dir receives the saved PDF figures.
in_dir = pathlib.Path('./in/')
out_dir = pathlib.Path('./out/')
# Create the output folder (and any missing parents) up front; exist_ok makes re-running this cell safe.
out_dir.mkdir(parents=True, exist_ok=True)

# Set seaborn style
sns.set(style="whitegrid")

In [None]:
def load_data(data_dir=None):
    """Load the initial and snowballed paper lists and keep only relevant papers.

    Parameters
    ----------
    data_dir : path-like, optional
        Folder containing ``initial_papers.csv`` and ``snowball_papers.csv``.
        Defaults to the notebook-level ``in_dir``.

    Returns
    -------
    pandas.DataFrame
        Both CSVs stacked (initial first), with an added ``source`` column
        ('initial' or 'snowball'), the excluded rows removed, and
        ``Publication Year`` cast to ``int32``. The index is the position in
        the concatenated frame *before* filtering, so it may contain gaps.
    """
    data_dir = in_dir if data_dir is None else pathlib.Path(data_dir)

    # Values of the 'Relevant' column that mark a paper as excluded.
    # Anything else (including a missing value) is kept.
    excluded_labels = (
        'NO',
        'score too low',
        'not sure',
        'exclusion criteria',
        'Not accessible',
        'not included',
    )

    # Fetch papers
    frames = []
    for source, csv_name in (('initial', 'initial_papers.csv'),
                             ('snowball', 'snowball_papers.csv')):
        frame = pd.read_csv(data_dir / csv_name)
        frame['source'] = source
        frames.append(frame)
    papers = pd.concat(frames, ignore_index=True)

    # Filter out irrelevant papers. The explicit copy makes the result an
    # independent frame, so the column assignment below does not trigger
    # pandas' SettingWithCopyWarning.
    papers = papers[~papers['Relevant'].isin(excluded_labels)].copy()

    # Change columns' type
    papers['Publication Year'] = papers['Publication Year'].astype('int32')

    return papers

# Notebook-level frame of relevant papers; the venue helpers below read it as a global.
df = load_data()

### Conferences

In [None]:
def detect_potential_duplicates_venue_name(venue_names=None, threshold=.75):
    """Find pairs of distinct venue names that look like spelling variants.

    Every unordered pair of distinct names is compared with
    ``difflib.SequenceMatcher``; a pair is reported when its similarity
    ratio is strictly greater than ``threshold``.

    Parameters
    ----------
    venue_names : iterable of str, optional
        Names to compare. Defaults to the ``'venue name'`` column of the
        notebook-level ``df``.
    threshold : float, optional
        Similarity ratio a pair must exceed to be reported (default 0.75).

    Returns
    -------
    list of tuple(str, str)
        Suspicious pairs, ordered by first appearance of their members.
    """
    if venue_names is None:
        raw_names = df['venue name']
    else:
        raw_names = pd.Series(list(venue_names), dtype=object)

    # Missing venues are float NaN; SequenceMatcher cannot iterate over a
    # float and would raise TypeError, so drop them before comparing.
    names = raw_names.dropna().unique().tolist()

    potential_duplicates = []
    for position, first in enumerate(names):
        # Only look at names after `first` so each pair is tested once.
        for second in names[position + 1:]:
            if SM(None, first, second).ratio() > threshold:
                potential_duplicates.append((first, second))
    return potential_duplicates

# potential_duplicates = detect_potential_duplicates_venue_name()
# potential_duplicates

In [None]:
def get_venue_most_than_one():
    """
    Returns an OrderedDict mapping venue name -> number of papers, restricted
    to venues that appear more than once in the notebook-level `df`.
    Entries are ordered from the most to the least frequent venue.
    """
    ranked_venues = Counter(df['venue name']).most_common()
    return OrderedDict(
        (venue, n_papers) for venue, n_papers in ranked_venues if n_papers > 1
    )

# Venue -> paper count for venues with more than one paper; passed to plot_n_papers_venues below.
d = get_venue_most_than_one()

In [None]:
def plot_n_papers_venues(d, file_name='venues', exclude=('arxiv', 'proquest')):
    """Draw a bar chart of the number of publications per venue.

    Parameters
    ----------
    d : mapping
        Venue name -> number of publications. It is not modified.
    file_name : str or None, optional
        Stem of the PDF written to ``out_dir``. A falsy value skips saving.
    exclude : iterable of str, optional
        Venue names left out of the chart. Names that are not present in
        ``d`` are ignored.
    """
    skipped = set(exclude)
    # Build a filtered copy instead of deleting keys: `del` raised KeyError
    # whenever an excluded venue was missing from `d`.
    counts = {venue: n_papers for venue, n_papers in d.items() if venue not in skipped}

    ax = sns.barplot(x=list(counts.keys()), y=list(counts.values()), color=sns.color_palette('colorblind')[0])
    ax.set_ylabel('# publications')

    # Venue names are long; slant them and anchor on the right end so they
    # line up with their bars.
    plt.xticks(rotation=45, ha='right')
    if file_name:
        plt.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')
# Draw the venue chart from `d`; with the default file_name it is saved as venues.pdf in out_dir.
plot_n_papers_venues(d)

### Number of papers per year


In [None]:
def plot_papers_per_year(df, file_name='years'):
    """Draw a count plot of papers per value of the 'Publication Year' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Publication Year' column.
    file_name : str or None, optional
        Stem of the PDF written to ``out_dir``. A falsy value skips saving.
    """
    bar_color = sns.color_palette('colorblind')[0]
    axes = sns.countplot(data=df, x="Publication Year", color=bar_color)
    axes.set_ylabel('# papers')
    axes.set_xlabel('year')

    if not file_name:
        return
    plt.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')

# Draw the per-year chart; with the default file_name it is saved as years.pdf in out_dir.
plot_papers_per_year(df)

### Authors' affiliation 

In [None]:
def plot_authors_affiliation(df, file_name="affiliation"):
    """Draw a donut chart of the values in the "Author's affiliation" column.

    Each wedge is labelled with the affiliation and its share of all rows,
    rounded to a whole percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an "Author's affiliation" column.
    file_name : str or None, optional
        Stem of the PDF written to ``out_dir``. A falsy value skips saving.
    """
    counts = Counter(df["Author's affiliation"])
    sizes = list(counts.values())
    total = sum(sizes)
    # Wedge caption: affiliation on the first line, percentage on the second.
    captions = [f'{name}\n{(n_rows/total)*100:.0f}%' for name, n_rows in counts.items()]

    # Without autopct, plt.pie returns (wedges, label texts).
    _, texts = plt.pie(sizes, labels=captions, colors = sns.color_palette('colorblind'), startangle=90, labeldistance=1.35)

    # Centre every caption horizontally; the first caption is anchored at its
    # bottom edge, all the others at their top edge.
    for position, text in enumerate(texts):
        text.set_horizontalalignment('center')
        text.set_verticalalignment('bottom' if position == 0 else 'top')

    # A white disc over the centre turns the pie into a donut.
    hole = plt.Circle((0, 0), 0.65, color='white')
    plt.gcf().gca().add_artist(hole)

    if file_name:
        plt.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')

# Draw the affiliation donut; with the default file_name it is saved as affiliation.pdf in out_dir.
plot_authors_affiliation(df)


### Per cleaning activity

In [None]:
def generate_waffle_plot(df, col_name, file_name=None):
    """Draw a waffle chart of how often each value of ``df[col_name]`` occurs.

    Missing values are ignored. Categories are ordered from most to least
    frequent, and each legend entry shows the category with its share of the
    non-missing rows, rounded to a whole percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col_name : str
        Column whose values are counted.
    file_name : str or None, optional
        Stem of the PDF written to ``out_dir``. A falsy value skips saving.
    """
    cnt = OrderedDict(Counter(df[col_name].dropna().tolist()).most_common())
    tot = sum(cnt.values())
    labels = [f'{category} ({100*(count/tot):.0f}%)' for category, count in cnt.items()]

    # Ask seaborn for exactly one colour per category. Slicing the default
    # palette yields at most 10 colours, which no longer matches the number
    # of values once there are more than 10 categories; with n_colors the
    # palette is cycled instead.
    colors = list(sns.color_palette('colorblind', n_colors=len(cnt)))

    plt.figure(
        FigureClass=Waffle,
        columns=15,
        values=cnt,
        labels=labels,
        colors=colors,
        legend={'loc': 'lower center', 'bbox_to_anchor': (0.5, -0.5), 'fontsize':11, 'ncol': 2,},
        icons='book', font_size=25,
        icon_legend=True,
        block_arranging_style='snake'
    )

    if file_name:
        plt.savefig(out_dir/f'{file_name}.pdf', bbox_inches='tight')

def plot_cleaning_activity(df, file_name='task'):
    """Draw the waffle chart of papers per cleaning activity.

    The raw values of the 'task' column are first renamed to their display
    labels (the three error detection/repair variants all become
    'Feature Cleaning'); values without a display label are kept as they are.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'task' column.
    file_name : str or None, optional
        Forwarded to ``generate_waffle_plot``.
    """
    # Raw annotation -> label shown in the figure.
    display_names = {
        "error detection/repair": 'Feature Cleaning',
        "error detection": 'Feature Cleaning',
        "error repair": 'Feature Cleaning',
        "mislabel correction": 'Label Cleaning',
        "entity matching / duplicate removal": 'Entity Matching',
        "outliers detection": 'Outlier Detection',
        "imputation": 'Imputation',
        "more-than-one": 'Combined',
        "holistic": 'Holistic',
    }

    # Work on a copy so the caller's frame keeps the raw annotations.
    relabelled = df.copy()
    relabelled['task'] = relabelled['task'].replace(display_names)

    generate_waffle_plot(relabelled, 'task', file_name)

# Draw the cleaning-activity waffle chart; with the default file_name it is saved as task.pdf in out_dir.
plot_cleaning_activity(df)