# CS848: The art and science of empirical computer science
## The Visualization Project

by Richard Bai

In [80]:
#@title Select the conference venue, and (year_start, year_end] { run: "auto" }

# Notebook-wide settings, read as globals by plot_figure_s1() and plot_figure_s2().
# The year window is half-open: year_start is excluded, year_end is included.
year_start = 2016 #@param {type:"integer"}
year_end = 2021 #@param {type:"integer"}
# Venue name; compared for equality against the 'conference' column of results.csv.
conference = 'EMNLP' #@param ["ACL","EMNLP","COLING","NAACL","CoNLL", "LREC","TACL"] {type:"string"}

In [59]:
#@title Plot Github
import ipywidgets as widgets
from IPython.display import display

# plot percentage of papers with github link in its abstract
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


def get_github_percent(results, conference, year_start, year_end):
    """Return the fraction of a venue's papers whose abstract has a GitHub link.

    Args:
        results: DataFrame with at least the columns 'year', 'conference'
            and 'github'.
        conference: Venue name matched exactly against the 'conference' column.
        year_start: Exclusive lower bound on 'year'.
        year_end: Inclusive upper bound on 'year'.

    Returns:
        ``with_link / (with_link + without_link)`` as a float in [0, 1], or 0
        when no row in the window has 'github' equal to True or False. Rows
        whose 'github' value is neither (e.g. missing) are left out of both
        the numerator and the denominator.
    """
    in_window = (
        (results['year'] > year_start)
        & (results['year'] <= year_end)
        & (results['conference'] == conference)
    )
    # Count rows directly. Adding the two filtered frames together and taking
    # len() of the result only produced the right number through index
    # alignment, and it fails when the index contains duplicate labels.
    with_github = int((in_window & (results['github'] == True)).sum())
    without_github = int((in_window & (results['github'] == False)).sum())

    total = with_github + without_github
    if total == 0:
        return 0
    return with_github / total


def plot_figure_s1():
    """Draw a bar chart of the yearly share of papers with a code link.

    Reads the notebook-level ``conference``, ``year_start`` and ``year_end``
    settings and reloads ``results.csv`` on every call. One bar is drawn per
    year that occurs in the data for the selected conference and satisfies
    ``year_start < year <= year_end``; the bar height comes from
    ``get_github_percent`` for that single year.
    """
    results = pd.read_csv('results.csv')

    sns.set_style("whitegrid")
    sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 2.5})
    # Create the figure explicitly. A leading plt.clf() would first create an
    # empty figure when none is open, and plt.show() would then render that
    # blank figure as well.
    fig, ax = plt.subplots(figsize=(10, 5))

    ax.set_title('Percentage of Papers with Code Link for {}'.format(conference))
    ax.set_xlabel('Year')
    ax.set_ylabel('Percentage')

    avaliable_years = results[results['conference'] == conference]['year'].unique()
    # unique() keeps first-seen order, so sort for a stable left-to-right axis.
    target_years = sorted(
        year for year in avaliable_years if year > year_start and year <= year_end
    )
    percentages = [
        get_github_percent(results, conference, year - 1, year)
        for year in target_years
    ]

    ax.bar(target_years, percentages, width=0.5, color='blue')
    ax.set_xticks(target_years)
    # Bars are centred on their year; pad by half a year so the bar at
    # year_end is not cut in half by the axis limit.
    ax.set_xlim(year_start + 0.5, year_end + 0.5)
    # Keep the 0-0.3 scale for comparability, but grow it when a bar would
    # otherwise be clipped.
    tallest = max(percentages, default=0)
    ax.set_ylim(0, 0.3 if tallest <= 0.3 else min(1.0, tallest * 1.1))
    plt.show()

button = widgets.Button(description="Plot Github")
output = widgets.Output()
def on_button_clicked(b, _output=output):
  """Render the GitHub-link bar chart into this cell's Output widget.

  ``_output`` is bound as a default argument at definition time. The name
  ``output`` is reassigned by a later cell, and a plain global lookup at
  click time would send this plot to that other cell's widget.
  """
  # Replace the previous plot instead of stacking a new one on every click.
  _output.clear_output(wait=True)
  with _output:
    plot_figure_s1()

button.on_click(on_button_clicked)
display(button, output)

Button(description='Plot Github', style=ButtonStyle())

Output()

In [76]:
#@title Plot WordCloud
import os

from os import path
from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# Generic words excluded from the title word clouds, in addition to the
# default stop words shipped with the wordcloud package.
stopwords_sa = [
    "",
    "network",
    "networks",
    "via",
    "using",
    "based",
    "towards",
    "text",
    "natural",
    "models",
    "approach",
    "improving",
    "analysis",
    "data",
    "predicting",
    "tasks",
    "corpus",
    "parsing",
    "neural",
    "deep",
    "dataset",
    "classification",
    "representation",
    "representations",
    "learning",
    "word",
    "level",
    "langauge",  # (sic) kept as-is; presumably filters a misspelling seen in titles — verify
    "end",
    "training",
    "language"
]
# set.update() mutates in place and returns None, which left
# CUSTOMZED_STOPWORDS set to None. Build a new set so the constant actually
# holds the combined stop words; the library-wide STOPWORDS is no longer
# modified as a side effect.
CUSTOMZED_STOPWORDS = STOPWORDS.union(stopwords_sa)

def plot_wordcloud(text, mask=None, max_words=100, max_font_size=50, figure_size=(8.0,4.0), 
                   title = None, title_size=20, image_color=False):
    """Build a word cloud from raw text and return it without drawing anything.

    Args:
        text: Source text; it is passed through ``str()`` before generation.
        mask: Optional mask forwarded to ``WordCloud``.
        max_words: Upper limit on the number of words in the cloud.
        max_font_size: Largest font size used in the cloud.
        figure_size, title, title_size, image_color: Accepted but not used
            by this function.

    Returns:
        The generated ``WordCloud`` instance.
    """
    cloud_options = {
        'background_color': 'white',
        'max_words': max_words,
        'max_font_size': max_font_size,
        'random_state': 42,  # fixed seed so the layout is repeatable
        'width': 800,
        'height': 400,
        'mask': mask,
        'stopwords': CUSTOMZED_STOPWORDS,
    }
    cloud = WordCloud(**cloud_options)
    cloud.generate(str(text))
    return cloud

def plot_wordcloud_with_frequency_dict(frequency_dict, mask=None, max_words=100, max_font_size=50, figure_size=(12.0,6.0),
                                        title = None, title_size=20, image_color=False):
    """Draw a word cloud from a word -> weight mapping on a new figure.

    Args:
        frequency_dict: Mapping passed to ``WordCloud.generate_from_frequencies``.
        mask: Optional mask forwarded to ``WordCloud``; also handed to
            ``ImageColorGenerator`` when ``image_color`` is true.
        max_words: Upper limit on the number of words in the cloud.
        max_font_size: Largest font size used in the cloud.
        figure_size: Size given to ``plt.figure``.
        title: Figure title.
        title_size: Font size of the title.
        image_color: When true, recolor the cloud from ``mask`` and render
            with bilinear interpolation; otherwise draw it as generated with
            a red title.
    """
    cloud = WordCloud(
        background_color='white',
        max_words=max_words,
        max_font_size=max_font_size,
        random_state=42,
        width=800,
        height=400,
        mask=mask,
        stopwords=CUSTOMZED_STOPWORDS,
    )
    cloud.generate_from_frequencies(frequencies=frequency_dict)

    plt.figure(figsize=figure_size)
    if not image_color:
        plt.imshow(cloud)
        title_font = {'size': title_size,
                      'color': 'red',
                      'verticalalignment': 'bottom'}
    else:
        mask_colors = ImageColorGenerator(mask)
        plt.imshow(cloud.recolor(color_func=mask_colors), interpolation="bilinear")
        title_font = {'size': title_size,
                      'verticalalignment': 'bottom'}
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()

def get_input_str(confernce, year_start, year_end, results):
    """Concatenate the titles of one venue's papers within a year window.

    Args:
        confernce: Venue name matched exactly against the 'conference' column.
        year_start: Exclusive lower bound on 'year'.
        year_end: Inclusive upper bound on 'year'.
        results: DataFrame with the columns 'conference', 'year' and 'title'.

    Returns:
        Every matching title followed by a single space, joined in row order;
        the empty string when nothing matches. Rows with a missing title are
        skipped.
    """
    in_window = (
        (results['conference'] == confernce)
        & (results['year'] > year_start)
        & (results['year'] <= year_end)
    )
    # Drop missing titles: NaN is a float, and concatenating it with a str
    # raises TypeError.
    titles = results.loc[in_window, 'title'].dropna()
    return "".join(f"{title} " for title in titles)

def plot_figure_s2():
    """Plot year-over-year changes in title vocabulary as word clouds.

    Reads the notebook-level ``conference``, ``year_start`` and ``year_end``
    settings and reloads ``results.csv`` on every call. A word cloud is built
    from the titles of each single year from ``year_start`` to ``year_end``;
    the ``year_start`` cloud is only the baseline. For every later year two
    figures are drawn: words whose relative weight rose (or that are new)
    and words whose weight fell (or that disappeared).
    """
    results = pd.read_csv('results.csv')
    previous_wc = None
    for year in range(year_start-1, year_end):
        input_str = get_input_str(conference, year, year+1, results)
        if not input_str.strip():
            # No titles for this year (e.g. the venue did not take place).
            # WordCloud.generate raises ValueError on text without words, so
            # skip it and keep comparing against the last year that had data.
            continue
        wc = plot_wordcloud(input_str,title=f'WordCloud for {conference} {year+1}')
        if previous_wc is not None:
            more_interested = {}
            less_interested = {}
            for k, v in wc.words_.items():
                previous_v = previous_wc.words_.get(k)
                if previous_v is None:
                    more_interested[k] = v
                elif v > previous_v:
                    more_interested[k] = v - previous_v
                elif v < previous_v:
                    # Unchanged weights go in neither bucket; they used to
                    # land in "less popular" with a difference of 0.
                    less_interested[k] = previous_v - v
            for k, v in previous_wc.words_.items():
                if k not in wc.words_:
                    less_interested[k] = v
            # An empty frequency mapping makes WordCloud raise ValueError.
            if more_interested:
                plot_wordcloud_with_frequency_dict(more_interested, title=f'More popular in {year+1} for {conference}')
            if less_interested:
                plot_wordcloud_with_frequency_dict(less_interested, title=f'Less popular in {year+1} for {conference}')
            # Same as plot_figure_s1: show explicitly, since this runs from a
            # button callback where figures may not be rendered otherwise.
            plt.show()
        previous_wc = wc



button = widgets.Button(description="Plot WordCloud")
output = widgets.Output()
def on_button_clicked(b, _output=output):
  """Render the word-cloud figures into this cell's Output widget.

  ``_output`` is bound as a default argument at definition time. Another
  cell assigns the same global name ``output``, and a plain global lookup at
  click time would follow whichever cell ran last.
  """
  # Replace the previous figures instead of stacking new ones on every click.
  _output.clear_output(wait=True)
  with _output:
    plot_figure_s2()

button.on_click(on_button_clicked)
display(button, output)

Button(description='Plot WordCloud', style=ButtonStyle())

Output()