In [None]:
from datetime import datetime as dt
from datetime import timedelta 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Read in the CSV files as DataFrames

In [None]:
# Load the post and comment tables; the order of the paths on the right
# matches the order of the names on the left (posts first, then comments)
pdf, cdf = (
    pd.read_csv( csv_path )
    for csv_path in (
        '/kaggle/input/win-network/posts.csv',
        '/kaggle/input/win-network/comments.csv' ) )

# Distinct values of the comment table's `site` column, in sorted order
sites = sorted( set( cdf[ 'site' ] ) )

## Generate histograms for upvote/downvote frequency, for post and comment data of all sites

In [None]:
# Build a grid of axes with one row per site and two columns
# (column 0: posts, column 1: comments).
# `squeeze = False` keeps `ax` two-dimensional even when there is only one
# site, so the `ax[ i, j ]` indexing below is always valid.
fig, ax = plt.subplots(
    len( sites ), 2,
    figsize = ( 16, 6 * len( sites ) ),
    squeeze = False )

# ( table, label, clipping quantile ) for each column. Posts and comments use
# different quantiles, since post upvote data has more significant outliers
# than comment upvote data.
panels = [
    ( pdf, 'post', 0.99 ),
    ( cdf, 'comment', 0.999 ) ]

# loop over sites (rows) and over post / comment data (columns)
for i, site in enumerate( sites ):
  for j, ( df, entity, quantile ) in enumerate( panels ):

    axis = ax[ i, j ]

    # set labels and title
    axis.set_xlabel( 'Number of votes' )
    axis.set_ylabel( 'Vote Frequency' )
    axis.set_title( f'{site} {entity}s' )

    # temporary DataFrame containing only the rows of this table from `site`
    site_df = df[ df[ 'site' ] == site ]

    # `sites` is derived from the comment table only, so a table may hold no
    # rows for this site. Its quantiles would then be NaN, which `np.arange`
    # cannot turn into bin edges; leave the panel empty instead of raising.
    if site_df.empty:
      continue

    # Unit-width bin edges: 0 .. upvote quantile for upvotes, and
    # -( downvote quantile ) .. 0 for downvotes (drawn on the negative side).
    # Votes beyond the quantile fall outside the edges and are not drawn.
    # NOTE(review): numpy treats the right-most bin as closed, so the last bar
    # of each histogram also counts values lying exactly on the final edge
    # (e.g. 0 downvotes land in the [ -1, 0 ] bar) - confirm this is intended.
    upvote_cap = site_df[ 'upvotes' ].quantile( quantile )
    downvote_cap = site_df[ 'downvotes' ].quantile( quantile )
    bins_up = np.arange( 0, upvote_cap + 1 )
    bins_down = np.arange( -1 * downvote_cap, 1 )

    # plot upvote histogram
    axis.hist(
      x = site_df[ 'upvotes' ],
      bins = bins_up,
      color = 'b',
      label = 'Upvotes' )

    # plot downvote histogram, mirrored onto the negative x axis
    axis.hist(
      x = -1 * site_df[ 'downvotes' ],
      bins = bins_down,
      color = 'r',
      label = 'Downvotes' )

    # apply logarithmic scale to y axis
    axis.set_yscale( 'log' )

    # let matplotlib pick the legend location ( loc = 0 is 'best' )
    axis.legend( loc = 0 )

# write the whole grid to disk, then display it
fig.savefig( 'votes.pdf', bbox_inches = 'tight' )
plt.show( )