In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from scipy import stats
from plotly.offline import init_notebook_mode, iplot
from datetime import datetime
# Set up plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected = True presumably loads plotly.js from a CDN
# instead of embedding it in the notebook — confirm against the plotly docs.
init_notebook_mode(connected = True)

In [None]:
def read_json(path):
  """Load a questions JSON file and parse its creation dates.

  Args:
    path: Passed straight to ``pandas.read_json`` (the notebook uses file
      paths).

  Returns:
    The parsed DataFrame. Its ``creation_date`` column, interpreted as Unix
    epoch seconds, is replaced by timezone-naive UTC timestamps.
  """
  data = pd.read_json(path)
  # Vectorised conversion: yields the same naive UTC values as calling
  # datetime.utcfromtimestamp on every row, but without the per-row Python
  # call and without relying on an API deprecated since Python 3.12.
  data['creation_date'] = pd.to_datetime(data['creation_date'], unit = 's')
  return data

In [None]:
# Load each language's 2018 question dump and tag every row with its language.
python_questions = read_json('data/python_questions_2018.json').assign(language = 'Python')
cpp_questions = read_json('data/cpp_questions_2018.json').assign(language = 'C++')

# Stack both frames into one; sort = False leaves the columns unsorted
# should the two frames not share the exact same column set.
questions = pd.concat(
  [python_questions, cpp_questions],
  sort = False,
)

1. Create an overlapped histogram of both the Python and C++ data. Use the creation date of
the questions to bin your values.

In [None]:
# Split the questions by language so that each language becomes its own trace.
# Grouping by the bare column name (rather than a one-element list) keeps
# get_group() working with a plain scalar key on current pandas versions.
groups = questions.groupby('language')

data = [
  go.Histogram(
    x = groups.get_group(language).creation_date,
    name = language,
    # Semi-transparent bars so both overlaid histograms remain visible.
    opacity = 0.75,
    xbins = dict(
      start = datetime(2018, 1, 1),
      end = datetime(2018, 12, 31),
      size = 'M1',  # one bin per calendar month
    ),
  )
  for language in questions.language.unique()
]

iplot({
  'data': data,
  'layout': {
    'title': 'Questions per Month',
    # Draw the traces on top of each other; without this plotly falls back
    # to its default bar mode and places the bars side by side.
    'barmode': 'overlay',
  }
})

2. Create a boxplot that shows the question scores. Place the boxplots side by side so that the
plots can be compared.

In [None]:
# Plot the raw per-question scores, one box per language. Collapsing the
# frame to unique (score, language) pairs first would discard how often each
# score occurs and thereby distort the median and quartiles of every box.
data = go.Box(
  x = questions['language'],
  y = questions['score'],
  name = 'Score',
)

iplot({
  'data': [data],
  'layout': {'title': 'Score per Language'},
})

3. Finally, compare the `is_answered` attribute of both data sets by creating a stacked bar chart. Questions that have already been answered and those that are still pending should be displayed as separate bar series.

In [None]:
# Count the questions for every (is_answered, language) combination.
groups = (
  questions
  .groupby(['is_answered', 'language'])
  .size()
  .reset_index(name = 'count')
)

# Separate the counts of answered questions from those still pending.
answered = groups[groups['is_answered'] == True]
unanswered = groups[groups['is_answered'] == False]

# One bar series per answer state; stacking happens through the layout.
trace1 = go.Bar(x = answered['language'], y = answered['count'], name = 'Answered')
trace2 = go.Bar(x = unanswered['language'], y = unanswered['count'], name = 'Unanswered')

layout = {
  'title': 'Answered Questions per Language',
  'barmode': 'stack',
}

iplot({'data': [trace1, trace2], 'layout': layout})