In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline
# Start from the fivethirtyeight theme; the settings below are applied on top of it.
plt.style.use('fivethirtyeight')

# Colours shared by the plotting cells further down.
darkgrey = '#3A3A3A'
lightgrey = '#414141'
# First colour of the active style's colour cycle, used for the bar charts.
barblue = plt.rcParams['axes.prop_cycle'].by_key()['color'][0]

# Notebook-wide overrides of the theme defaults.
for rc_key, rc_value in {
    'axes.labelsize': 14,
    'lines.linewidth': 1.5,
    'text.color': darkgrey,
    'axes.labelcolor': darkgrey,
    'xtick.color': lightgrey,
    'ytick.color': lightgrey,
}.items():
    plt.rcParams[rc_key] = rc_value

# Your first Monte Carlo Simulation

## Your goal
You want to create a forecast with a Monte Carlo simulation based on the data gathered by your team. The data is stored in `raw.csv`.
With your simulation you want to answer the question "How many stories can we do in a given time span?"

## How this notebook is structured
This notebook gives you a structure for creating a forecast with a Monte Carlo simulation. Step by step you analyse the data and create the forecast. Each step builds on the previous one. To guide and help you, each step consists of:
* A small description on what to do in this step
* Code for visualizing the data in this step (optional to use it - but saves time)
* If you get stuck, don't worry. For each step there is a CSV with data needed for this step.

Feel free to follow the structure or find your own way!

## 1. Read and check the raw data

### Goal
This step reads the `raw.csv` file into a pandas.DataFrame and reduces the columns to the data points needed to calculate throughput (completed items per day).

Get a feeling about the data:
* What else is in the data set?
* Where could you get this data from in your project?

### Input
raw.csv

### Visualization
Output the data as a table (e.g. `pd.DataFrame.head()`)

In [None]:
# Keep only the 'Done' and 'Type' columns, parse 'Done' as dates and
# drop every row that has a missing value in either column.
kanban_data = (
    pd.read_csv('raw.csv', usecols=['Done', 'Type'], parse_dates=['Done'])
    .dropna()
)
# Show the first row as a quick sanity check of the loaded data.
kanban_data.head(1)

## 2. Calculate and check the throughput

### Goal
Calculate the throughput (items completed) per day and visualize it over time (e.g. per day or week). Does the data set look valid?

You need the throughput per day for the next step.

### Visualization
Code to create a simple plot to show datapoints over time is given. X=Date, Y=Throughput

In [None]:
# Use the DataFrame kanban_data of the previous step
# Start coding here


# Stuck? Use this to proceed to the next step: throughput_per_week = pd.read_csv('throughput_per_week.csv')

# Line chart of the throughput over time (x = Date, y = Throughput), without a legend.
ax = throughput_per_week.plot(
    x='Date',
    y='Throughput',
    figsize=(14, 3),
    linewidth=2.5,
    legend=None,
)
ax.set_title("Throughput per Week", loc='left',
             fontsize=18, fontweight='semibold')
# Faint horizontal line at zero as a baseline.
ax.axhline(y=0, color=lightgrey, alpha=.5)
ax.set_ylabel('Items Completed')
ax.set_xlabel('');

## 3. Run a Monte Carlo Simulation
### Goal
Run a Monte Carlo simulation 'how many items can we complete in X days?' with the following steps:
* Define the datapoints you want to use for the simulation (e.g. last 100 days)
* Define the number of days you want to simulate (e.g. 14 days)
* Simulate the number of days at least 10000 times by randomly picking data points for each day

The result is a distribution of how many times a number of completed items has occurred in the simulations.

### Visualization
Given: Code to create a simple bar plot to visualize the output of the simulation: X=Items Completed, Y=# of occurrences of this number of items completed

In [None]:
### SIMULATION INPUT ####
LAST_DAYS = 100         # number of most recent days of data to sample from
SIMULATION_DAYS = 14    # length of the simulated time span in days
SIMULATIONS = 10000     # number of simulation runs
###

# Start coding here, use "throughput per day" of the previous step


# Stuck? Use this to proceed to the next step: distribution = pd.read_csv('distribution.csv')

# Bar chart of the simulation result: x = Items, y = Frequency.
plt.figure(figsize=(14, 3))
ax = sns.barplot(data=distribution, x='Items', y='Frequency', color=barblue)
ax.set_title(
    f"Distribution of Monte Carlo Simulation 'How Many' ({SIMULATIONS} Runs)",
    loc='left', size=18, weight='semibold')
# Faint horizontal line just above zero (0.1% of the number of runs).
ax.axhline(y=SIMULATIONS*0.001, color=darkgrey, alpha=.5)
ax.set_ylabel('Frequency')
ax.set_xlabel(f"Total Items Completed in {SIMULATION_DAYS} Days");

## 4. Analysis of the Probabilities of Occurrence
### Goal
Use the distribution of the simulation to calculate the probability that a number of items is completed.

### Visualization
Given: Code to create a simple bar plot to visualize the output of the simulation and highlight the percentiles 95%, 85%, 70%: 
* X=Items Completed, Y=Probability to complete the # of items
* To highlight the percentiles the samples of the simulation are needed (list of throughput)

In [None]:
# Start coding here, use the distribution DataFrame of the previous step.



# Stuck? Use this to proceed to the next step: 
#samples = pd.read_csv('samples.csv')
#probability = pd.read_csv('probability.csv')

# Bar chart of the probabilities: x = Items, y = Probability (plotted on a 0-100 scale).
plt.figure(figsize=(14, 5))
ax = sns.barplot(x='Items', y='Probability', data=probability, color=barblue)

# Two-line heading drawn as free text above the plot area (y is in data units).
ax.text(x=-1.4, y=118,
        s=f"Probabilities of Completing a Scope in {SIMULATION_DAYS} Days", fontsize=18, fontweight='semibold')
ax.text(x=-1.4, y=110,
        s=f"Based on a Monte Carlo Simulations ({SIMULATIONS} Runs) with data of last {LAST_DAYS} days", fontsize=16)
ax.set_ylabel('')
ax.set_xlabel('Total Items Completed')
# Faint horizontal line just above zero.
ax.axhline(y=0.5, color=darkgrey, alpha=.5)

# seaborn draws the bars at categorical positions 0..n-1, not at the item
# counts themselves, so the label anchor is derived from the number of bars
# (third bar from the right) instead of from the largest item count.
label_xpos = len(set(probability['Items'])) - 3

# Dashed guide line plus label for each highlighted confidence level. An
# "at least N items" probability of p% corresponds to the (100 - p)% quantile
# of the simulated samples; the quantiles are spelled out to avoid float drift.
for confidence, quantile in ((70, 0.3), (85, 0.15), (95, 0.05)):
    ax.axhline(y=confidence, color=darkgrey, linestyle='--')
    ax.text(y=confidence, x=label_xpos,
            s='%d%% (%d+ Items)' % (confidence, samples.Items.quantile(quantile)),
            va='center', ha='center', backgroundcolor='#F0F0F0')

# Fix the tick positions before assigning fixed labels; otherwise the labels
# are attached to whatever ticks the automatic locator happens to choose.
ax.set_yticks([0, 20, 40, 60, 80, 100])
ax.set_yticklabels(labels=['0', '20', '40', '60', '80', '100%']);