# Introduction

In [38]:
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate

from mandr import Store

# Trying out Mandr on simple objects

In [3]:
# Instantiate the Mandr store that every later cell in this notebook writes to.
store = Store("root/getting_started")

In [5]:
# Put the integer 3 into the store under the key "my_int".
# NOTE(review): behaviour of `insert` when the key already exists is not
# visible here — confirm this cell is safe to re-run.
store.insert("my_int", 3)

In [None]:
# Fetch the value stored under "my_int"; as the last expression of the cell
# it is displayed as the cell output.
store.read("my_int")

In [43]:
# Welcome text for the dashboard.
# The heading is emitted before the paragraph rather than inside it (a <p>
# cannot contain an <h1>), and the line break uses <br> because </br> is a
# parse error in HTML.
store.update(
    "my_string_1",
    "<h1>Welcome to Mandr!</h1><p>Given to you by :probabl., Mandr is a powerful tool that allows data scientists to create clear reports from their Python code, typically a notebook. This document is actually a Mandr dashboard that was exported into HTML!<br>Mandr allows you to create text and customize it using simple HTML syntax, for example <b>bold</b>, <i>italic</i>, `code`, etc.</p>"
)

In [44]:
# Show that ordinary f-string interpolation works for text items: the current
# values of `x` and `y` are baked into the stored HTML.
x = 2
y = [1, 2, 3, 4]
store.update(
    "my_string_2",
    f"<p>You can also conveniently use Python f-strings. For example the value of `x` is {x} and the value of `y` is {y}.</p>",
)

In [29]:
# Text item explaining that dashboard cells can be rearranged or deleted.
store.update(
    "my_string_3",
    "<p>Once you have created cells in your Mandr dashboard, you can place them where you want, delete them, move them up or down, etc.</p>"
)

In [46]:
# Text item introducing the cross-validation dataframe stored later in this
# notebook under "my_dataframe_1".
store.update(
    "my_string_4",
    "<p>Moreover, you can display pandas data frames inline. For example, the following dataframe was obtained after a scikit-learn cross-validation:</p>"
)

In [32]:
# Cross-validate a default Lasso (5 folds) on the first 150 samples of the
# diabetes dataset and keep the per-fold results as a dataframe.
diabetes = load_diabetes()
X, y = diabetes.data[:150], diabetes.target[:150]
lasso = Lasso()
df = pd.DataFrame(cross_validate(estimator=lasso, X=X, y=y, cv=5))

# Store the results table under "my_dataframe_1".
store.insert("my_dataframe_1", df)

In [48]:
# Text item introducing the Altair chart stored under "my_plot_1".
# (Fixes the "a an example of graph" typo in the displayed sentence.)
store.update(
    "my_string_5",
    "<p>Of course, you can add some graphs. Here is an example of a graph generated using Altair:</p>"
)

In [35]:
# Interactive scatter plot of two independent standard-normal samples.
#
# The frame is deliberately NOT named `data`: that name is the
# `vega_datasets` loader imported at the top of the notebook and is needed
# later for `data.cars()`. Rebinding it here breaks Restart & Run All.
num_points = 1000
rng = np.random.default_rng(0)  # fixed seed so the stored chart is reproducible
points = pd.DataFrame(
    {
        "x": rng.standard_normal(num_points),
        "y": rng.standard_normal(num_points),
    }
)

chart_1 = (
    alt.Chart(points)
    .mark_circle()
    .encode(x="x", y="y", tooltip=["x", "y"])
    .interactive()
    .properties(title="Number of users over time")
)

# Store the chart under "my_plot_1".
store.insert("my_plot_1", chart_1)

In [49]:
# Text item suggesting that text and plot cells can be placed side by side.
store.update(
    "my_string_6",
    "<p>Using the interactive dashboard, you can put a cell with some text next to a cell with a plot to comment on it for example.</p>"
)

In [39]:
# Load the cars dataset through `data.cars()` and draw a point chart of
# horsepower against miles per gallon, coloured by origin.
cars = data.cars()

chart_2 = (
    alt.Chart(cars)
    .mark_point()
    .encode(x="Horsepower", y="Miles_per_Gallon", color="Origin")
)

# Store the chart under "my_plot_2".
store.insert("my_plot_2", chart_2)

In [41]:
# Closing text item for the demo dashboard.
# NOTE(review): this cell calls `insert` while the other text cells call
# `update` — confirm which one is intended for re-runnable notebooks.
store.insert(
    "my_string_7",
    "<p>Stay tuned for some new great features!</p>"
)

# Warcraft

In [None]:
# Load the full dataset from a parquet file expected next to the notebook.
df_all = pd.read_parquet("wow-full.parquet")
# Data source: https://calmcode.io/datasets/wow-full

In [None]:
# (rows, columns) of the full dataset.
df_all.shape

In [None]:
# Explore on a 1,000-row random sample of the full dataset. The fixed seed
# keeps the sample, and every output derived from it, stable across kernel
# restarts.
df = df_all.sample(1_000, random_state=0)

In [None]:
# Quick look at the first rows of the sample.
df.head()

In [None]:
# Column dtypes of the sample.
df.dtypes

In [None]:
# Histogram of the `level` column; the trailing `;` hides the Axes repr.
feat = "level"
df[feat].hist();

In [None]:
# Distinct values taken by the `where` column in the sample.
feat = "where"
df[feat].unique()

In [None]:
# Range of calendar years covered by the `datetime` column in the sample.
years = df["datetime"].dt.year
print(years.min())
print(years.max())

In [None]:
df["datetime"].max() # latest timestamp in the sample, i.e. where the data ends

In [None]:
# Percentage of missing values per column.
df.isna().sum() / df.shape[0] * 100

In [None]:
# Store the first rows (pandas `head()` default) of the `level` column as a
# plain Python list under "level_header".
store.insert("level_header", df["level"].head().tolist())

In [None]:
# Same for the last rows (`tail()` default), under "level_tail".
store.insert("level_tail", df["level"].tail().tolist())

In [None]:
# Number of rows per player in the sample.
df.player_id.value_counts()

In [None]:
# Timeline of one player within the sample.
# NOTE(review): the id 32358 is hard-coded — presumably taken from a previous
# `value_counts()` output; it may be absent from a different random sample.
df.query("player_id == 32358").sort_values(by="datetime")

In [None]:
# Distinct (player_id, level) pairs over the FULL dataset.
df_all[["player_id", "level"]].drop_duplicates()

In [None]:
# Number of distinct players in the full dataset.
df_all["player_id"].nunique()

In [None]:
# For every player, collect the array of distinct levels observed and count them.
unique_levels_per_player = df_all.groupby('player_id')['level'].unique().reset_index()
unique_levels_per_player['num_levels'] = unique_levels_per_player['level'].apply(len)

In [None]:
# The 20 players with the most distinct levels (ascending sort, so the largest come last).
unique_levels_per_player.sort_values(by="num_levels").tail(20)

In [None]:
# Full timeline of one player in the full dataset.
# NOTE(review): the id 48432 is hard-coded — presumably picked from the table above; confirm.
df_all.query("player_id == 48432").sort_values(by="datetime")

In [None]:
# Per-player progression summary computed on the FULL dataset (not the sample).
#
# Rows are ordered by (player_id, datetime) so that the 'first' / 'last'
# aggregations below pick each player's chronologically first and last level.
# `sort_values` already returns a new frame, so `df_all` is left untouched
# without paying for an extra full copy.
df = df_all.sort_values(by=["player_id", "datetime"])

player_stats = (
    df.groupby("player_id")
    .agg(
        first_time=("datetime", "min"),   # earliest record
        last_time=("datetime", "max"),    # latest record
        first_level=("level", "first"),   # level at the earliest record
        last_level=("level", "last"),     # level at the latest record
    )
    .reset_index()
)

# Hours elapsed between a player's first and last record. Despite the column
# name this is a wall-clock span, not time actually spent in game.
player_stats["hours_played"] = (
    player_stats["last_time"] - player_stats["first_time"]
).dt.total_seconds() / 3600

# Levels gained between the first and last record.
player_stats["level_diff"] = player_stats["last_level"] - player_stats["first_level"]

# Keep only the columns used by the cells that follow.
player_stats = player_stats[["player_id", "hours_played", "level_diff"]]

# Bare last expression: rendered as a rich (truncated) table instead of text.
player_stats

In [None]:
# Keep only players whose last recorded level is higher than their first.
player_stats = player_stats.query("level_diff > 0")

In [None]:
# Static scatter of level gain against elapsed hours, restricted to players
# whose span lies strictly between 0 and 5,000 hours.
df_plot = player_stats.query("0 < hours_played < 5_000").copy()
x = "hours_played"
y = "level_diff"

# Draw on an explicit Axes of a fresh figure instead of the pyplot state machine.
ax = plt.figure().gca()
ax.plot(df_plot[x], df_plot[y], "o", markersize=0.7, alpha=0.3)
ax.set_xlabel(x)
ax.set_ylabel(y)
plt.show()

In [None]:
# Interactive Altair scatter of `y` against `x` (column names set in the
# matplotlib cell) on a random subset of `df_plot`.
#
# Fixes:
#   * the y channel was encoded with `x`, plotting the column against itself;
#   * `DataFrame.sample(4000)` raises when fewer than 4000 rows are left, so
#     the sample size is capped at the frame length;
#   * the sample is seeded so the stored chart is reproducible.
# NOTE(review): the 4000 cap presumably keeps the chart under Altair's
# default row limit — confirm.
sample_size = min(4000, len(df_plot))
chart_2 = (
    alt.Chart(df_plot.sample(sample_size, random_state=0))
    .mark_circle()
    .encode(x=x, y=y, tooltip=[x, y])
    .interactive()
    .properties(title="Why do players stop playing??")
)

# Store the chart under "demo_my_plot_2".
store.update("demo_my_plot_2", chart_2)