In [1]:
import pandas as pd

# Attendance Data

## Load the attendance.csv file and calculate an attendance percentage for each student.

In [2]:
# Read the raw attendance sheet from the local csvs/ folder.
attendance = pd.read_csv(filepath_or_buffer='csvs/attendance.csv')

In [3]:
# Give the unlabeled first CSV column (read in as 'Unnamed: 0') a meaningful name.
# Only the column label changes here; the frame keeps its default integer index.
# Rebinding the result instead of using inplace=True keeps the step chainable.
attendance = attendance.rename(columns={'Unnamed: 0': 'student_name'})

In [4]:
# Display the frame to confirm the first column is now labelled student_name.
attendance

Unnamed: 0,student_name,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06,2018-01-07,2018-01-08
0,Sally,P,T,T,H,P,A,T,T
1,Jane,A,P,T,T,T,T,A,T
2,Billy,A,T,A,A,H,T,P,T
3,John,P,T,H,P,P,T,P,P


## One half day is worth 50% of a full day, and 10 tardies are equal to one absence.

In [5]:
# Wide -> long: one row per (student, date) pair, with the attendance code in `status`.
melt = pd.melt(
    attendance,
    id_vars=['student_name'],
    var_name='date',
    value_name='status',
)

In [6]:
# Preview the first five rows of the long-format attendance table.
melt.head(n=5)

Unnamed: 0,student_name,date,status
0,Sally,2018-01-01,P
1,Jane,2018-01-01,A
2,Billy,2018-01-01,A
3,John,2018-01-01,P
4,Sally,2018-01-02,T


In [7]:
# Credit earned per attendance code. A half day counts as half a day present;
# ten tardies equal one absence, so each tardy forfeits a tenth of a day.
STATUS_CREDIT = {
    'P': 1.0,  # Present
    'A': 0.0,  # Absent
    'H': 0.5,  # Half Day
    'T': 0.9,  # Tardy
}

# Fail loudly on any code missing from the table rather than silently producing
# NaN (this also catches re-running the cell after the column is already numeric).
unknown_codes = set(melt['status'].dropna()) - set(STATUS_CREDIT)
if unknown_codes:
    raise ValueError(
        f'Unrecognised attendance codes: {sorted(map(str, unknown_codes))}'
    )

# Translate whole values in a single pass instead of chaining substring replacements.
melt['status'] = melt['status'].map(STATUS_CREDIT)

In [8]:
# Spot-check the recoded status column on the first five rows.
melt.head(n=5)

Unnamed: 0,student_name,date,status
0,Sally,2018-01-01,1.0
1,Jane,2018-01-01,0.0
2,Billy,2018-01-01,0.0
3,John,2018-01-01,1.0
4,Sally,2018-01-02,0.9


In [9]:
# Make sure the status column is numeric so it can be averaged.
melt['status'] = melt['status'].astype('float64')

In [10]:
# Attendance percentage per student = mean daily credit.
# Select `status` explicitly: the frame also holds the string `date` column, and
# since pandas 2.0 groupby().mean() no longer drops non-numeric columns silently
# (numeric_only defaults to False), so averaging the whole frame raises TypeError.
melt.groupby('student_name')[['status']].mean()

Unnamed: 0_level_0,status
student_name,Unnamed: 1_level_1
Billy,0.525
Jane,0.6875
John,0.9125
Sally,0.7625


You should end up with something like this:

name
Billy    0.5250
Jane     0.6875
John     0.9125
Sally    0.7625
Name: grade, dtype: float64

# Coffee Levels

## Read the coffee_levels.csv file.

In [11]:
# Read the coffee level readings from the local csvs/ folder.
coffee = pd.read_csv(filepath_or_buffer='csvs/coffee_levels.csv')

In [12]:
# Preview the first five rows: hour, carafe label, and amount.
coffee.head(n=5)

Unnamed: 0,hour,coffee_carafe,coffee_amount
0,8,x,0.816164
1,9,x,0.451018
2,10,x,0.843279
3,11,x,0.335533
4,12,x,0.898291


## Transform the data so that each carafe is in its own column.

In [13]:
# Melt only the carafe label column into variable/value pairs.
# NOTE(review): this rebinds `melt` (previously the attendance table) and the
# result does not appear to be used by the later cells shown here — confirm
# whether this exploratory step is still needed.
melt = pd.melt(coffee, value_vars=['coffee_carafe'])

In [14]:
# Long -> wide: one row per hour, one column per carafe, cells hold the amount.
coffee_pivot = (
    coffee
    .pivot(index='hour', columns='coffee_carafe', values='coffee_amount')
    .reset_index()  # turn `hour` back into an ordinary column
)


In [15]:
# Blank out the 'coffee_carafe' label that pivot leaves on the column axis.
coffee_pivot.columns = coffee_pivot.columns.rename('')

In [16]:
# Display the pivoted table: `hour` plus one amount column per carafe.
coffee_pivot

Unnamed: 0,hour,x,y,z
0,8,0.816164,0.189297,0.999264
1,9,0.451018,0.521502,0.91599
2,10,0.843279,0.023163,0.144928
3,11,0.335533,0.235529,0.311495
4,12,0.898291,0.017009,0.771947
5,13,0.310711,0.997464,0.39852
6,14,0.507288,0.058361,0.864464
7,15,0.215043,0.144644,0.436364
8,16,0.183891,0.544676,0.280621
9,17,0.39156,0.594126,0.436677


## Is this the best shape for the data?

Depends on what you're doing with the data. 

- For modeling, this wide shape (one row per hour, one column per carafe) is better than the original long shape.

# Cake Recipes

## Read the cake_recipes.csv data. This data set contains cake tastiness scores for combinations of different recipes, oven rack positions, and oven temperatures.

In [17]:
# Read the cake tastiness scores from the local csvs/ folder.
cake = pd.read_csv(filepath_or_buffer='csvs/cake_recipes.csv')

In [18]:
# Preview the first five rows: a combined recipe:position key plus one column per temperature.
cake.head(n=5)

Unnamed: 0,recipe:position,225,250,275,300
0,a:bottom,61.738655,53.912627,74.41473,98.786784
1,a:top,51.709751,52.009735,68.576858,50.22847
2,b:bottom,57.09532,61.904369,61.19698,99.248541
3,b:top,82.455004,95.224151,98.594881,58.169349
4,c:bottom,96.470207,52.001358,92.893227,65.473084


## Tidy the data as necessary.

In [19]:
# Wide -> long: keep the combined key and stack the temperature columns into
# generic `variable` / `value` columns (renamed in a later cell).
cake_melt = pd.melt(cake, id_vars=['recipe:position'])

In [20]:
# Break each 'recipe:position' label at the colon and name the two pieces.
recipe_position = (
    cake_melt['recipe:position']
    .str.split(':', expand=True)
    .rename(columns={0: 'recipe', 1: 'rack_position'})
)

In [21]:
# Swap the combined key for its two parsed components, placed side by side
# with the remaining columns.
cake_melt = pd.concat(
    [cake_melt.drop(columns='recipe:position'), recipe_position],
    axis='columns',
)
cake_melt.head(n=5)

Unnamed: 0,variable,value,recipe,rack_position
0,225,61.738655,a,bottom
1,225,51.709751,a,top
2,225,57.09532,b,bottom
3,225,82.455004,b,top
4,225,96.470207,c,bottom


In [25]:
# Replace melt's generic column labels with descriptive ones.
cake_melt = cake_melt.rename(
    columns={'variable': 'oven_temperature', 'value': 'score'}
)

In [26]:
# Put the three experiment factors first and the measured score last.
cake_melt = cake_melt.loc[
    :, ['recipe', 'rack_position', 'oven_temperature', 'score']
]

In [28]:
# Show a single row to confirm the final column order.
cake_melt.head(n=1)

Unnamed: 0,recipe,rack_position,oven_temperature,score
0,a,bottom,225,61.738655


## Which recipe, on average, is the best? recipe b

In [29]:
# Mean score per recipe.
# Select `score` explicitly: `rack_position` and `oven_temperature` hold strings,
# and since pandas 2.0 groupby().mean() no longer drops non-numeric columns
# silently (numeric_only defaults to False), so averaging the whole frame raises
# TypeError.
cake_melt.groupby('recipe')[['score']].mean()

Unnamed: 0_level_0,score
recipe,Unnamed: 1_level_1
a,63.922201
b,76.736074
c,75.874748
d,62.864844


## Which oven temperature, on average, produces the best results? 275

In [30]:
# Mean score per oven temperature.
# Select `score` explicitly: `recipe` and `rack_position` hold strings, and since
# pandas 2.0 groupby().mean() no longer drops non-numeric columns silently
# (numeric_only defaults to False), so averaging the whole frame raises TypeError.
cake_melt.groupby('oven_temperature')[['score']].mean()

Unnamed: 0_level_0,score
oven_temperature,Unnamed: 1_level_1
225,71.306022
250,66.577437
275,74.886754
300,66.627655


## Which combination of recipe, rack position, and temperature gives the best result? recipe b, bottom rack, 300 degrees

In [34]:
# Average the score for every (recipe, rack position, temperature) combination,
# then report the index label of the highest-scoring one.
(
    cake_melt
    .groupby(by=['recipe', 'rack_position', 'oven_temperature'])
    .mean()
    .idxmax()
)

score    (b, bottom, 300)
dtype: object