In [1]:
import os
import pandas as pd
import numpy as np
import random

In [2]:
path1 = os.getcwd()
path2 = 'roster.csv'
csvFilePath = os.path.join(path1, path2)

## Read CSV

In [3]:
roster = pd.read_csv(csvFilePath)
print(type(roster))

<class 'pandas.core.frame.DataFrame'>


### Viewing the data

In [4]:
roster.head()

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V


In [5]:
roster.tail()

Unnamed: 0,name
17,Hsin-Yun
18,Renata
19,Max
20,Joshua
21,David


In [6]:
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


## Modifying the Data

In [7]:
d = {'name': ['Wally']}
tmp_df = pd.DataFrame(data=d)
roster = pd.concat([roster, tmp_df], ignore_index=True)
roster

Unnamed: 0,name
0,Joe
1,Jihuan
2,Ali
3,Frances
4,Daniela V
5,Mostafa
6,Daniela P
7,Cesar
8,Jarrod
9,Austin


### Assign Grades

In [8]:
np.random.seed(1)
grades = np.random.randint(0,100, size=len(roster))
print(grades)

[37 12 72  9 75  5 79 64 16  1 76 71  6 25 50 20 18 84 11 28 29 14 50]


In [9]:
roster['grade'] = np.random.randint(0,100, size=len(roster))
roster

Unnamed: 0,name,grade
0,Joe,68
1,Jihuan,87
2,Ali,87
3,Frances,94
4,Daniela V,96
5,Mostafa,86
6,Daniela P,13
7,Cesar,9
8,Jarrod,7
9,Austin,63


### Modify Specific row of data
.loc can be used with a boolean array (i.e. a an array of 1s and 0s)

In [10]:
roster.loc[roster.name == "Daniela P", "grade"] = 100
roster

Unnamed: 0,name,grade
0,Joe,68
1,Jihuan,87
2,Ali,87
3,Frances,94
4,Daniela V,96
5,Mostafa,86
6,Daniela P,100
7,Cesar,9
8,Jarrod,7
9,Austin,63


### Check the class average

Each column in a pandas dataframe is a series object -- which have dozens of [built-in methods](https://pandas.pydata.org/docs/reference/api/pandas.Series.html). 

In [11]:
roster['grade'].mean()

53.78260869565217

In [12]:
roster.loc[roster['grade'] < 50, 'grade'] = roster['grade'] + 40
roster

Unnamed: 0,name,grade
0,Joe,68
1,Jihuan,87
2,Ali,87
3,Frances,94
4,Daniela V,96
5,Mostafa,86
6,Daniela P,100
7,Cesar,49
8,Jarrod,47
9,Austin,63


In [13]:
roster['grade'].mean()

69.43478260869566

In [14]:
roster.loc[roster['grade'] < 60, 'grade'] = roster['grade'] + 30
roster['grade'].mean()

78.56521739130434

## Write to CSV

In [15]:
outFilePath = os.path.join(os.getcwd(), 'roster_pandas.csv')
print(outFilePath)

/Users/joseph/python-course/week6/roster_pandas.csv


In [16]:
roster.to_csv(outFilePath, index=False)

## More Aggregation and Manipulation

In [17]:
roster['group'] = np.random.choice(['red', 'blue'], size=len(roster))

In [18]:
group_means = roster.groupby(by=['group']).mean()
group_means

Unnamed: 0_level_0,grade
group,Unnamed: 1_level_1
blue,79.0
red,78.0


In [19]:
group_means.rename(columns={'grade': 'group_avg'}, inplace=True)
group_means

Unnamed: 0_level_0,group_avg
group,Unnamed: 1_level_1
blue,79.0
red,78.0


### Merging dataframes

In [20]:
group_means.shape

(2, 1)

In [21]:
roster.shape

(23, 3)

In [22]:
roster = roster.merge(group_means, on=['group'])
roster.shape

(23, 4)

In [23]:
roster

Unnamed: 0,name,grade,group,group_avg
0,Joe,68,blue,79.0
1,Jihuan,87,blue,79.0
2,Frances,94,blue,79.0
3,Daniela V,96,blue,79.0
4,Mostafa,86,blue,79.0
5,Daniela P,100,blue,79.0
6,Austin,63,blue,79.0
7,Jack,61,blue,79.0
8,Volodymyr,60,blue,79.0
9,Yijia,81,blue,79.0


### Creating new columns from custom functions

In [24]:
def is_top50(col):
    return col > col.median()

In [25]:
roster['top50'] = roster[['grade']].apply(is_top50)
roster

Unnamed: 0,name,grade,group,group_avg,top50
0,Joe,68,blue,79.0,False
1,Jihuan,87,blue,79.0,True
2,Frances,94,blue,79.0,True
3,Daniela V,96,blue,79.0,True
4,Mostafa,86,blue,79.0,True
5,Daniela P,100,blue,79.0,True
6,Austin,63,blue,79.0,False
7,Jack,61,blue,79.0,False
8,Volodymyr,60,blue,79.0,False
9,Yijia,81,blue,79.0,True
