### Dataframes

In [94]:
import pandas as pd

### Create dataframe from a Dict object

In [95]:
# Column-oriented employee records: each keyword becomes one DataFrame column.
# Two different "missing" markers are used on purpose so later cells can
# filter on each: an empty string in `gender` and None in `status`.
data = dict(
    name=["John", "Jane", "Lisa", "Greg"],
    age=[27, 30, 20, 37],
    gender=["m", "f", "", "m"],
    position=["manager", "sr. manager", "architect", "scrum master"],
    emp_id=[1, 2, 3, 4],
    status=["active", "active", "inactive", None],
    years_of_service=[7, 10, 9, 15],
)

# No index is passed, so pandas assigns the default 0..n-1 row labels.
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
0,John,27,m,manager,1,active,7
1,Jane,30,f,sr. manager,2,active,10
2,Lisa,20,,architect,3,inactive,9
3,Greg,37,m,scrum master,4,,15


In [96]:
# Rebuild the frame, this time labelling rows with the employee ids.
# `emp_id` is still kept as a regular column as well.
df = pd.DataFrame(data, index=pd.Index(data["emp_id"]))
df

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
1,John,27,m,manager,1,active,7
2,Jane,30,f,sr. manager,2,active,10
3,Lisa,20,,architect,3,inactive,9
4,Greg,37,m,scrum master,4,,15


### Select rows based on condition

In [97]:
# Keep only the rows whose status is exactly "active".
df.loc[df["status"] == "active"]

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
1,John,27,m,manager,1,active,7
2,Jane,30,f,sr. manager,2,active,10


In [98]:
# Keep only the rows where age is 30 or more.
df.loc[df["age"] >= 30]

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
2,Jane,30,f,sr. manager,2,active,10
4,Greg,37,m,scrum master,4,,15


In [99]:
# Rows where status is missing (None / NaN).
df.loc[df["status"].isna()]

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
4,Greg,37,m,scrum master,4,,15


In [100]:
# An empty string is not treated as null, so it has to be matched explicitly.
df.loc[df["gender"] == ""]

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
3,Lisa,20,,architect,3,inactive,9


### Insert a column into an existing dataframe

In [101]:
# DataFrame.insert mutates `df` in place and raises ValueError when the
# column already exists, so guard the call to keep this cell safe to re-run.
if "grade" not in df.columns:
    # Using len(df.columns) as the position appends the new column at the
    # far right without hard-coding the current number of columns.
    df.insert(len(df.columns), "grade", None)
df

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service,grade
1,John,27,m,manager,1,active,7,
2,Jane,30,f,sr. manager,2,active,10,
3,Lisa,20,,architect,3,inactive,9,
4,Greg,37,m,scrum master,4,,15,


### Update value based on condition

In [102]:
# Boolean mask of employees with at least 10 years of service.
is_senior = df["years_of_service"] >= 10

# Assign through a single .loc call (row mask + column label) so the write
# lands on `df` itself rather than on a temporary copy.
df.loc[is_senior, "grade"] = "senior"
df

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service,grade
1,John,27,m,manager,1,active,7,
2,Jane,30,f,sr. manager,2,active,10,senior
3,Lisa,20,,architect,3,inactive,9,
4,Greg,37,m,scrum master,4,,15,senior


### Drop columns

In [103]:
# drop() returns a new frame; rebind `df` to the result without "grade".
df = df.drop(columns="grade")
df

Unnamed: 0,name,age,gender,position,emp_id,status,years_of_service
1,John,27,m,manager,1,active,7
2,Jane,30,f,sr. manager,2,active,10
3,Lisa,20,,architect,3,inactive,9
4,Greg,37,m,scrum master,4,,15


In [104]:
# Swap rows and columns: column names become the row labels.
df.transpose()

Unnamed: 0,1,2,3,4
name,John,Jane,Lisa,Greg
age,27,30,20,37
gender,m,f,,m
position,manager,sr. manager,architect,scrum master
emp_id,1,2,3,4
status,active,active,inactive,
years_of_service,7,10,9,15


In [105]:
# Underlying data as a NumPy array; the columns have mixed types, so the
# result is an object-dtype array.
df.to_numpy()

array([['John', 27, 'm', 'manager', 1, 'active', 7],
       ['Jane', 30, 'f', 'sr. manager', 2, 'active', 10],
       ['Lisa', 20, '', 'architect', 3, 'inactive', 9],
       ['Greg', 37, 'm', 'scrum master', 4, None, 15]], dtype=object)

In [108]:
# Label-based lookup: the row whose index label is 1, all columns.
df.loc[1, :]

name                   John
age                      27
gender                    m
position            manager
emp_id                    1
status               active
years_of_service          7
Name: 1, dtype: object

In [109]:
# Same row lookup, restricted to a subset of columns.
wanted_columns = ["name", "age"]
df.loc[1, wanted_columns]

name    John
age       27
Name: 1, dtype: object

### Statistics methods

In [118]:
# Total years of service across all employees.
df["years_of_service"].sum()

41

In [119]:
# Average years of service.
df["years_of_service"].mean()

10.25

In [120]:
# Index label (not position) of the row with the most years of service.
df["years_of_service"].idxmax()

4

In [121]:
# Running total of years of service, in row order.
df["years_of_service"].cumsum()

1     7
2    17
3    26
4    41
Name: years_of_service, dtype: int64

In [122]:
# Summary statistics: count, mean, std, min, quartiles and max.
df["years_of_service"].describe()

count     4.00000
mean     10.25000
std       3.40343
min       7.00000
25%       8.50000
50%       9.50000
75%      11.25000
max      15.00000
Name: years_of_service, dtype: float64

In [123]:
# Correlation of a column with itself; this only demonstrates the call,
# since the result here is 1.0.
service_years = df["years_of_service"]
service_years.corr(service_years)

1.0