In [7]:
import pandas as pd

In [46]:
employee = pd.read_table("small_data_samples/data_table.tsv")
employee.head()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
0,Airi Satou,Accountant,Tokyo,33,2008/11/28,"$162,700"
1,Angelica Ramos,Chief Executive Officer (CEO),London,47,2009/10/09,"$1,200,000"
2,Ashton Cox,Junior Technical Author,San Francisco,66,2009/01/12,"$86,000"
3,Bradley Greer,Software Engineer,London,41,2012/10/13,"$132,000"
4,Brenden Wagner,Software Engineer,San Francisco,28,2011/06/07,"$206,850"


In [31]:
cols = ["Name", "Position"]
employee[cols].head()

Unnamed: 0,Name,Position
0,Airi Satou,Accountant
1,Angelica Ramos,Chief Executive Officer (CEO)
2,Ashton Cox,Junior Technical Author
3,Bradley Greer,Software Engineer
4,Brenden Wagner,Software Engineer


In [47]:
employee.Salary = employee.Salary.str.replace("$", "")
employee.Salary = employee.Salary.str.replace(",", "").astype(int)
employee.dtypes

Name          object
Position      object
Office        object
Age            int64
Start date    object
Salary         int64
dtype: object

In [None]:
employee = pd.read_table("data_sources/data_table.tsv", nrows=4)
employee.head()

In [38]:
employee.dtypes

Name          object
Position      object
Office        object
Age            int64
Start date    object
Salary        object
dtype: object

In [10]:
import numpy as np
employee.select_dtypes(include  = [np.number]).dtypes

Age    int64
dtype: object

In [48]:
employee.describe()

Unnamed: 0,Age,Salary
count,57.0,57.0
mean,42.736842,252135.3
std,14.877507,215384.7
min,19.0,75650.0
25%,30.0,112000.0
50%,42.0,164500.0
75%,56.0,327900.0
max,66.0,1200000.0


In [49]:
# Sorting
employee.sort_values(by="Age", ascending=True).head()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
48,Tatyana Fitzpatrick,Regional Director,London,19,2010/03/17,385750
45,Shou Itou,Regional Marketing,Tokyo,20,2011/08/14,163000
7,Caesar Vance,Pre-Sales Support,New York,21,2011/12/12,106450
32,Lael Greer,Systems Administrator,London,21,2009/02/27,103500
18,Gavin Cortez,Team Leader,San Francisco,22,2008/10/26,235500


In [50]:
# This seems not to work because the Salary data is not integers
employee.sort_values(by="Salary", ascending=False).head()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
1,Angelica Ramos,Chief Executive Officer (CEO),London,47,2009/10/09,1200000
16,Fiona Green,Chief Operating Officer (COO),San Francisco,48,2010/03/11,850000
38,Paul Byrd,Chief Financial Officer (CFO),New York,64,2010/06/09,725000
54,Yuri Berry,Chief Marketing Officer (CMO),New York,40,2009/06/25,675000
26,Jackson Bradshaw,Director,New York,65,2008/09/26,645750


In [51]:
# Filter applications
employee["Name"].sort_values().head()

0        Airi Satou
1    Angelica Ramos
2        Ashton Cox
3     Bradley Greer
4    Brenden Wagner
Name: Name, dtype: object

In [52]:
employee[employee.Age > 40].head()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
1,Angelica Ramos,Chief Executive Officer (CEO),London,47,2009/10/09,1200000
2,Ashton Cox,Junior Technical Author,San Francisco,66,2009/01/12,86000
3,Bradley Greer,Software Engineer,London,41,2012/10/13,132000
5,Brielle Williamson,Integration Specialist,New York,61,2012/12/02,372000
8,Cara Stevens,Sales Assistant,New York,46,2011/12/06,145600


In [53]:
# .head() can be ommited in case there is only a few results.
employee[(employee.Age > 40) & (employee.Name == "Angelica Ramos")].head()

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
1,Angelica Ramos,Chief Executive Officer (CEO),London,47,2009/10/09,1200000


In [54]:
employee[employee.Age > 40][cols].head()

Unnamed: 0,Name,Position
1,Angelica Ramos,Chief Executive Officer (CEO)
2,Ashton Cox,Junior Technical Author
3,Bradley Greer,Software Engineer
5,Brielle Williamson,Integration Specialist
8,Cara Stevens,Sales Assistant


In [55]:
employee.mean()

Age           42.736842
Salary    252135.263158
dtype: float64

In [56]:
employee.Name.str.upper().head()

0        AIRI SATOU
1    ANGELICA RAMOS
2        ASHTON COX
3     BRADLEY GREER
4    BRENDEN WAGNER
Name: Name, dtype: object

In [57]:
employee.Name.str.lower().head()

0        airi satou
1    angelica ramos
2        ashton cox
3     bradley greer
4    brenden wagner
Name: Name, dtype: object

In [58]:
employee.Position.str.contains("Software").head()

0    False
1    False
2    False
3     True
4     True
Name: Position, dtype: bool

In [59]:
employee[employee.Position.str.contains("Software")]

Unnamed: 0,Name,Position,Office,Age,Start date,Salary
3,Bradley Greer,Software Engineer,London,41,2012/10/13,132000
4,Brenden Wagner,Software Engineer,San Francisco,28,2011/06/07,206850
6,Bruno Nash,Software Engineer,London,38,2011/05/03,163500
46,Sonya Frost,Software Engineer,Edinburgh,23,2008/12/13,103600
55,Zenaida Frank,Software Engineer,New York,63,2010/01/04,125250
56,Zorita Serrano,Software Engineer,San Francisco,56,2012/06/01,115000


In [60]:
# Replacing strings
employee.Position.str.replace("Engineer", "Developer").head()

0                       Accountant
1    Chief Executive Officer (CEO)
2          Junior Technical Author
3               Software Developer
4               Software Developer
Name: Position, dtype: object

In [61]:
employee.Age.min()

19

In [62]:
employee.Age.max()

66

In [63]:
# groupby aggregation
employee.groupby("Position").Age.min().head()

Position
Accountant                       33
Chief Executive Officer (CEO)    47
Chief Financial Officer (CFO)    64
Chief Marketing Officer (CMO)    40
Chief Operating Officer (COO)    48
Name: Age, dtype: int64

In [64]:
employee.groupby("Position").Age.agg(["mean", "min", "max", "count"]).head()

Unnamed: 0_level_0,mean,min,max,count
Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accountant,48.0,33,63,2
Chief Executive Officer (CEO),47.0,47,47,1
Chief Financial Officer (CFO),64.0,64,64,1
Chief Marketing Officer (CMO),40.0,40,40,1
Chief Operating Officer (COO),48.0,48,48,1


In [65]:
employee.groupby("Salary").Age.agg(["mean", "min", "max", "count"]).head()

Unnamed: 0_level_0,mean,min,max,count
Salary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
75650,43,43,43,1
85600,23,23,23,1
85675,47,47,47,1
86000,66,66,66,1
86500,30,30,30,1
