In [2]:
import pandas as pd

## 資料集介紹

In [3]:
#  這是一個芝加哥的data，人員姓名、部份、職級以及薪資
chicago = pd.read_csv('chicago.csv')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [4]:
#  薪資被當字串了(因為$符號)，造成無法做數值計算
#  姓氏與名字在同一個欄位
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null object
Employee Annual Salary    32062 non-null object
dtypes: object(4)
memory usage: 1002.0+ KB


In [5]:
#  唯一能先優化的就是部份，可以調整為category
#  三萬多筆的資料，部門有35個
chicago['Department'].nunique()

35

In [6]:
#  節省了20%左右的記憶體用量
chicago["Department"] = chicago["Department"].astype('category')
chicago.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32063 entries, 0 to 32062
Data columns (total 4 columns):
Name                      32062 non-null object
Position Title            32062 non-null object
Department                32062 non-null category
Employee Annual Salary    32062 non-null object
dtypes: category(1), object(3)
memory usage: 784.4+ KB


## String Methods .lower() .upper() .title() .len()

In [7]:
chicago = pd.read_csv('chicago.csv')
chicago["Department"] = chicago["Department"].astype('category')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00


In [8]:
#  標準python作法，直接字串.method就可以大小寫、第一碼調整
#  但是在pandas這樣做的話會出現異常
chicago['Name'].title()

AttributeError: 'Series' object has no attribute 'title'

In [9]:
#  所以，必需先經過.str的轉型之後再執行相關字串操作
chicago['Position Title'].str.title()

0                                     Water Rate Taker
1                                       Police Officer
2                                       Police Officer
3                             Chief Contract Expediter
4                                    Civil Engineer Iv
5                                 Asst To The Alderman
6                                General Laborer - Dss
7                          Traffic Control Aide-Hourly
8                           Staff Asst To The Alderman
9                                  Electrical Mechanic
10                                   Fire Engineer-Emt
11                                      Police Officer
12                                  Foster Grandparent
13                                           Clerk Iii
14                              Investigator - Ipra Ii
15                                      Police Officer
16                                      Police Officer
17       Firefighter (Per Arbitrators Award)-Paramedic
18        

In [10]:
chicago['Position Title'] = chicago['Position Title'].str.title()
chicago.head(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",Water Rate Taker,WATER MGMNT,$90744.00
1,"AARON, JEFFERY M",Police Officer,POLICE,$84450.00
2,"AARON, KARINA",Police Officer,POLICE,$84450.00


In [12]:
#  需要每個欄位的字串長度的時候，len不要包錯地方
#  用len包住pandas的話，回傳是資料總筆數
len(chicago['Department'])

32063

In [11]:
chicago['Department'].str.len().head()

0    11.0
1     6.0
2     6.0
3    16.0
4    11.0
Name: Department, dtype: float64

## .str.replace() Method

In [12]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [13]:
chicago['Department'] = chicago['Department'].str.replace('MGMNT', 'MANAGEMENT')
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,$90744.00
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,$106836.00


In [14]:
#  調整欄位為float才能計算相關數據
chicago['Employee Annual Salary'] = chicago['Employee Annual Salary'].str.replace('$', '').astype(float)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MANAGEMENT,90744.0
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,84450.0
2,"AARON, KARINA",POLICE OFFICER,POLICE,84450.0
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,89880.0
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MANAGEMENT,106836.0


In [16]:
chicago['Employee Annual Salary'].mean()

80204.178633899

## Filtering with String Methods

In [17]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [20]:
#  要過濾取得position title中有water字串
#  先將資料統一小寫，再利用contains來搜尋
#  contains=like '%%'
#  即使是串列指令，還是需要再一次的.str
mask = chicago['Position Title'].str.lower().str.contains('water')

In [21]:
chicago[mask]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
554,"ALUISE, VINCENT G",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
685,"ANDERSON, ANDREW J",DISTRICT SUPERINTENDENT OF WATER DISTRIBUTION,WATER MGMNT,$109272.00
702,"ANDERSON, DONALD",FOREMAN OF WATER PIPE CONSTRUCTION,WATER MGMNT,$102440.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00


In [23]:
#  startswith=like 'xx%'=起始為
mask2 = chicago['Position Title'].str.lower().str.startswith('water')
chicago[mask2]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00
671,"ANDER, PERRY A",WATER CHEMIST II,WATER MGMNT,$82044.00
1054,"ASHLEY, KARMA T",WATER CHEMIST II,WATER MGMNT,$82044.00
1079,"ATKINS, JOANNA M",WATER CHEMIST II,WATER MGMNT,$82044.00
1181,"AZEEM, MOHAMMED A",WATER CHEMIST II,WATER MGMNT,$53172.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
2400,"BOLTON, BRIAN E",WATER RATE TAKER,WATER MGMNT,$78948.00
2586,"BOYCE, ADNER L",WATER CHEMIST II,WATER MGMNT,$82044.00
2745,"BRANDYS, DANIEL",WATER CHEMIST II,WATER MGMNT,$53172.00
3143,"BROWN, SHARON L",WATER RATE TAKER,WATER MGMNT,$82728.00


In [24]:
#  endswith=like '%xx'=結尾為
mask3 = chicago['Position Title'].str.lower().str.endswith('ist')
chicago[mask3]

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
184,"AFROZ, NAYYAR",PSYCHIATRIST,HEALTH,$99840.00
308,"ALARCON, LUIS J",LOAN PROCESSING SPECIALIST,COMMUNITY DEVELOPMENT,$81948.00
422,"ALLAIN, CAROLYN",SENIOR TELECOMMUNICATIONS SPECIALIST,DoIT,$89880.00
472,"ALLEN, ROBERT",MACHINIST,WATER MGMNT,$94328.00
705,"ANDERSON, EDWARD M",SR PROCUREMENT SPECIALIST,PROCUREMENT,$91476.00
1022,"ARTEAGA, PAUL",MACHINIST,TRANSPORTN,$94328.00
1163,"AYALA JR, JUAN",FIELD SANITATION SPECIALIST,STREETS & SAN,$78948.00
1285,"BAJIC, JOHN A",WATER METER MACHINIST,WATER MGMNT,$82576.00
1558,"BARRETT, BARBARA J",TECHNICAL TRAINING SPECIALIST,POLICE,$94200.00
1869,"BELTRAN, MAURICIO",PROCUREMENT SPECIALIST,PROCUREMENT,$79596.00


## .strip() .lstrip() .rstrip() Methods

In [2]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [7]:
#  去除字串間的空白
chicago['Name'].str.rstrip()

0                 AARON,  ELVIA J
1               AARON,  JEFFERY M
2                  AARON,  KARINA
3             AARON,  KIMBERLEI R
4             ABAD JR,  VICENTE M
5                 ABARCA,  ANABEL
6               ABARCA,  EMMANUEL
7               ABASCAL,  REECE E
8            ABBASI,  CHRISTOPHER
9           ABBATACOLA,  ROBERT J
10          ABBATEMARCO,  JAMES J
11               ABBATE,  TERRY M
12               ABBOTT,  BETTY L
13              ABBOTT,  LYNISE M
14         ABBRUZZESE,  WILLIAM J
15                ABDALLAH,  ZAID
16          ABDELHADI,  ABDALMAHD
17            ABDELLATIF,  AREF R
18             ABDELMAJEID,  AZIZ
19            ABDOLLAHZADEH,  ALI
20       ABDUL-KARIM,  MUHAMMAD A
21            ABDULLAH,  DANIEL N
22               ABDULLAH,  KEVIN
23           ABDULLAH,  LAKENYA N
24            ABDULLAH,  RASHAD J
25           ABDULSATTAR,  MUDHAR
26           ABDUL-SHAKUR,  TAHIR
27         ABDULWAHAB,  ABUUBAIDA
28              ABEJERO,  JASON V
29        ABER

## String Method on index and columns

In [9]:
chicago = pd.read_csv('chicago.csv', index_col='Name').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [13]:
#  將所有的index設置為大寫開頭字串
chicago.index = chicago.index.str.strip().str.title()
chicago.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zygadlo, Michael J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"Zygowicz, Peter J",POLICE OFFICER,POLICE,$87384.00
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [14]:
#  將所有的columns設置為大寫開頭字串
chicago.columns = chicago.columns.str.strip().str.title()
chicago.tail()

Unnamed: 0_level_0,Position Title,Department,Employee Annual Salary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Zygadlo, Michael J",FRM OF MACHINISTS - AUTOMOTIVE,GENERAL SERVICES,$99528.00
"Zygowicz, Peter J",POLICE OFFICER,POLICE,$87384.00
"Zymantas, Mark E",POLICE OFFICER,POLICE,$84450.00
"Zyrkowski, Carlo E",POLICE OFFICER,POLICE,$87384.00
"Zyskowski, Dariusz",CHIEF DATA BASE ANALYST,DoIT,$113664.00


## Split String by Characters with .str.split() Method

In [15]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [16]:
#  Python 標準字串分割
'Hello Man'.split(' ')

['Hello', 'Man']

In [17]:
#  將姓名分成姓氏與名字，資料上的逗點是區隔的逗點，不是原始資料的逗點了
chicago['Name'].str.split(',')

0                 [AARON,   ELVIA J]
1               [AARON,   JEFFERY M]
2                  [AARON,   KARINA]
3             [AARON,   KIMBERLEI R]
4             [ABAD JR,   VICENTE M]
5                 [ABARCA,   ANABEL]
6               [ABARCA,   EMMANUEL]
7               [ABASCAL,   REECE E]
8            [ABBASI,   CHRISTOPHER]
9           [ABBATACOLA,   ROBERT J]
10          [ABBATEMARCO,   JAMES J]
11               [ABBATE,   TERRY M]
12               [ABBOTT,   BETTY L]
13              [ABBOTT,   LYNISE M]
14         [ABBRUZZESE,   WILLIAM J]
15                [ABDALLAH,   ZAID]
16          [ABDELHADI,   ABDALMAHD]
17            [ABDELLATIF,   AREF R]
18             [ABDELMAJEID,   AZIZ]
19            [ABDOLLAHZADEH,   ALI]
20       [ABDUL-KARIM,   MUHAMMAD A]
21            [ABDULLAH,   DANIEL N]
22               [ABDULLAH,   KEVIN]
23           [ABDULLAH,   LAKENYA N]
24            [ABDULLAH,   RASHAD J]
25           [ABDULSATTAR,   MUDHAR]
26           [ABDUL-SHAKUR,   TAHIR]
2

In [21]:
#  利用get取得分割字串的index，再以value_counts計算最常見的名字
chicago['Name'].str.split(',').str.get(0).str.title().value_counts()

Williams         293
Johnson          244
Smith            241
Brown            185
Jones            183
Rodriguez        171
Jackson          136
Garcia           130
Davis            127
Hernandez        110
Martinez         108
Lopez            106
Gonzalez         104
Perez            100
Wilson            94
Rivera            90
Thomas            89
Anderson          82
Torres            81
Murphy            80
Robinson          79
Moore             78
Harris            76
Sanchez           76
Miller            75
Lewis             74
Taylor            73
Martin            72
Clark             66
White             66
                ... 
Olavarria          1
Modzelewski        1
Glockner           1
Sullivan Jr        1
Dantes             1
Adams El           1
Dal Ponte          1
Sidner             1
Woodward           1
Word               1
Shimkus            1
Goldie             1
Conwell            1
Chidichimo         1
Zientarski         1
Osuoji             1
Greene Morris

In [22]:
#  找最長見的部門職稱
chicago['Position Title'].str.split(' ').str.get(0).str.title().value_counts()

Police                   10856
Firefighter-Emt           1509
Sergeant                  1186
Pool                       918
Firefighter                810
Crossing                   775
Motor                      721
Sanitation                 715
Paramedic                  641
Asst                       606
Fire                       512
Traffic                    512
Senior                     470
Construction               452
Lieutenant-Emt             394
Administrative             375
Library                    365
Librarian                  335
Lieutenant                 332
Operating                  324
Electrical                 313
Aviation                   309
Firefighter/Paramedic      259
General                    257
Staff                      250
Clerk                      242
Foreman                    237
Hoisting                   214
Deputy                     213
Machinist                  210
                         ...  
Leasing                      1
Chairper

## More Practice with Splits

In [2]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [6]:
#  切割之後可以再切割
#  我們會發現切割之後似乎都多了一個block出來，這是因為name的前面有個空格
chicago['Name'].str.split(',').str.get(1).str.split(' ').head()

0        [, , ELVIA, J]
1      [, , JEFFERY, M]
2          [, , KARINA]
3    [, , KIMBERLEI, R]
4      [, , VICENTE, M]
Name: Name, dtype: object

In [9]:
#  先去空格再來切割就正常了
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(' ').head()

0        [ELVIA, J]
1      [JEFFERY, M]
2          [KARINA]
3    [KIMBERLEI, R]
4      [VICENTE, M]
Name: Name, dtype: object

In [12]:
#  再利用get就可以取得名字的部份，再做統計就知道芝加哥最多名字的統計
chicago['Name'].str.split(',').str.get(1).str.strip().str.split(' ').str.get(0).value_counts().head()

MICHAEL    1153
JOHN        899
JAMES       676
ROBERT      622
JOSEPH      537
Name: Name, dtype: int64

## The expand and n Parameters of .str.split()

In [14]:
chicago = pd.read_csv('chicago.csv').dropna(how='all')
chicago['Department'] = chicago['Department'].astype('category')
chicago.tail(3)

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary
32059,"ZYMANTAS, MARK E",POLICE OFFICER,POLICE,$84450.00
32060,"ZYRKOWSKI, CARLO E",POLICE OFFICER,POLICE,$87384.00
32061,"ZYSKOWSKI, DARIUSZ",CHIEF DATA BASE ANALYST,DoIT,$113664.00


In [18]:
#  透過參數expand，可以讓取得資料為Dataframe，非series
type(chicago['Name'].str.split(',', expand=True))

pandas.core.frame.DataFrame

In [19]:
type(chicago['Name'].str.split(',', expand=False))

pandas.core.series.Series

In [22]:
chicago[['First Name','Last Name']] = chicago['Name'].str.split(',', expand=True)
chicago.head()

Unnamed: 0,Name,Position Title,Department,Employee Annual Salary,First Name,Last Name
0,"AARON, ELVIA J",WATER RATE TAKER,WATER MGMNT,$90744.00,AARON,ELVIA J
1,"AARON, JEFFERY M",POLICE OFFICER,POLICE,$84450.00,AARON,JEFFERY M
2,"AARON, KARINA",POLICE OFFICER,POLICE,$84450.00,AARON,KARINA
3,"AARON, KIMBERLEI R",CHIEF CONTRACT EXPEDITER,GENERAL SERVICES,$89880.00,AARON,KIMBERLEI R
4,"ABAD JR, VICENTE M",CIVIL ENGINEER IV,WATER MGMNT,$106836.00,ABAD JR,VICENTE M


In [24]:
#  幾個空格就切出幾欄
chicago['Position Title'].str.split(' ', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,WATER,RATE,TAKER,,,,,,
1,POLICE,OFFICER,,,,,,,
2,POLICE,OFFICER,,,,,,,
3,CHIEF,CONTRACT,EXPEDITER,,,,,,
4,CIVIL,ENGINEER,IV,,,,,,
5,ASST,TO,THE,ALDERMAN,,,,,
6,GENERAL,LABORER,-,DSS,,,,,
7,TRAFFIC,CONTROL,AIDE-HOURLY,,,,,,
8,STAFF,ASST,TO,THE,ALDERMAN,,,,
9,ELECTRICAL,MECHANIC,,,,,,,


In [25]:
#  幾個空格就切出幾欄，利用n來限制分割
chicago['Position Title'].str.split(' ', expand=True, n=1)

Unnamed: 0,0,1
0,WATER,RATE TAKER
1,POLICE,OFFICER
2,POLICE,OFFICER
3,CHIEF,CONTRACT EXPEDITER
4,CIVIL,ENGINEER IV
5,ASST,TO THE ALDERMAN
6,GENERAL,LABORER - DSS
7,TRAFFIC,CONTROL AIDE-HOURLY
8,STAFF,ASST TO THE ALDERMAN
9,ELECTRICAL,MECHANIC
