## Pandas 자료형 DataFrame

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records, one [name, age, job] row per person.
data = [
    list(record)
    for record in (
        ('Alice', 25, 'Engineer'),
        ('Bob', 30, 'Manager'),
        ('Charlie', 35, 'Analyst'),
        ('David', 28, 'Developer'),
        ('Eve', 32, 'Designer'),
    )
]

- 데이터프레임 생성

In [3]:
# Converting the mixed str/int rows to an ndarray coerces every value to one
# string dtype (note the quoted ages and dtype='<U11' in the output).
np.array(data)

array([['Alice', '25', 'Engineer'],
       ['Bob', '30', 'Manager'],
       ['Charlie', '35', 'Analyst'],
       ['David', '28', 'Developer'],
       ['Eve', '32', 'Designer']], dtype='<U11')

In [4]:
np.array(data).ndim # two-dimensional array

2

In [5]:
# Build the DataFrame straight from the nested list of rows.
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1,2
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst
3,David,28,Developer
4,Eve,32,Designer


In [6]:
# No column names were supplied when df was built, so the labels are a default RangeIndex.
df.columns

RangeIndex(start=0, stop=3, step=1)

In [7]:
# Name the columns by assigning a full list of labels to df.columns.
df.columns = ['Name', 'Age', 'Job']
df.head(3)  # show only the first three rows

Unnamed: 0,Name,Age,Job
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst


In [70]:
# Build a DataFrame from a dict: every key becomes a column label.
name_list = ['Alice', 'Bob', 'Charlie', 'David', 'Eve']
age_list = [25, 30, 35, 28, 32]
job_list = ['Engineer', 'Manager', 'Analyst', 'Developer', 'Designer']

data = dict(Name=name_list, Age=age_list, Job=job_list)

df = pd.DataFrame(data=data)
df.head(3)

Unnamed: 0,Name,Age,Job
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst


In [9]:
# Start from an empty DataFrame and attach the columns one after another.
df = pd.DataFrame()
df['Name'], df['Age'], df['Job'] = name_list, age_list, job_list
df.head(3)

Unnamed: 0,Name,Age,Job
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst


In [10]:
type(df['Age']) # each column of a DataFrame is a Series

pandas.core.series.Series

In [11]:
df['Name'] # string column -> object dtype

0      Alice
1        Bob
2    Charlie
3      David
4        Eve
Name: Name, dtype: object

In [12]:
df['Age'] # int64 -> each column can have its own dtype

0    25
1    30
2    35
3    28
4    32
Name: Age, dtype: int64

- 속성

In [13]:
# Row labels of the DataFrame.
df.index

RangeIndex(start=0, stop=5, step=1)

In [14]:
# Column labels of the DataFrame.
df.columns

Index(['Name', 'Age', 'Job'], dtype='object')

In [15]:
# Values only, without labels -> returned as a NumPy array
df.values

array([['Alice', 25, 'Engineer'],
       ['Bob', 30, 'Manager'],
       ['Charlie', 35, 'Analyst'],
       ['David', 28, 'Developer'],
       ['Eve', 32, 'Designer']], dtype=object)

In [16]:
df.dtypes # dtype of each column

Name    object
Age      int64
Job     object
dtype: object

In [17]:
df.T # transpose: rows and columns are swapped

Unnamed: 0,0,1,2,3,4
Name,Alice,Bob,Charlie,David,Eve
Age,25,30,35,28,32
Job,Engineer,Manager,Analyst,Developer,Designer


In [18]:
# After transposing, the original column names are the row labels.
df.T.index

Index(['Name', 'Age', 'Job'], dtype='object')

In [19]:
# After transposing, the original row labels are the column labels.
df.T.columns

RangeIndex(start=0, stop=5, step=1)

- 인덱스 관련

In [20]:
# Replace the row labels with custom values
df.index = ['first', 'second', 'third', 'fourth', 'fifth']
df.head()

Unnamed: 0,Name,Age,Job
first,Alice,25,Engineer
second,Bob,30,Manager
third,Charlie,35,Analyst
fourth,David,28,Developer
fifth,Eve,32,Designer


In [71]:
# Move the values held in the index into a regular column (named 'index' in
# the output); the rows get a fresh 0..n-1 index.
# NOTE(review): the recorded output shows 0..4 in the 'index' column rather
# than 'first'..'fifth' — presumably df was re-created (In [70]) before this
# cell ran; confirm by re-running the notebook top to bottom.
df.reset_index(inplace=True) # inplace=True changes df itself
df

Unnamed: 0,index,Name,Age,Job
0,0,Alice,25,Engineer
1,1,Bob,30,Manager
2,2,Charlie,35,Analyst
3,3,David,28,Developer
4,4,Eve,32,Designer


In [72]:
# Use an existing column as the index
df.set_index('Name', inplace=True)
df

Unnamed: 0_level_0,index,Age,Job
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,0,25,Engineer
Bob,1,30,Manager
Charlie,2,35,Analyst
David,3,28,Developer
Eve,4,32,Designer


In [73]:
# Work with values row by row through the index label
# df['Bob'] <- this form looks up a column
df.loc['Bob'] # access a row by its index label

index          1
Age           30
Job      Manager
Name: Bob, dtype: object

In [74]:
# Assigning to an index label that does not exist yet adds a new row
# (values are given in column order: index, Age, Job).
df.loc['Jone'] = ['sixth', 27, 'Marketer'] 

In [75]:
df

Unnamed: 0_level_0,index,Age,Job
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,0,25,Engineer
Bob,1,30,Manager
Charlie,2,35,Analyst
David,3,28,Developer
Eve,4,32,Designer
Jone,sixth,27,Marketer


In [76]:
# Delete a row
df.drop('Eve', axis=0, inplace=True) # same as df.drop('Eve', inplace=True): axis defaults to 0 (rows)

In [77]:
df

Unnamed: 0_level_0,index,Age,Job
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,0,25,Engineer
Bob,1,30,Manager
Charlie,2,35,Analyst
David,3,28,Developer
Jone,sixth,27,Marketer


In [78]:
df['Age'] # a column can be accessed by its name, like a dict key

Name
Alice      25
Bob        30
Charlie    35
David      28
Jone       27
Name: Age, dtype: int64

In [79]:
df[['Age', 'Job']] # use double brackets (a list of names) to select two or more columns

Unnamed: 0_level_0,Age,Job
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,25,Engineer
Bob,30,Manager
Charlie,35,Analyst
David,28,Developer
Jone,27,Marketer


In [80]:
# Rename a specific column
df.rename(columns={'Job':'직업'}, inplace=True) # rename the 'Job' column to '직업'

In [81]:
df

Unnamed: 0_level_0,index,Age,직업
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,0,25,Engineer
Bob,1,30,Manager
Charlie,2,35,Analyst
David,3,28,Developer
Jone,sixth,27,Marketer


In [82]:
# Delete a specific column (axis=1 targets columns)
df.drop('index', axis=1, inplace=True)

In [83]:
df

Unnamed: 0_level_0,Age,직업
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,25,Engineer
Bob,30,Manager
Charlie,35,Analyst
David,28,Developer
Jone,27,Marketer


In [84]:
# Turn the 'Name' index back into an ordinary column.
df.reset_index(inplace=True)

In [85]:
df

Unnamed: 0,Name,Age,직업
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst
3,David,28,Developer
4,Jone,27,Marketer


In [86]:
# Label 4 already exists, so this overwrites that row instead of adding one.
df.loc[4] = ['Eve', 32, 'Designer']

In [87]:
# Label 5 is not in the index yet, so this adds a new row.
df.loc[5] = ['John', 27, 'Marketer']

# Alternative approach the author left commented out:
# df.loc[5] = ['Eve', 32, 'Designer']
# df.sort_values('이름')
# df.reset_index

In [88]:
# Rename only the listed columns; the commented-out line would instead
# replace every column label at once.
# df.columns = ['이름', '나이', '직업']
df.rename(columns={'Name':'이름', 'Age':'나이'}, inplace=True)
df

Unnamed: 0,이름,나이,직업
0,Alice,25,Engineer
1,Bob,30,Manager
2,Charlie,35,Analyst
3,David,28,Developer
4,Eve,32,Designer
5,John,27,Marketer


In [92]:
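# # Exercise: drop the row that contains Charlie
# idx = df[df['이름'] == 'Charlie'].index
# df.drop(df.index[idx], axis=0, inplace=True)
# NOTE: idx already holds row labels, so df.drop(idx, axis=0, inplace=True) is enough;
# df.index[idx] re-reads those labels as positions, which only matches while labels equal positions.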

In [65]:
# Drop Charlie by name: make the names the index, remove that label,
# then turn the names back into an ordinary column.
df.set_index(keys='이름', inplace=True)
df.drop(index='Charlie', inplace=True)
df.reset_index(inplace=True)

In [None]:
# df.drop(2, inplace=True)
# df.reset_index(drop=True, inplace=True) # drop=True discards the old index instead of keeping it as a column

In [93]:
df

Unnamed: 0,이름,나이,직업
0,Alice,25,Engineer
1,Bob,30,Manager
3,David,28,Developer
4,Eve,32,Designer
5,John,27,Marketer


In [52]:
pd.concat([df, df, df], axis=1) # axis=1 adds columns (side by side); axis=0 would add rows

Unnamed: 0,이름,나이,직업,이름.1,나이.1,직업.1,이름.2,나이.2,직업.2
0,Alice,25,Engineer,Alice,25,Engineer,Alice,25,Engineer
1,Bob,30,Manager,Bob,30,Manager,Bob,30,Manager
2,Charlie,35,Analyst,Charlie,35,Analyst,Charlie,35,Analyst
3,David,28,Developer,David,28,Developer,David,28,Developer
4,Eve,32,Designer,Eve,32,Designer,Eve,32,Designer
5,John,27,Marketer,John,27,Marketer,John,27,Marketer


In [94]:
# Exercise 3: add a column (one value per existing row)
df['성별'] = ['여', '남', '남', '여', '남']

In [95]:
df

Unnamed: 0,이름,나이,직업,성별
0,Alice,25,Engineer,여
1,Bob,30,Manager,남
3,David,28,Developer,남
4,Eve,32,Designer,여
5,John,27,Marketer,남


In [None]:
# Exercise 4: combined — add a row, then use the name column as the index.
# NOTE(review): the recorded output lists George twice, so this cell was
# presumably run more than once against different states of df — confirm by
# re-running the notebook top to bottom.
df.loc[6] = ['George', 31, 'Engineer', '남']
df.set_index('이름', inplace=True)

In [100]:
df

Unnamed: 0_level_0,나이,직업,성별
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alice,25,Engineer,여
Bob,30,Manager,남
David,28,Developer,남
Eve,32,Designer,여
George,31,Engineer,남
John,27,Marketer,남
George,31,Engineer,남
