In [2]:
# 데이터 분석에 사용되는 표준 라이브러리 로딩 작업
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import metplotlib as mpl

In [2]:
# 1차원 배열 : 시리즈(Series) = 값(Values)과 인덱스(Index)
sr = pd.Series([17000, 18000, 1000, 5000], 
               index = ['피자', '치킨', '콜라','맥주'])
print(sr)

피자    17000
치킨    18000
콜라     1000
맥주     5000
dtype: int64


In [4]:
# 데이터값만 추출
sr.values

array([17000, 18000,  1000,  5000], dtype=int64)

In [5]:
# 인덱스만 추출
sr.index

Index(['피자', '치킨', '콜라', '맥주'], dtype='object')

In [6]:
# 2차원 배열 : 데이터 프레임(DataFrame)
# 행(index)와 열(column) 그리고 값(values)
values = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
index = ['one', 'two', 'three']
columns = ['A', 'B', 'C']

df = pd.DataFrame(values, index = index, columns = columns)

In [7]:
print(df)

       A  B  C
one    1  2  3
two    4  5  6
three  7  8  9


In [8]:
df.index

Index(['one', 'two', 'three'], dtype='object')

In [9]:
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [12]:
# 데이터프레임 : 리스트, 시리즈, 딕셔너리, 넘파이배열 등으로 생성
# 리스트를 이용한 데이터 프레임 생성
data = [
    ['1000', 'Steve', 90.72],
    ['1001', 'James', 78.09],
    ['1002', 'Dohee', 98.43],
    ['1003', 'Jamie', 64.19],
    ['1004', 'Pilip', 81.30],
    ['1005', 'Tonie', 99.14]
]

df = pd.DataFrame(data)
df

Unnamed: 0,0,1,2
0,1000,Steve,90.72
1,1001,James,78.09
2,1002,Dohee,98.43
3,1003,Jamie,64.19
4,1004,Pilip,81.3
5,1005,Tonie,99.14


In [13]:
df = pd.DataFrame(data, columns = ['학번', '이름', '점수'])
df

Unnamed: 0,학번,이름,점수
0,1000,Steve,90.72
1,1001,James,78.09
2,1002,Dohee,98.43
3,1003,Jamie,64.19
4,1004,Pilip,81.3
5,1005,Tonie,99.14


In [15]:
# 딕셔너리를 이용한 데이터프레임 생성
data = {
    '학번' : ['1000', '1001', '1002', '1003', '1004', '1005'],
    '이름' : ['aaa', 'bbb', 'ccc', 'ddd', 'eee', 'fff' ],
    '점수' : [90.72, 78.09, 98.43, 64.19, 72.30, 99.14]
}

df = pd.DataFrame(data)
df

Unnamed: 0,학번,이름,점수
0,1000,aaa,90.72
1,1001,bbb,78.09
2,1002,ccc,98.43
3,1003,ddd,64.19
4,1004,eee,72.3
5,1005,fff,99.14


In [16]:
# 데이터프레임 조회하는 작업
# DATAFRAME.head(n) : 데이터프레임의 앞에서부터 (n)개의 자료만 추출 / 기본 : 5개
# head(DATAFRAME, n)
# DATAFRAME.tail(n) : 데이터프레임의 뒤에서부터 (n)개의 자료만 추출 / 기본 : 5개
# DATAFRAME['COLUMN_NAME'] : 해당 열을 추출

In [17]:
df.head()

Unnamed: 0,학번,이름,점수
0,1000,aaa,90.72
1,1001,bbb,78.09
2,1002,ccc,98.43
3,1003,ddd,64.19
4,1004,eee,72.3


In [18]:
df.tail(3)

Unnamed: 0,학번,이름,점수
3,1003,ddd,64.19
4,1004,eee,72.3
5,1005,fff,99.14


In [19]:
df['학번']

0    1000
1    1001
2    1002
3    1003
4    1004
5    1005
Name: 학번, dtype: object

In [21]:
# 학번과 이름만 추출
df[['학번', '이름']]

Unnamed: 0,학번,이름
0,1000,aaa
1,1001,bbb
2,1002,ccc
3,1003,ddd
4,1004,eee
5,1005,fff


In [None]:
# loc[행 인덱스값, 열 인덱스값] : 라벨값 기반의 2차원 인덱싱
# df.loc[행 인덱싱 값] or df.loc[행 인덱싱 값, 열 인덱싱 값]
# iloc : 순서를 기반으로 정수 기반의 2차원 인덱싱

In [24]:
df = pd.DataFrame(np.arange(10, 22).reshape(3, 4),
                 index = ['a', 'b', 'c'],
                 columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [25]:
# loc 인덱서를 사용할때 하나의 값만 있다면 행을 선택
df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [26]:
df.loc['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [29]:
# df[행 조건식, 열 조건식]
df['b' : 'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [30]:
df.loc[['b', 'c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [31]:
df.A > 15

a    False
b    False
c     True
Name: A, dtype: bool

In [32]:
df.loc[df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [33]:
def select_rows(df):
    return df.A > 15

In [34]:
select_rows(df)

a    False
b    False
c     True
Name: A, dtype: bool

In [35]:
df.loc[select_rows(df)]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [36]:
# df[행조건식, 열조건식]
df['b':'c']

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [37]:
df.loc[['b', 'c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [40]:
df = pd.DataFrame(np.arange(10, 22).reshape(3, 4),
                 columns = ['A', 'B', 'C', 'D'])

In [42]:
df.loc[1:'A']

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [43]:
df.loc[[0, 1], ['B', 'D']]

Unnamed: 0,B,D
0,11,13
1,15,17


In [44]:
df.loc[df.A > 10,['C', 'D']]

Unnamed: 0,C,D
1,16,17
2,20,21


In [45]:
df = pd.DataFrame({
    '체중' : [80, 70, 65, 55, 52], 
    '신장' : [180, 177, 169, 190, 155],
    '성별' : ['남', '여', '남', '여', '여']
})

In [46]:
df[['체중', '신장']]

Unnamed: 0,체중,신장
0,80,180
1,70,177
2,65,169
3,55,190
4,52,155


In [47]:
df[df.성별 == '남']

Unnamed: 0,체중,신장,성별
0,80,180,남
2,65,169,남


In [48]:
df[df.성별 == '여']

Unnamed: 0,체중,신장,성별
1,70,177,여
3,55,190,여
4,52,155,여


In [49]:
data = {
    'names' : ['홍실동', '이준신', '장보호', '김유진', '강감탄'],
    'year' : [2014, 2015, 2016, 2017, 2018],
    'points' : [1.5, 1.7, 3.6, 2.4, 2.9]
}

df = pd.DataFrame(data)
df

Unnamed: 0,names,year,points
0,홍실동,2014,1.5
1,이준신,2015,1.7
2,장보호,2016,3.6
3,김유진,2017,2.4
4,강감탄,2018,2.9


In [50]:
# 데이터 프레임 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
names     5 non-null object
year      5 non-null int64
points    5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 200.0+ bytes


In [51]:
# 수치형 데이터 기초통계분석
df.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2016.0,2.42
std,1.581139,0.864292
min,2014.0,1.5
25%,2015.0,1.7
50%,2016.0,2.4
75%,2017.0,2.9
max,2018.0,3.6


In [52]:
# 데이터의 개수를 세는 메서드 : count(), NaN
s = pd.Series(range(10))
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [53]:
s.count()

9

In [55]:
# 인덱스 이름 변경
df.index.name = 'Nid'
# 열 인덱스의 이름 변경
df.columns.name = 'Info'
df

Info,names,year,points
Nid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,홍실동,2014,1.5
1,이준신,2015,1.7
2,장보호,2016,3.6
3,김유진,2017,2.4
4,강감탄,2018,2.9


In [56]:
# columns : 열 이름을 목록으로 추출
# index : 행 인덱스의 목록을 추출
# NaN : Not a Number

df2 = pd.DataFrame(data,
                  columns = ['year', 'names', 'points', 'penalty'],
                  index = ['one', 'two', 'three', 'four', 'five'])

df2

Unnamed: 0,year,names,points,penalty
one,2014,홍실동,1.5,
two,2015,이준신,1.7,
three,2016,장보호,3.6,
four,2017,김유진,2.4,
five,2018,강감탄,2.9,


In [57]:
df2['year']

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [58]:
df2.year

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [59]:
df2[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2017,2.4
five,2018,2.9


In [61]:
df2['penalty'] = 0.7
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍실동,1.5,0.7
two,2015,이준신,1.7,0.7
three,2016,장보호,3.6,0.7
four,2017,김유진,2.4,0.7
five,2018,강감탄,2.9,0.7


In [62]:
df2['penalty'] = [0.5, 0.7, 0.9, 1.0, 0.6]
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍실동,1.5,0.5
two,2015,이준신,1.7,0.7
three,2016,장보호,3.6,0.9
four,2017,김유진,2.4,1.0
five,2018,강감탄,2.9,0.6


In [63]:
df2['ages'] = range(10, 15)
df2

Unnamed: 0,year,names,points,penalty,ages
one,2014,홍실동,1.5,0.5,10
two,2015,이준신,1.7,0.7,11
three,2016,장보호,3.6,0.9,12
four,2017,김유진,2.4,1.0,13
five,2018,강감탄,2.9,0.6,14


In [64]:
del df2['ages']
df2

Unnamed: 0,year,names,points,penalty
one,2014,홍실동,1.5,0.5
two,2015,이준신,1.7,0.7
three,2016,장보호,3.6,0.9
four,2017,김유진,2.4,1.0
five,2018,강감탄,2.9,0.6


In [65]:
# [start : end-1]
df2[0:3]

Unnamed: 0,year,names,points,penalty
one,2014,홍실동,1.5,0.5
two,2015,이준신,1.7,0.7
three,2016,장보호,3.6,0.9


In [66]:
df2.loc['two']

year       2015
names       이준신
points      1.7
penalty     0.7
Name: two, dtype: object

In [67]:
df2.loc['two':'four']

Unnamed: 0,year,names,points,penalty
two,2015,이준신,1.7,0.7
three,2016,장보호,3.6,0.9
four,2017,김유진,2.4,1.0


In [69]:
# loc[행 범위, 열 범위]
# 범위 => start:end, :(all)
df2.loc[:, ['year', 'names']]

Unnamed: 0,year,names
one,2014,홍실동
two,2015,이준신
three,2016,장보호
four,2017,김유진
five,2018,강감탄


In [71]:
df2.iloc[3]

year       2017
names       김유진
points      2.4
penalty       1
Name: four, dtype: object

In [72]:
# iloc[행범위, 열범위]
df2.iloc[:, 1:4]

Unnamed: 0,names,points,penalty
one,홍실동,1.5,0.5
two,이준신,1.7,0.7
three,장보호,3.6,0.9
four,김유진,2.4,1.0
five,강감탄,2.9,0.6


In [73]:
df2.iloc[1, 1]

'이준신'

In [75]:
df2.loc[df2['year']>2016, :]

Unnamed: 0,year,names,points,penalty
four,2017,김유진,2.4,1.0
five,2018,강감탄,2.9,0.6


In [76]:
# 6 x 4
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,-0.712355,-0.237506,0.714179,-0.412531
1,-1.926929,0.912036,-1.004986,-1.024656
2,-0.078228,1.121381,-0.091876,0.203771
3,-0.349751,-0.656093,-0.333768,0.228164
4,0.68291,1.318803,0.809859,-0.230663
5,-0.737246,0.637083,0.679406,-2.763325


In [77]:
# 생성된 데이터 프레임에 열 인덱스와 행 인덱스 값
df.columns = ['A', 'B', 'C', 'D']
# date_range('시작날짜', 옵션)
df.index = pd.date_range('20210419', periods = 6)

df

Unnamed: 0,A,B,C,D
2021-04-19,-0.712355,-0.237506,0.714179,-0.412531
2021-04-20,-1.926929,0.912036,-1.004986,-1.024656
2021-04-21,-0.078228,1.121381,-0.091876,0.203771
2021-04-22,-0.349751,-0.656093,-0.333768,0.228164
2021-04-23,0.68291,1.318803,0.809859,-0.230663
2021-04-24,-0.737246,0.637083,0.679406,-2.763325


In [78]:
# D 컬럼을 삭제하는 작업
# axis = 1, 열
# axis = 0, 행
df.drop('D', axis = 1)

Unnamed: 0,A,B,C
2021-04-19,-0.712355,-0.237506,0.714179
2021-04-20,-1.926929,0.912036,-1.004986
2021-04-21,-0.078228,1.121381,-0.091876
2021-04-22,-0.349751,-0.656093,-0.333768
2021-04-23,0.68291,1.318803,0.809859
2021-04-24,-0.737246,0.637083,0.679406


In [80]:
# B, C 컬럼 삭제
df.drop(['B', 'C'], axis = 1)

Unnamed: 0,A,D
2021-04-19,-0.712355,-0.412531
2021-04-20,-1.926929,-1.024656
2021-04-21,-0.078228,0.203771
2021-04-22,-0.349751,0.228164
2021-04-23,0.68291,-0.230663
2021-04-24,-0.737246,-2.763325


In [85]:
df = pd.DataFrame({
    'weight' : [80, 70.4, 65.5, 45.9, 51.2],
    'height' : [170, 180, 155, 143, 144]
})
df

Unnamed: 0,weight,height
0,80.0,170
1,70.4,180
2,65.5,155
3,45.9,143
4,51.2,144


In [87]:
# 세로방향을 합계를 구하는 작업, 컬럼별 합계
df.sum(axis = 0)

weight    313.0
height    792.0
dtype: float64

In [88]:
# 가로방향의 합계(각 행의 합계)
df.sum(axis = 1)

0    250.0
1    250.4
2    220.5
3    188.9
4    195.2
dtype: float64

In [90]:
# 신장의 평균값
df['height'].mean()

158.4

In [91]:
df['weight'].mean()

62.6

In [92]:
# 분산
df['height'].var()

264.3

In [93]:
# value_counts : 각각의 값의 줄현 빈도수를 계산 하는 메서드
s = pd.Series(np.random.randint(6, size = 100))
s.tail()

95    2
96    0
97    1
98    1
99    3
dtype: int32

In [94]:
s.value_counts()

1    22
3    20
5    17
2    15
4    13
0    13
dtype: int64

In [98]:
# 데이터 정렬 : sort_index / sort_values
# sort_index : 인덱스 기준
# sort_valuse : 데이터 값 기준
# 내림차순 : ascending = False
# by=열
s.value_counts().sort_index()

0    13
1    22
2    15
3    20
4    13
5    17
dtype: int64

In [97]:
s.value_counts().sort_values(ascending = False)

1    22
3    20
5    17
2    15
0    13
4    13
dtype: int64

In [100]:
# 체중을 이용해서 정렬
df.sort_values(by='weight')

Unnamed: 0,weight,height
3,45.9,143
4,51.2,144
2,65.5,155
1,70.4,180
0,80.0,170


In [108]:
# 행/열 합계
# sum(axis) , 0-열(세로) 1-행(가로)
df2 = pd.DataFrame(np.random.randint(10, size = (4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,9,9,4,7,7,8,5,8
1,1,1,8,5,2,3,7,3
2,9,8,0,1,8,5,1,0
3,7,3,7,1,7,3,5,4


In [102]:
# 행 방향 합계
df2.sum(axis = 1)

0    37
1    50
2    28
3    27
dtype: int64

In [109]:
df2['RowSum'] = df2.sum(axis = 1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,9,9,4,7,7,8,5,8,57
1,1,1,8,5,2,3,7,3,30
2,9,8,0,1,8,5,1,0,32
3,7,3,7,1,7,3,5,4,37


In [104]:
df2.sum() # axis = 0은 생략가능

0          16
1          11
2          14
3          22
4          19
5          23
6          19
7          18
RowSum    142
dtype: int64

In [110]:
df2.loc['ColSum', :] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,9.0,9.0,4.0,7.0,7.0,8.0,5.0,8.0,57.0
1,1.0,1.0,8.0,5.0,2.0,3.0,7.0,3.0,30.0
2,9.0,8.0,0.0,1.0,8.0,5.0,1.0,0.0,32.0
3,7.0,3.0,7.0,1.0,7.0,3.0,5.0,4.0,37.0
ColSum,26.0,21.0,19.0,14.0,24.0,19.0,18.0,15.0,156.0


In [111]:
# apply() : 행과 열을 반복해서 특정 함수를 이용해 작업
df3 = pd.DataFrame({
    'A' : [1, 3, 4, 3, 4],
    'B' : [2, 3, 1, 2, 3],
    'C' : [1, 5, 2, 4, 4]
})
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [112]:
# 각 컬럼별 최대값에서 최소값의 차이를 구하는 작업
df3.apply(lambda x: x.max() - x.min())

A    3
B    2
C    4
dtype: int64

In [113]:
# 각 행별 최대값에서 최소값 차이를 구하는 작업
df3.apply(lambda x: x.max() - x.min(), axis = 1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [114]:
# value_counts
# 각 열에 어떤 값이 얼마나 사용되었는지 알고싶다면
df3.apply(pd.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [115]:
# 정제 : 결측치, 이상치
# NaN => fillna(value)
df3.apply(pd.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [116]:
# as~ 변환, astype(자료형)
df3.apply(pd.value_counts).fillna(0.0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


In [None]:
# 매직명령어(magic)
%%writefile sample1.csv
c1, c2, c3
1, 1.11, one
2, 2.22, two
3, 3.33, three

In [3]:
df4 = pd.read_csv('sample1.csv')
df4

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [6]:
df5 = pd.read_csv('sample2.csv', header = None)
df5

Unnamed: 0,0,1,2
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three


In [7]:
df6 = pd.read_csv('sample2.csv', names = ['c1', 'c2', 'c3'])
df6

Unnamed: 0,c1,c2,c3
0,1,1.11,one
1,2,2.22,two
2,3,3.33,three
