In [1]:
import numpy as np
import pandas as pd

In [3]:
s = pd.Series(np.random.randint(0, 10, 5))
s

0    0
1    7
2    2
3    3
4    9
dtype: int32

In [6]:
df = pd.DataFrame(np.random.randint(0,10,(3,3)), columns = ['A','B','C'])
df

Unnamed: 0,A,B,C
0,7,9,5
1,8,0,3
2,6,9,5


In [7]:
# .exp() - 지수함수(e^x)로 변환
np.exp(s)

0       1.000000
1    1096.633158
2       7.389056
3      20.085537
4    8103.083928
dtype: float64

In [8]:
np.cos(df * np.pi / 4)

Unnamed: 0,A,B,C
0,0.7071068,0.707107,-0.707107
1,1.0,1.0,-0.707107
2,-1.83697e-16,0.707107,-0.707107


In [9]:
#.add() - 덧셈
s1 = pd.Series([1,2,3], index=[0,1,2])
s2 = pd.Series([4,5,6], index=[1,2,3])

s1.add(s2, fill_value=0) #fill_value 로 NaN 값 설정 가능

0    1.0
1    6.0
2    8.0
3    6.0
dtype: float64

In [10]:
df1 = pd.DataFrame(np.random.randint(0, 20, (5,5)), columns=list('BAECD'))
df1

Unnamed: 0,B,A,E,C,D
0,17,17,9,17,14
1,16,8,9,8,8
2,4,3,8,14,14
3,6,7,6,15,6
4,0,9,17,12,10


In [11]:
df2 = pd.DataFrame(np.random.randint(0, 20, (3,3)), columns=list('ACD'))
df2

Unnamed: 0,A,C,D
0,8,1,18
1,15,9,0
2,0,13,17


In [12]:
df1 + df2

Unnamed: 0,A,B,C,D,E
0,25.0,,18.0,32.0,
1,23.0,,17.0,8.0,
2,3.0,,27.0,31.0,
3,,,,,
4,,,,,


In [13]:
fvalue = df1.stack().mean()
df1.add(df2, fill_value=fvalue)

Unnamed: 0,A,B,C,D,E
0,25.0,27.16,18.0,32.0,19.16
1,23.0,26.16,17.0,8.0,19.16
2,3.0,14.16,27.0,31.0,18.16
3,17.16,16.16,25.16,16.16,16.16
4,19.16,10.16,22.16,20.16,27.16


In [18]:
#정렬
s = pd.Series(range(5), index=['A','D','B','E','C'])
s

A    0
D    1
B    2
E    3
C    4
dtype: int64

In [19]:
s.sort_index()

A    0
B    2
C    4
D    1
E    3
dtype: int64

In [20]:
s.sort_values()

A    0
D    1
B    2
E    3
C    4
dtype: int64

In [21]:
df = pd.DataFrame(np.random.randint(0, 10, (4,4)), index=[2,4,1,3], columns=list('BDAC'))
df

Unnamed: 0,B,D,A,C
2,3,2,6,7
4,7,0,0,2
1,2,6,3,6
3,5,6,0,4


In [22]:
df.sort_index(axis=1) # 축 지정 가능

Unnamed: 0,B,D,A,C
1,2,6,3,6
2,3,2,6,7
3,5,6,0,4
4,7,0,0,2


In [24]:
df.sort_values(by='A') #기준 지정 여러개 가능

Unnamed: 0,B,D,A,C
4,7,0,0,2
3,5,6,0,4
1,2,6,3,6
2,3,2,6,7


In [25]:
# 순위
s = pd.Series([-2,4,7,8,3,0,5])
s

0   -2
1    4
2    7
3    8
4    3
5    0
6    5
dtype: int64

In [26]:
s.rank()

0    1.0
1    4.0
2    6.0
3    7.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [28]:
s.rank(method='average')

0    1.0
1    4.0
2    6.0
3    7.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [29]:
# 데이터 결합
# concat() - 두 테이블을 합침
# append() - 테이블에 새 요소 추가

s1 = pd.Series(['a', 'b'], index=[1,2])
s2 = pd.Series(['c', 'd'], index=[3,4])
pd.concat([s1, s2])

1    a
2    b
3    c
4    d
dtype: object

In [30]:
def create_df(cols, idx):
    data = {c: [str(c.lower()) + str(i) for i in idx] for c in cols}
    return pd.DataFrame(data, idx)

In [31]:
df1 = create_df('AB', [0, 1])
df2 = create_df('DF', [2, 3])

In [32]:
df1.append(df2)

Unnamed: 0,A,B,D,F
0,a0,b0,,
1,a1,b1,,
2,,,d2,f2
3,,,d3,f3


In [33]:
#병합과 조인
df1 = pd.DataFrame({'학생':['홍', '이', '임'],
                   '학과':['경영','교육','컴퓨터']})
df1

Unnamed: 0,학생,학과
0,홍,경영
1,이,교육
2,임,컴퓨터


In [35]:
df2 = pd.DataFrame({'학생':['홍', '이', '임'],
                   '입학년도':[2012, 2015, 2017]})
df2

Unnamed: 0,학생,입학년도
0,홍,2012
1,이,2015
2,임,2017


In [36]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,학생,학과,입학년도
0,홍,경영,2012
1,이,교육,2015
2,임,컴퓨터,2017


In [37]:
# 데이터 집계와 그룹 연산

In [39]:
df = pd.DataFrame([[1, 1.2, np.nan],
                 [2.4, 5.5, 4.2],
                 [np.nan, np.nan, np.nan],
                 [0.44, -3.1, -4.1]],
                 index=[1,2,3,4],
                 columns=['A','B','C'])
df

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2
3,,,
4,0.44,-3.1,-4.1


In [40]:
df.head(2)

Unnamed: 0,A,B,C
1,1.0,1.2,
2,2.4,5.5,4.2


In [41]:
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,2.0
mean,1.28,1.2,0.05
std,1.009554,4.3,5.868986
min,0.44,-3.1,-4.1
25%,0.72,-0.95,-2.025
50%,1.0,1.2,0.05
75%,1.7,3.35,2.125
max,2.4,5.5,4.2


In [42]:
# 그룹 연산
df = pd.DataFrame({'c1':['a','a','b','b','c','d','b'],
                  'c2':['A','B','B','A','D','C','C'],
                  'c3':np.random.randint(7),
                  'c4':np.random.random(7)})
df

Unnamed: 0,c1,c2,c3,c4
0,a,A,4,0.527081
1,a,B,4,0.792076
2,b,B,4,0.699265
3,b,A,4,0.013614
4,c,D,4,0.398427
5,d,C,4,0.103523
6,b,C,4,0.399349


In [44]:
df['c3'].groupby(df['c1']).mean()

c1
a    4.0
b    4.0
c    4.0
d    4.0
Name: c3, dtype: float64

In [46]:
# 피벗 테이블
df.pivot_table(['c3','c4'],index=['c1'],columns=['c2'], margins=True)

Unnamed: 0_level_0,c3,c3,c3,c3,c3,c4,c4,c4,c4,c4
c2,A,B,C,D,All,A,B,C,D,All
c1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
a,4.0,4.0,,,4.0,0.527081,0.792076,,,0.659578
b,4.0,4.0,4.0,,4.0,0.013614,0.699265,0.399349,,0.370743
c,,,,4.0,4.0,,,,0.398427,0.398427
d,,,4.0,,4.0,,,0.103523,,0.103523
All,4.0,4.0,4.0,4.0,4.0,0.270347,0.745671,0.251436,0.398427,0.419048


In [53]:
# 범주형
s = pd.Series(['c1','c2','c1','c2','c1'] * 2)
s

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
dtype: object

In [55]:
df = pd.DataFrame({'id':np.arange(len(s)), 'c':s, 'v':np.random.randint(1000, 5000, size=len(s))})
df

Unnamed: 0,id,c,v
0,0,c1,3308
1,1,c2,2749
2,2,c1,3486
3,3,c2,3826
4,4,c1,4419
5,5,c1,1531
6,6,c2,2259
7,7,c1,1223
8,8,c2,2348
9,9,c1,4812


In [56]:
c = df['c'].astype('category')
c

0    c1
1    c2
2    c1
3    c2
4    c1
5    c1
6    c2
7    c1
8    c2
9    c1
Name: c, dtype: category
Categories (2, object): ['c1', 'c2']