## 데이터프레임 생성

In [1]:
import pandas as pd

In [2]:
dict_data = {'c0':[1, 2, 3], 'c1':[4, 5, 6], 'c2':[7, 8, 9], 'c3':[10, 11, 12], 'c4':[13, 14, 15]}

In [3]:
df = pd.DataFrame(dict_data)

In [4]:
df

Unnamed: 0,c0,c1,c2,c3,c4
0,1,4,7,10,13
1,2,5,8,11,14
2,3,6,9,12,15


In [5]:
type(df)

pandas.core.frame.DataFrame

In [6]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [7]:
df.columns

Index(['c0', 'c1', 'c2', 'c3', 'c4'], dtype='object')

In [4]:
df2 = pd.DataFrame([[25, '남자', '율도국'], [17, '여자', '인당수']], index = ['홍길동', '심청'], columns = ['나이', '성별', '주소'])

In [20]:
df2

Unnamed: 0,나이,성별,주소
홍길동,25,남자,율도국
심청,17,여자,인당수


In [21]:
id(df2)

1929910075592

In [11]:
df2.index

Index(['홍길동', '심청'], dtype='object')

In [12]:
df2.columns

Index(['나이', '성별', '주소'], dtype='object')

In [22]:
df2.index = ['주인공1', '주인공2']  #인덱스 변경 가능

In [23]:
df2

Unnamed: 0,나이,성별,주소
주인공1,25,남자,율도국
주인공2,17,여자,인당수


In [24]:
df2.columns = ['age', 'gender', 'address']  #칼럼 변경 가능

In [25]:
df2

Unnamed: 0,age,gender,address
주인공1,25,남자,율도국
주인공2,17,여자,인당수


In [26]:
id(df2)

1929910075592

In [3]:
import numpy as np

my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
display(pd.DataFrame(my_2darray))

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6


In [5]:
display(df2)

Unnamed: 0,나이,성별,주소
홍길동,25,남자,율도국
심청,17,여자,인당수


In [6]:
df2

Unnamed: 0,나이,성별,주소
홍길동,25,남자,율도국
심청,17,여자,인당수


### 행/열 삭제

In [7]:
exam_data = {'수학': [90, 80, 70], '영어': [95, 89, 90], '국어': [100, 80, 75], '과학': [70, 70, 70]}

In [10]:
df3 = pd.DataFrame(exam_data, index = ['영희', '철수', '삼식'])

In [11]:
df3

Unnamed: 0,수학,영어,국어,과학
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [20]:
df4 = df3[:]   #새로운 DataFrame객체를 복제함(주솟값 복사 아님)

In [24]:
print(id(df3))
print(id(df4))

2410244759176
2410259583752


In [21]:
df4

Unnamed: 0,수학,영어,국어,과학
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [22]:
df4.drop('삼식')   #열 삭제

Unnamed: 0,수학,영어,국어,과학
영희,90,95,100,70
철수,80,89,80,70


In [23]:
df3

Unnamed: 0,수학,영어,국어,과학
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [25]:
df4   #그냥 drop하면 원본은 그대로임; 고정을 원한다면 inplace = True를 인수로 넣어주기

Unnamed: 0,수학,영어,국어,과학
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [26]:
df4.drop('수학', axis = 1, inplace = True)    #칼럼 삭제; '칼럼명', axis = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
df4

Unnamed: 0,영어,국어,과학
영희,95,100,70
철수,89,80,70
삼식,90,75,70


In [29]:
df4.drop(columns = '과학')   #칼럼 삭제; columns = '칼럼명'

Unnamed: 0,영어,국어
영희,95,100
철수,89,80
삼식,90,75


### 행/열 선택

In [43]:
df4

Unnamed: 0,영어,국어,과학
영희,95,100,70
철수,89,80,70
삼식,90,75,70


In [32]:
student1 = df4.loc['철수']   #label 이용하여 행 선택
student2 = df4.iloc[0]   #index 이용하여 행 선택

In [34]:
student1

영어    89
국어    80
과학    70
Name: 철수, dtype: int64

In [35]:
student2

영어     95
국어    100
과학     70
Name: 영희, dtype: int64

In [36]:
students = df4.iloc[[0, 1]]   #여러 행 선택할 때 .iloc[[idx1, idx2 ... ]]

In [38]:
students

Unnamed: 0,영어,국어,과학
영희,95,100,70
철수,89,80,70


In [40]:
students2 = df4.loc['철수':'삼식']   #'삼식' 데이터도 포함 (일반적인 슬라이싱과 다름)

In [42]:
students2

Unnamed: 0,영어,국어,과학
철수,89,80,70
삼식,90,75,70


In [44]:
exam_data = {'이름': ['영희', '철수', '삼식'], '수학': [90, 80, 70], '영어': [95, 89, 90], '국어': [100, 80, 75], '과학': [70, 70, 70]}

In [110]:
df5 = pd.DataFrame(exam_data)

In [48]:
df5['영어']

0    95
1    89
2    90
Name: 영어, dtype: int64

In [47]:
df5.영어

0    95
1    89
2    90
Name: 영어, dtype: int64

In [49]:
eng = df5.영어

In [50]:
eng

0    95
1    89
2    90
Name: 영어, dtype: int64

In [51]:
type(eng)

pandas.core.series.Series

In [53]:
subjects = df5[['수학', '국어']]

In [54]:
subjects

Unnamed: 0,수학,국어
0,90,100
1,80,80
2,70,75


In [55]:
type(subjects)

pandas.core.frame.DataFrame

In [59]:
df5

Unnamed: 0,이름,수학,영어,국어,과학
0,영희,90,95,100,70
1,철수,80,89,80,70
2,삼식,70,90,75,70


In [60]:
df5.set_index('이름', inplace = True)

In [61]:
df5

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [69]:
df5.loc['철수'][['수학', '과학']]

수학    80
과학    70
Name: 철수, dtype: int64

In [73]:
df5.loc['철수', ['수학', '과학']]  #둘 다 됨

수학    80
과학    70
Name: 철수, dtype: int64

In [74]:
df5.iloc[1][[0, 3]]

수학    80
과학    70
Name: 철수, dtype: int64

In [78]:
df5.iloc[1, [0, 3]]

수학    80
과학    70
Name: 철수, dtype: int64

In [76]:
df5.loc['철수']['국어':]

국어    80
과학    70
Name: 철수, dtype: int64

In [77]:
df5.loc['철수', '국어':]

국어    80
과학    70
Name: 철수, dtype: int64

In [75]:
df5.iloc[1][2:]

국어    80
과학    70
Name: 철수, dtype: int64

In [79]:
df5.iloc[1, 2:]

국어    80
과학    70
Name: 철수, dtype: int64

### 행/열 추가

In [98]:
df5

Unnamed: 0,이름,수학,영어,국어,과학
0,영희,90,95,100,70
1,철수,80,89,80,70
2,삼식,70,90,75,70


In [112]:
df5.loc[3] = 0  #일괄 추가

In [113]:
df5

Unnamed: 0,이름,수학,영어,국어,과학
0,영희,90,95,100,70
1,철수,80,89,80,70
2,삼식,70,90,75,70
3,0,0,0,0,0


In [114]:
df5.loc[4] = ['동식', 70, 80, 90, 100]   #칼럼별 추가

In [115]:
df5

Unnamed: 0,이름,수학,영어,국어,과학
0,영희,90,95,100,70
1,철수,80,89,80,70
2,삼식,70,90,75,70
3,0,0,0,0,0
4,동식,70,80,90,100


In [116]:
df5.loc['행5'] = df5.loc[3]   #기존 행 복사

In [117]:
df5

Unnamed: 0,이름,수학,영어,국어,과학
0,영희,90,95,100,70
1,철수,80,89,80,70
2,삼식,70,90,75,70
3,0,0,0,0,0
4,동식,70,80,90,100
행5,0,0,0,0,0


In [125]:
df5 = pd.DataFrame(exam_data)    #초기화

In [126]:
df5.set_index('이름', inplace = True)

In [127]:
df5.loc['철수',['수학', '과학']] = 50

In [128]:
df5

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,50,89,80,50
삼식,70,90,75,70


### 전치

In [131]:
df5 = df5.T  #고정

In [134]:
df5

이름,영희,철수,삼식
수학,90,50,70
영어,95,89,90
국어,100,80,75
과학,70,50,70


In [133]:
df5.transpose()   #고정x

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,50,89,80,50
삼식,70,90,75,70


### 인덱스 재지정

In [149]:
dict_data = {'c0':[1, 2, 3], 'c1':[4, 5, 6], 'c2':[7, 8, 9], 'c3':[10, 11, 12], 'c4':[13, 14, 15]}

In [150]:
df6 = pd.DataFrame(dict_data, index = ['r0', 'r1', 'r2'])

In [151]:
df6

Unnamed: 0,c0,c1,c2,c3,c4
r0,1,4,7,10,13
r1,2,5,8,11,14
r2,3,6,9,12,15


In [152]:
new_index = ['r0', 'r1', 'r2', 'r3', 'r4']

In [153]:
new_df = df6.reindex(new_index, fill_value = 0)   #reindex로 새롭게 발생된 NA를 0으로 채움

In [154]:
new_df

Unnamed: 0,c0,c1,c2,c3,c4
r0,1,4,7,10,13
r1,2,5,8,11,14
r2,3,6,9,12,15
r3,0,0,0,0,0
r4,0,0,0,0,0


In [155]:
new_df2 = df6.reset_index()    #reset_index 인자를 지정하지 않으면 다시 기본 정수형으로 행 인덱스 초기화

In [156]:
new_df2

Unnamed: 0,index,c0,c1,c2,c3,c4
0,r0,1,4,7,10,13
1,r1,2,5,8,11,14
2,r2,3,6,9,12,15


In [159]:
df6.sort_index(ascending = False)   #sort_index: 행 인덱스를 기준으로 내림차순 정렬

Unnamed: 0,c0,c1,c2,c3,c4
r2,3,6,9,12,15
r1,2,5,8,11,14
r0,1,4,7,10,13


In [161]:
df6.sort_values(by = 'c3', ascending = False)  #sort_values(by = '칼럼명')

Unnamed: 0,c0,c1,c2,c3,c4
r2,3,6,9,12,15
r1,2,5,8,11,14
r0,1,4,7,10,13


In [162]:
student1 = pd.Series({'국어':100, '영어':90, '수학':80})

In [163]:
student1

국어    100
영어     90
수학     80
dtype: int64

In [164]:
percentage = student1/300   #벡터연산

In [165]:
percentage

국어    0.333333
영어    0.300000
수학    0.266667
dtype: float64

In [166]:
type(percentage)

pandas.core.series.Series

In [170]:
student1 = pd.Series({'국어':np.nan, '영어':90, '수학':80})
student2 = pd.Series({'영어':90, '수학':80})

In [171]:
student1

국어     NaN
영어    90.0
수학    80.0
dtype: float64

In [172]:
student2

영어    90
수학    80
dtype: int64

In [179]:
student1 + student2   #라벨 인덱스 일치하는 값끼리 연산

국어      NaN
수학    160.0
영어    180.0
dtype: float64

In [176]:
student1 - student2

국어    NaN
수학    0.0
영어    0.0
dtype: float64

In [177]:
student1 / student2

국어    NaN
수학    1.0
영어    1.0
dtype: float64

In [178]:
student1 * student2

국어       NaN
수학    6400.0
영어    8100.0
dtype: float64

In [192]:
re_add = student1.add(student2, fill_value = 0)

In [210]:
re_add

국어      NaN
수학    160.0
영어    180.0
dtype: float64

In [193]:
re_sub = student1.sub(student2, fill_value = 0)

In [194]:
re_div = student1.div(student2, fill_value = 0)

In [195]:
re_mul = student1.mul(student2, fill_value = 0)

In [196]:
result = pd.DataFrame([re_add, re_div, re_mul, re_sub], index = ['덧셈', '나눗셈', '곱셈', '뺄셈'])

In [197]:
result

Unnamed: 0,국어,수학,영어
덧셈,,160.0,180.0
나눗셈,,1.0,1.0
곱셈,,6400.0,8100.0
뺄셈,,0.0,0.0


## Seaborn

In [198]:
import seaborn as sns

In [199]:
titanic = sns.load_dataset('titanic')

In [200]:
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [206]:
type(titanic)

pandas.core.frame.DataFrame

In [203]:
df = titanic.loc[:, ['age', 'fare']]

In [207]:
type(df)

pandas.core.frame.DataFrame

In [205]:
df.head()

Unnamed: 0,age,fare
0,22.0,7.25
1,38.0,71.2833
2,26.0,7.925
3,35.0,53.1
4,35.0,8.05


In [208]:
addition = df + 10  #벡터 연산 가능
addition.head()

Unnamed: 0,age,fare
0,32.0,17.25
1,48.0,81.2833
2,36.0,17.925
3,45.0,63.1
4,45.0,18.05


In [209]:
type(addition)

pandas.core.frame.DataFrame

In [211]:
subtraction = addition - df  #데이터프레임끼리도 벡터 연산 가능

In [216]:
subtraction.tail()

Unnamed: 0,age,fare
886,10.0,10.0
887,10.0,10.0
888,,10.0
889,10.0,10.0
890,10.0,10.0


In [217]:
type(subtraction)

pandas.core.frame.DataFrame

## 외부파일 읽어오기

#### csv 파일 읽어오기

In [220]:
file_path = './datas/read_csv_sample.csv'

df = pd.read_csv(file_path)

In [221]:
df

Unnamed: 0,c0,c1,c2,c3
0,0,1,4,7
1,1,2,5,8
2,2,3,6,9


In [222]:
type(df)

pandas.core.frame.DataFrame

In [224]:
df2 = pd.read_csv(file_path, header = None)

In [225]:
df2

Unnamed: 0,0,1,2,3
0,c0,c1,c2,c3
1,0,1,4,7
2,1,2,5,8
3,2,3,6,9


In [226]:
df3 = pd.read_csv(file_path, index_col = None)

In [227]:
df3

Unnamed: 0,c0,c1,c2,c3
0,0,1,4,7
1,1,2,5,8
2,2,3,6,9


In [228]:
df4 = pd.read_csv(file_path, index_col = 'c0')

In [229]:
df4

Unnamed: 0_level_0,c1,c2,c3
c0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,4,7
1,2,5,8
2,3,6,9


In [234]:
df.dtypes

c0    int64
c1    int64
c2    int64
c3    int64
dtype: object

#### excel 파일 읽어오기

In [238]:
file_path = './datas/남북한발전전력량.xlsx'

df = pd.read_excel(file_path)

In [239]:
df

Unnamed: 0,전력량 (억㎾h),발전 전력별,1990,1991,1992,1993,1994,1995,1996,1997,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,남한,합계,1077,1186,1310,1444,1650,1847,2055,2244,...,4031,4224,4336,4747,4969,5096,5171,5220,5281,5404
1,,수력,64,51,49,60,41,55,52,54,...,50,56,56,65,78,77,84,78,58,66
2,,화력,484,573,696,803,1022,1122,1264,1420,...,2551,2658,2802,3196,3343,3430,3581,3427,3402,3523
3,,원자력,529,563,565,581,587,670,739,771,...,1429,1510,1478,1486,1547,1503,1388,1564,1648,1620
4,,신재생,-,-,-,-,-,-,-,-,...,-,-,-,-,-,86,118,151,173,195
5,북한,합계,277,263,247,221,231,230,213,193,...,236,255,235,237,211,215,221,216,190,239
6,,수력,156,150,142,133,138,142,125,107,...,133,141,125,134,132,135,139,130,100,128
7,,화력,121,113,105,88,93,88,88,86,...,103,114,110,103,79,80,82,86,90,111
8,,원자력,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


In [240]:
df1 = pd.read_excel(file_path, header = None)

In [241]:
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,전력량 (억㎾h),발전 전력별,1990,1991,1992,1993,1994,1995,1996,1997,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
1,남한,합계,1077,1186,1310,1444,1650,1847,2055,2244,...,4031,4224,4336,4747,4969,5096,5171,5220,5281,5404
2,,수력,64,51,49,60,41,55,52,54,...,50,56,56,65,78,77,84,78,58,66
3,,화력,484,573,696,803,1022,1122,1264,1420,...,2551,2658,2802,3196,3343,3430,3581,3427,3402,3523
4,,원자력,529,563,565,581,587,670,739,771,...,1429,1510,1478,1486,1547,1503,1388,1564,1648,1620
5,,신재생,-,-,-,-,-,-,-,-,...,-,-,-,-,-,86,118,151,173,195
6,북한,합계,277,263,247,221,231,230,213,193,...,236,255,235,237,211,215,221,216,190,239
7,,수력,156,150,142,133,138,142,125,107,...,133,141,125,134,132,135,139,130,100,128
8,,화력,121,113,105,88,93,88,88,86,...,103,114,110,103,79,80,82,86,90,111
9,,원자력,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-


#### json

In [245]:
file_path = "./datas/df_sample.json"
df2 = pd.read_json(file_path)

In [246]:
df2

Unnamed: 0,algol,basic,c++
Jerry,A,C,B+
Riah,A+,B,C
Paul,B,B+,C+


In [247]:
df2.index

Index(['Jerry', 'Riah', 'Paul'], dtype='object')

In [248]:
df2.columns

Index(['algol', 'basic', 'c++'], dtype='object')

#### html

In [265]:
file_path = "./datas/sample.html"

tables = pd.read_html(file_path)

In [266]:
for i in range(len(tables)):
    print("tables[%s]"%i)
    print(tables[i])

tables[0]
   Unnamed: 0  c0  c1  c2  c3
0           0   0   1   4   7
1           1   1   2   5   8
2           2   2   3   6   9
tables[1]
         name  year        developer  opensource
0       NumPy  2006  Travis Oliphant        True
1  matplotlib  2003   John D. Hunter        True
2      pandas  2008    Wes Mckinneye        True


In [267]:
df3 = tables[1]

In [268]:
df3

Unnamed: 0,name,year,developer,opensource
0,NumPy,2006,Travis Oliphant,True
1,matplotlib,2003,John D. Hunter,True
2,pandas,2008,Wes Mckinneye,True


In [269]:
df3.set_index(['name'], inplace = True)

In [270]:
df3

Unnamed: 0_level_0,year,developer,opensource
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NumPy,2006,Travis Oliphant,True
matplotlib,2003,John D. Hunter,True
pandas,2008,Wes Mckinneye,True


## 외부 디렉토리에 파일 저장

In [271]:
exam_data

{'이름': ['영희', '철수', '삼식'],
 '수학': [90, 80, 70],
 '영어': [95, 89, 90],
 '국어': [100, 80, 75],
 '과학': [70, 70, 70]}

In [272]:
df = pd.DataFrame(exam_data)
df.set_index('이름', inplace = True)
df

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [273]:
import os

In [274]:
os.mkdir('output')

In [280]:
#csv
df.to_csv("./output/exam_sample.csv", encoding = 'euc-kr')

In [283]:
#json
df.to_json("./output/exam_sample.json")    #json은 인코딩 어떻게?

In [286]:
#excel
df.to_excel("./output/examp_sample.xlsx", encoding = 'euc-kr')

In [287]:
df2 = pd.DataFrame(dict_data)

In [288]:
df2

Unnamed: 0,c0,c1,c2,c3,c4
0,1,4,7,10,13
1,2,5,8,11,14
2,3,6,9,12,15


In [294]:
#개별 시트로도 저장할 수 있음
writer = pd.ExcelWriter("./output/df_excelwriter.xlsx")
df.to_excel(writer, sheet_name = "sheet1")
df2.to_excel(writer, sheet_name = "sheet2")
writer.save()

## 함수 적용

In [295]:
frame = pd.DataFrame(np.random.randn(4, 3), columns = list('bde'), index = ['Utah', 'Ohio', 'Texas', 'Oregon'])

frame

Unnamed: 0,b,d,e
Utah,-1.119389,-0.355683,-1.707551
Ohio,-0.088219,-1.358221,-0.607045
Texas,-0.14354,0.453892,0.342088
Oregon,0.462978,0.887172,0.245668


In [296]:
np.abs(frame)  #단항 유니버설 함수 .abs(): 절댓값 리턴

Unnamed: 0,b,d,e
Utah,1.119389,0.355683,1.707551
Ohio,0.088219,1.358221,0.607045
Texas,0.14354,0.453892,0.342088
Oregon,0.462978,0.887172,0.245668


In [297]:
#행 단위: .apply()
f = lambda x: x.max() - x.min()

frame.apply(f)

b    1.582367
d    2.245393
e    2.049639
dtype: float64

In [302]:
#열 단위: .apply(axis = 1)
frame.apply(f, axis = 1)

Utah      1.351867
Ohio      1.270001
Texas     0.597431
Oregon    0.641504
dtype: float64

In [303]:
def f(x):
    return pd.Series([x.min(), x.max()], index = ['min', 'max'])

frame.apply(f)

Unnamed: 0,b,d,e
min,-1.119389,-1.358221,-1.707551
max,0.462978,0.887172,0.342088


In [304]:
#데이터 단위: .applymap()
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-1.12,-0.36,-1.71
Ohio,-0.09,-1.36,-0.61
Texas,-0.14,0.45,0.34
Oregon,0.46,0.89,0.25


In [305]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [306]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [308]:
obj.rank(method = 'first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [309]:
df

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,80,89,80,70
삼식,70,90,75,70


In [314]:
df.idxmax()

수학    영희
영어    영희
국어    영희
과학    영희
dtype: object

In [313]:
df.cumsum()

Unnamed: 0_level_0,수학,영어,국어,과학
이름,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
영희,90,95,100,70
철수,170,184,180,140
삼식,240,274,255,210


## 상관관계 / 공분산

In [5]:
import pandas_datareader.data as web

In [7]:
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [8]:
all_data

{'AAPL':                   High         Low        Open       Close      Volume  \
 Date                                                                     
 2014-12-17  109.839996  106.820000  107.120003  109.410004  53411800.0   
 2014-12-18  112.650002  110.660004  111.870003  112.650002  59006200.0   
 2014-12-19  113.239998  111.660004  112.260002  111.779999  88429800.0   
 2014-12-22  113.489998  111.970001  112.160004  112.940002  45167500.0   
 2014-12-23  113.330002  112.459999  113.230003  112.540001  26028400.0   
 ...                ...         ...         ...         ...         ...   
 2019-12-10  270.070007  265.859985  268.600006  268.480011  22605100.0   
 2019-12-11  271.100006  268.500000  268.809998  270.769989  19689200.0   
 2019-12-12  272.559998  267.320007  267.779999  271.459991  34327600.0   
 2019-12-13  275.299988  270.929993  271.459991  275.149994  33396900.0   
 2019-12-16  280.649994  276.980011  277.000000  279.859985  29765105.0   
 
              A

In [13]:
price = pd.DataFrame({ticker:data['Adj Close'] for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker:data['Volume'] for ticker, data in all_data.items()})

In [14]:
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-12-17,100.527809,122.428116,41.063480,503.507629
2014-12-18,103.504761,127.061554,42.661484,509.700623
2014-12-19,102.705391,127.730415,42.787170,514.936218
2014-12-22,103.771225,130.091492,43.074444,523.432922
2014-12-23,103.403709,130.736130,43.496403,529.137268
...,...,...,...,...
2019-12-10,268.480011,133.910004,151.130005,1344.660034
2019-12-11,270.769989,133.759995,151.699997,1345.020020
2019-12-12,271.459991,135.320007,153.240005,1350.270020
2019-12-13,275.149994,134.210007,154.529999,1347.829956


In [15]:
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-12-17,53411800.0,5131000.0,34970900.0,2883100.0
2014-12-18,59006200.0,7302400.0,40105600.0,2926700.0
2014-12-19,88429800.0,8864900.0,64551200.0,3690200.0
2014-12-22,45167500.0,4682500.0,26566000.0,2723700.0
2014-12-23,26028400.0,4043300.0,23648100.0,2197600.0
...,...,...,...,...
2019-12-10,22605100.0,3480400.0,16476100.0,1094100.0
2019-12-11,19689200.0,3953300.0,18856600.0,850400.0
2019-12-12,34327600.0,4824100.0,24612100.0,1281000.0
2019-12-13,33396900.0,2535000.0,23845400.0,1549600.0


In [16]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-12-10,0.005844,-7.5e-05,-0.00152,0.000819
2019-12-11,0.008529,-0.00112,0.003772,0.000268
2019-12-12,0.002548,0.011663,0.010152,0.003903
2019-12-13,0.013593,-0.008203,0.008418,-0.001807
2019-12-16,0.017118,-0.000596,0.006471,0.009897


In [18]:
returns['MSFT'].corr(returns['IBM'])

0.4879996206338405

In [21]:
returns.MSFT.corr(returns.IBM)

0.4879996206338405

In [20]:
returns['MSFT'].cov(returns['IBM'])

9.348465031382588e-05

In [22]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.402014,0.575064,0.521959
IBM,0.402014,1.0,0.488,0.411852
MSFT,0.575064,0.488,1.0,0.658773
GOOG,0.521959,0.411852,0.658773,1.0


In [23]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000246,8.2e-05,0.000133,0.000124
IBM,8.2e-05,0.000169,9.3e-05,8.1e-05
MSFT,0.000133,9.3e-05,0.000217,0.000147
GOOG,0.000124,8.1e-05,0.000147,0.000229


In [24]:
returns.corrwith(returns.IBM)

AAPL    0.402014
IBM     1.000000
MSFT    0.488000
GOOG    0.411852
dtype: float64

In [25]:
returns.corrwith(volume)

AAPL   -0.117818
IBM    -0.129091
MSFT   -0.084323
GOOG   -0.000865
dtype: float64

### 유일값, 값세기, 멤버십

In [27]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [28]:
uniques = obj.unique()

In [29]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [30]:
uniques.sort()

In [31]:
uniques

array(['a', 'b', 'c', 'd'], dtype=object)

In [32]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [33]:
obj.value_counts(sort = False)

c    3
d    1
b    2
a    3
dtype: int64

In [39]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [34]:
mask = obj.isin(['b', 'c'])

In [36]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [38]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [43]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(to_match.unique())

In [44]:
unique_vals

0    c
1    a
2    b
dtype: object

In [45]:
pd.Index(unique_vals).get_indexer(to_match)

array([0, 1, 2, 2, 0, 1], dtype=int64)