In [1]:
import numpy as np
import pandas as pd

In [2]:
# Four (first, second) observations; one value is missing in each column.
data = [[9.9, 8.8], [np.nan, 6.6], [7.7, np.nan], [0.99, 9.8]]
# Rows are labelled a-d so single rows can be selected by label later on.
df = pd.DataFrame(data, index=list("abcd"), columns=["first", "second"])
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [3]:
# Reduce down the rows: the result has one total per column.
df.sum(axis="index")

first     18.59
second    25.20
dtype: float64

In [12]:
# Total of the "first" column only.
df.loc[:, "first"].sum()

18.59

In [10]:
# Average of the "second" column only.
df.loc[:, "second"].mean()

8.4

In [11]:
# Total across the columns of the row labelled "a".
df.loc["a", :].sum()

18.700000000000003

In [13]:
# Per-row mean; with skipna=False any row that contains a NaN yields NaN instead of a number.
df.mean(axis="columns", skipna=False)

a    9.350
b      NaN
c      NaN
d    5.395
dtype: float64

In [14]:
# Mean of the "first" column (NaN skipped), used below to fill its missing value.
# Reduce only the column that is needed rather than every column of the frame.
first_mean = df["first"].mean()

In [15]:
# Display the mean of the "first" column computed in the previous cell.
first_mean

6.196666666666666

In [16]:
# Minimum of the "second" column (NaN skipped), used below to fill its missing value.
# Reduce only the column that is needed rather than every column of the frame.
second_min = df["second"].min()

In [17]:
# Display the minimum of the "second" column computed in the previous cell.
second_min

6.6

In [19]:
# Replace the missing entry in "first" with that column's mean.
df["first"] = df["first"].fillna(first_mean)

In [20]:
# Display df after the NaN in "first" was filled; "second" still has its NaN.
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,
d,0.99,9.8


In [21]:
# Replace the missing entry in "second" with that column's minimum.
df["second"] = df["second"].fillna(second_min)

In [22]:
# Display df after both columns have had their NaN filled.
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,6.6
d,0.99,9.8


In [25]:
# Fixed seed so Restart & Run All rebuilds exactly the same frame every time.
np.random.seed(0)
# 6 daily rows starting 2018-02-20, 4 columns of standard-normal samples.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    columns=["A", "B", "C", "D"],
    index=pd.date_range("20180220", periods=6)
)

In [26]:
# Display the randomly generated frame.
df2

Unnamed: 0,A,B,C,D
2018-02-20,-0.77842,-0.625813,-0.670107,1.758712
2018-02-21,-0.683797,1.334712,-1.55258,0.513848
2018-02-22,0.313467,-0.834368,0.474769,1.552199
2018-02-23,0.569629,0.239116,-0.122799,-1.488369
2018-02-24,-0.574625,0.75921,0.32826,-2.03491
2018-02-25,-2.600913,-0.528393,-1.315435,-1.321746


In [27]:
# Correlation coefficient for every pair of columns; each column paired with itself gives 1.
df2.corr(method="pearson")

Unnamed: 0,A,B,C,D
A,1.0,0.124381,0.65745,0.204087
B,0.124381,1.0,-0.264426,-0.397549
C,0.65745,-0.264426,1.0,-0.06742
D,0.204087,-0.397549,-0.06742,1.0


In [28]:
# Correlation between columns A and B only.
df2.loc[:, "A"].corr(df2.loc[:, "B"])

0.12438078964674289

In [29]:
# Correlation between columns B and C only.
df2.loc[:, "B"].corr(df2.loc[:, "C"])

-0.26442553306425454

In [31]:
# Covariance for every pair of columns in df2.
df2.cov()

Unnamed: 0,A,B,C,D
A,1.248324,0.120547,0.62129,0.377391
B,0.120547,0.752454,-0.194004,-0.570746
C,0.62129,-0.194004,0.71538,-0.094378
D,0.377391,-0.570746,-0.094378,2.73921


In [32]:
# Covariance between columns C and D only.
df2.loc[:, "C"].cov(df2.loc[:, "D"])

-0.09437760087191782

In [33]:
dates = df2.index
# Fixed seed so the shuffled row order is the same on every run.
np.random.seed(1)
# permutation() returns a shuffled copy of the row labels.
random_dates = np.random.permutation(dates)
# Rows follow the shuffled labels; columns follow the explicit order D, B, C, A
# (only the row order is random).
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [34]:
# Display df2 after its rows were shuffled and its columns reordered.
df2

Unnamed: 0,D,B,C,A
2018-02-23,-1.488369,0.239116,-0.122799,0.569629
2018-02-22,1.552199,-0.834368,0.474769,0.313467
2018-02-21,0.513848,1.334712,-1.55258,-0.683797
2018-02-24,-2.03491,0.75921,0.32826,-0.574625
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842


In [35]:
# Sort the rows by their index labels (the dates), ascending; returns a new frame.
df2.sort_index(axis="index")

Unnamed: 0,D,B,C,A
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842
2018-02-21,0.513848,1.334712,-1.55258,-0.683797
2018-02-22,1.552199,-0.834368,0.474769,0.313467
2018-02-23,-1.488369,0.239116,-0.122799,0.569629
2018-02-24,-2.03491,0.75921,0.32826,-0.574625
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913


In [38]:
# Sort the rows by their index labels (the dates), newest first.
df2.sort_index(axis="index", ascending=False)

Unnamed: 0,D,B,C,A
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913
2018-02-24,-2.03491,0.75921,0.32826,-0.574625
2018-02-23,-1.488369,0.239116,-0.122799,0.569629
2018-02-22,1.552199,-0.834368,0.474769,0.313467
2018-02-21,0.513848,1.334712,-1.55258,-0.683797
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842


In [39]:
# Sort the columns by their names (A, B, C, D); row order is left as it is.
df2.sort_index(axis="columns")

Unnamed: 0,A,B,C,D
2018-02-23,0.569629,0.239116,-0.122799,-1.488369
2018-02-22,0.313467,-0.834368,0.474769,1.552199
2018-02-21,-0.683797,1.334712,-1.55258,0.513848
2018-02-24,-0.574625,0.75921,0.32826,-2.03491
2018-02-25,-2.600913,-0.528393,-1.315435,-1.321746
2018-02-20,-0.77842,-0.625813,-0.670107,1.758712


In [41]:
# Order the rows by the values in column D, smallest first.
df2.sort_values("D")

Unnamed: 0,D,B,C,A
2018-02-24,-2.03491,0.75921,0.32826,-0.574625
2018-02-23,-1.488369,0.239116,-0.122799,0.569629
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913
2018-02-21,0.513848,1.334712,-1.55258,-0.683797
2018-02-22,1.552199,-0.834368,0.474769,0.313467
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842


In [42]:
# Order the rows by the values in column B, largest first.
df2.sort_values("B", ascending=False)

Unnamed: 0,D,B,C,A
2018-02-21,0.513848,1.334712,-1.55258,-0.683797
2018-02-24,-2.03491,0.75921,0.32826,-0.574625
2018-02-23,-1.488369,0.239116,-0.122799,0.569629
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842
2018-02-22,1.552199,-0.834368,0.474769,0.313467


In [43]:
# Fixed seed so column E holds the same integers on every run.
np.random.seed(2)
# randint draws random integers from 0 up to, but not including, 6.
df2["E"] = np.random.randint(0, 6, size=6)
# A label column with repeated values, used for the sorting and counting examples below.
df2["F"] = ["first", "second", "first", "third", "first", "second"]

In [44]:
# Display df2 with the new E and F columns.
df2

Unnamed: 0,D,B,C,A,E,F
2018-02-23,-1.488369,0.239116,-0.122799,0.569629,0,first
2018-02-22,1.552199,-0.834368,0.474769,0.313467,5,second
2018-02-21,0.513848,1.334712,-1.55258,-0.683797,2,first
2018-02-24,-2.03491,0.75921,0.32826,-0.574625,5,third
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913,2,first
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842,3,second


In [45]:
# Draw column E again, overwriting the values assigned earlier.
# Fixed seed so the redraw gives the same integers on every run.
np.random.seed(3)
df2["E"] = np.random.randint(0, 6, size=6)
df2

Unnamed: 0,D,B,C,A,E,F
2018-02-23,-1.488369,0.239116,-0.122799,0.569629,0,first
2018-02-22,1.552199,-0.834368,0.474769,0.313467,3,second
2018-02-21,0.513848,1.334712,-1.55258,-0.683797,5,first
2018-02-24,-2.03491,0.75921,0.32826,-0.574625,5,third
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913,4,first
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842,5,second


In [46]:
# Sort by E first; rows that tie on E are then ordered by F.
df2.sort_values(["E", "F"])

Unnamed: 0,D,B,C,A,E,F
2018-02-23,-1.488369,0.239116,-0.122799,0.569629,0,first
2018-02-22,1.552199,-0.834368,0.474769,0.313467,3,second
2018-02-25,-1.321746,-0.528393,-1.315435,-2.600913,4,first
2018-02-21,0.513848,1.334712,-1.55258,-0.683797,5,first
2018-02-20,1.758712,-0.625813,-0.670107,-0.77842,5,second
2018-02-24,-2.03491,0.75921,0.32826,-0.574625,5,third


In [49]:
# Distinct values of F with duplicates removed (comparable to SQL's DISTINCT).
pd.unique(df2["F"])

array(['first', 'second', 'third'], dtype=object)

In [51]:
# How many times each distinct value occurs in F.
df2.loc[:, "F"].value_counts()

first     3
second    2
third     1
Name: F, dtype: int64

In [52]:
# Fixed seed so Restart & Run All rebuilds exactly the same frame every time.
np.random.seed(4)
# 4 city rows, 3 columns of standard-normal samples.
df3 = pd.DataFrame(
    np.random.randn(4, 3),
    columns=["b", "d", "e"],
    index=["Seoul", "Incheon", "Busan", "Daegu"]
)

In [53]:
# Display the randomly generated city frame.
df3

Unnamed: 0,b,d,e
Seoul,-0.965342,0.10843,-0.541255
Incheon,0.392496,0.867019,-0.185686
Busan,0.281158,-0.263917,1.488005
Daegu,-1.293285,-1.439733,0.158454


In [58]:
def my_func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()`` whose results can be
    subtracted, such as one column or row handed over by ``DataFrame.apply``.
    """
    return x.max() - x.min()

In [59]:
# apply takes a function as its argument (a higher-order function) and calls it once per column here.
df3.apply(my_func, axis="index")

b    1.685781
d    2.306752
e    2.029260
dtype: float64

In [60]:
# Display df3 again; the apply call above was not assigned back to df3.
df3

Unnamed: 0,b,d,e
Seoul,-0.965342,0.10843,-0.541255
Incheon,0.392496,0.867019,-0.185686
Busan,0.281158,-0.263917,1.488005
Daegu,-1.293285,-1.439733,0.158454
