# Lec29_データのマージ 

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame

In [2]:
dframe1 = DataFrame({'key':['X','Z','Y','Z','X','X'],'data_set_1':np.arange(6)})
dframe1

Unnamed: 0,data_set_1,key
0,0,X
1,1,Z
2,2,Y
3,3,Z
4,4,X
5,5,X


In [3]:
dframe2 = DataFrame({'key':['Q','Y','Z'],'data_set_2':[1,2,3]})
dframe2

Unnamed: 0,data_set_2,key
0,1,Q
1,2,Y
2,3,Z


In [4]:
# mergeメソッドで１対多の結合する
pd.merge(dframe1,dframe2)

# keyが一致する要素のみ結合される

Unnamed: 0,data_set_1,key,data_set_2
0,1,Z,3
1,3,Z,3
2,2,Y,2


In [5]:
#key列を指定し、joinの方法も選べる
pd.merge(dframe1,dframe2,on='key',how='left')

# 'key'列をキーとし、dframe1にleft outer join

Unnamed: 0,data_set_1,key,data_set_2
0,0,X,
1,1,Z,3.0
2,2,Y,2.0
3,3,Z,3.0
4,4,X,
5,5,X,


In [6]:
pd.merge(dframe1,dframe2,on='key',how='right')

# 'key'列をキーとし、dframe2にright outer join

Unnamed: 0,data_set_1,key,data_set_2
0,1.0,Z,3
1,3.0,Z,3
2,2.0,Y,2
3,,Q,1


In [7]:
dframe3 = DataFrame({'key':['X','X','X','Y','Z','Z'], 'data_set_3':range(6)})
dframe3

Unnamed: 0,data_set_3,key
0,0,X
1,1,X
2,2,X
3,3,Y
4,4,Z
5,5,Z


In [8]:
dframe4 = DataFrame({'key':['Y','Y','X','X','Z'], 'data_set_4':range(5)})
dframe4

Unnamed: 0,data_set_4,key
0,0,Y
1,1,Y
2,2,X
3,3,X
4,4,Z


In [9]:
# mergeメソッドで多対多の結合をする
pd.merge(dframe3,dframe4)

Unnamed: 0,data_set_3,key,data_set_4
0,0,X,2
1,0,X,3
2,1,X,2
3,1,X,3
4,2,X,2
5,2,X,3
6,3,Y,0
7,3,Y,1
8,4,Z,4
9,5,Z,4


In [10]:
# 複数列をkeyにmergeする
df_left = DataFrame({'key1':['SF','SF','LA'],
                     'key2':['one','two','one'],
                     'left_data':[10,20,30]})
df_left

Unnamed: 0,key1,key2,left_data
0,SF,one,10
1,SF,two,20
2,LA,one,30


In [11]:
df_right = DataFrame({'key1':['SF','SF','LA','LA'],
                     'key2':['one','one','one','two'],
                     'right_data':[40,50,60,70]})
df_right

Unnamed: 0,key1,key2,right_data
0,SF,one,40
1,SF,one,50
2,LA,one,60
3,LA,two,70


In [12]:
pd.merge(df_left,df_right,on=['key1','key2'],how='outer')

Unnamed: 0,key1,key2,left_data,right_data
0,SF,one,10.0,40.0
1,SF,one,10.0,50.0
2,SF,two,20.0,
3,LA,one,30.0,60.0
4,LA,two,,70.0


In [13]:
# 列名に重複がある場合は、自動で接尾辞がつく
pd.merge(df_left,df_right,on='key1')

Unnamed: 0,key1,key2_x,left_data,key2_y,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [14]:
# 接尾辞を指定する
pd.merge(df_left,df_right,on='key1',suffixes=('_left','_right'))

Unnamed: 0,key1,key2_left,left_data,key2_right,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


## pandas.merge
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.merge.html#pandas.merge

# Lec30_indexを使ったmerge

In [15]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [16]:
df_left = DataFrame({'key':['X','Y','X','X','Y'],'data':range(5)})
df_left

Unnamed: 0,data,key
0,0,X
1,1,Y
2,2,X
3,3,X
4,4,Y


In [17]:
df_right = DataFrame({'group_data':[10,20]},index=['X','Y'])
df_right

Unnamed: 0,group_data
X,10
Y,20


In [18]:
# key列とindexを使ったマージ
pd.merge(df_left,df_right,left_on='key',right_index=True)

Unnamed: 0,data,key,group_data
0,0,X,10
2,2,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


In [19]:
# df_leftにleft outer join
pd.merge(df_left,df_right,left_on='key',right_index=True,how='outer')

Unnamed: 0,data,key,group_data
0,0,X,10
2,2,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


In [20]:
# joinメソッドを使用する
df_left.join(df_right,how='outer')

Unnamed: 0,data,key,group_data
0,0.0,X,
1,1.0,Y,
2,2.0,X,
3,3.0,X,
4,4.0,Y,
X,,,10.0
Y,,,20.0


In [21]:
# 階層的なindexでのmerge
df_left_hr = DataFrame({'key1': ['SF','SF','SF','LA','LA'],
                   'key2': [10, 20, 30, 20, 30],
                   'data_set': np.arange(5.)})
df_left_hr

Unnamed: 0,data_set,key1,key2
0,0.0,SF,10
1,1.0,SF,20
2,2.0,SF,30
3,3.0,LA,20
4,4.0,LA,30


In [22]:
df_right_hr = DataFrame(np.arange(10).reshape((5, 2)),
                   index=[['LA','LA','SF','SF','SF'],
                          [20, 10, 10, 10, 20]],
                   columns=['col_1', 'col_2'])
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col_2
LA,20,0,1
LA,10,2,3
SF,10,4,5
SF,10,6,7
SF,20,8,9


In [23]:
# keyとindexでmerge
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True,how='outer')

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0.0,SF,10,4.0,5.0
0,0.0,SF,10,6.0,7.0
1,1.0,SF,20,8.0,9.0
2,2.0,SF,30,,
3,3.0,LA,20,0.0,1.0
4,4.0,LA,30,,
4,,LA,10,2.0,3.0


# Lec31_データの連結

In [24]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## arrayの連結

In [25]:
arr1 = np.arange(9).reshape([3,3])
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [26]:
arr2 = np.array([[10,11,12],[13,14,15],[16,17,18]])
arr2

array([[10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [27]:
# 行方向に連結する
np.concatenate([arr1,arr2])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [28]:
# 列方向に連結する
np.concatenate([arr1,arr2],axis=1)

array([[ 0,  1,  2, 10, 11, 12],
       [ 3,  4,  5, 13, 14, 15],
       [ 6,  7,  8, 16, 17, 18]])

## Series の連結

In [29]:
ser1 = Series([0,1,2],index=['A','B','C'])
ser1

A    0
B    1
C    2
dtype: int64

In [30]:
ser2 = Series([3,4],index=['D','E'])
ser2

D    3
E    4
dtype: int64

In [31]:
# 行方向に連結
pd.concat([ser1,ser2,ser1])

A    0
B    1
C    2
D    3
E    4
A    0
B    1
C    2
dtype: int64

In [32]:
# 階層的なindexをつけて連結する
pd.concat([ser1,ser2],keys=['cat1','cat2'])

cat1  A    0
      B    1
      C    2
cat2  D    3
      E    4
dtype: int64

In [33]:
# 列方向に連結する（DataFrameされる）
pd.concat([ser1,ser2],axis=1)

Unnamed: 0,0,1
A,0.0,
B,1.0,
C,2.0,
D,,3.0
E,,4.0


In [34]:
# 列に名称をつける
pd.concat([ser1,ser2],axis=1,keys=['cat1','cat2'])

Unnamed: 0,cat1,cat2
A,0.0,
B,1.0,
C,2.0,
D,,3.0
E,,4.0


In [35]:
# indexを指定して連結する
pd.concat([ser1,ser2],axis=1,join_axes=[['B','C','E']])

Unnamed: 0,0,1
B,1.0,
C,2.0,
E,,4.0


## DataFrameの連結 

In [36]:
dframe1 = DataFrame(np.random.randn(4,3), columns=['A','B','C'])
dframe1

Unnamed: 0,A,B,C
0,1.075474,1.075658,0.211429
1,0.01524,0.47,-0.96088
2,0.856945,0.718641,-0.404643
3,0.278745,-1.670241,-0.286379


In [37]:
dframe2 = DataFrame(np.random.randn(3,3), columns=['D','B','C'])
dframe2

Unnamed: 0,D,B,C
0,0.387414,-0.97868,0.459442
1,-0.43528,0.208438,0.403707
2,2.278594,-1.000347,0.844543


In [38]:
# DataFrameを連結する
pd.concat([dframe1,dframe2])

Unnamed: 0,A,B,C,D
0,1.075474,1.075658,0.211429,
1,0.01524,0.47,-0.96088,
2,0.856945,0.718641,-0.404643,
3,0.278745,-1.670241,-0.286379,
0,,-0.97868,0.459442,0.387414
1,,0.208438,0.403707,-0.43528
2,,-1.000347,0.844543,2.278594


In [39]:
# indexを振り直す
pd.concat([dframe1,dframe2],ignore_index=True)

Unnamed: 0,A,B,C,D
0,1.075474,1.075658,0.211429,
1,0.01524,0.47,-0.96088,
2,0.856945,0.718641,-0.404643,
3,0.278745,-1.670241,-0.286379,
4,,-0.97868,0.459442,0.387414
5,,0.208438,0.403707,-0.43528
6,,-1.000347,0.844543,2.278594


## Pandas.concat
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html

# Lec32_データを組み合わせる

In [40]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [41]:
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],index=['Q','R','S','T','U','V'])
ser1

Q    2.0
R    NaN
S    4.0
T    NaN
U    6.0
V    NaN
dtype: float64

In [42]:
ser2 = Series(np.arange(len(ser1),dtype=np.float64),index=['Q','R','S','T','U','V'])
ser2

Q    0.0
R    1.0
S    2.0
T    3.0
U    4.0
V    5.0
dtype: float64

In [43]:
# ser1のnull値をser2の値で埋める
Series(np.where(pd.isnull(ser1),ser2,ser1),index=ser1.index)

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    5.0
dtype: float64

In [44]:
# 上記を1行で記述
ser1.combine_first(ser2)

Q    2.0
R    1.0
S    4.0
T    3.0
U    6.0
V    5.0
dtype: float64

In [45]:
# 奇数のみを持つDataFrame
dframe_odds = DataFrame({'X': [1, np.nan, 3, np.nan],
                         'Y': [np.nan, 5, np.nan, 7],
                         'Z': [np.nan, 9, np.nan, 11]})
dframe_odds

Unnamed: 0,X,Y,Z
0,1.0,,
1,,5.0,9.0
2,3.0,,
3,,7.0,11.0


In [46]:
# 偶数のみを持つDataFrame
dframe_evens = DataFrame({'X': [2, 4, np.nan, 6, 8],
                          'Y': [np.nan, 10, 12, 14, 16]})
dframe_evens

Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [47]:
# 奇数のみを持つDataFrameに対し、DataFrameを連結してnull値には偶数を埋める
dframe_odds.combine_first(dframe_evens)

Unnamed: 0,X,Y,Z
0,1.0,,
1,4.0,5.0,9.0
2,3.0,12.0,
3,6.0,7.0,11.0
4,8.0,16.0,


# Lec33_SeriesとDataFrameの変換

In [48]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [49]:
dframe1 = DataFrame(np.arange(8).reshape([2,4]),
                    index=pd.Index(['HND','FUK'],name='city'),
                    columns=pd.Index(['A','B','C','D'], name='letter'))
dframe1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HND,0,1,2,3
FUK,4,5,6,7


In [50]:
# 列名をindexにしてSeriesを作成する
dframe_st = dframe1.stack()
dframe_st

city  letter
HND   A         0
      B         1
      C         2
      D         3
FUK   A         4
      B         5
      C         6
      D         7
dtype: int64

In [51]:
# 型の確認
type(dframe_st)

pandas.core.series.Series

In [52]:
# SeriesからDataFrameに戻す
dframe_st.unstack()

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HND,0,1,2,3
FUK,4,5,6,7


In [53]:
# 列名で指定しても同じ結果が得られる
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HND,0,1,2,3
FUK,4,5,6,7


In [54]:
# 行列を入れ替えてDataFrameに戻す
dframe_st.unstack(0)

city,HND,FUK
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [55]:
# 列名で指定しても同じ結果が得られる
dframe_st.unstack('city')

city,HND,FUK
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [56]:
ser1 = Series([0,1,2], index=['Q','X','Y'])
ser1

Q    0
X    1
Y    2
dtype: int64

In [57]:
ser2 = Series([4,5,6], index=['X','Y','Z'])
ser2

X    4
Y    5
Z    6
dtype: int64

In [58]:
dframe = pd.concat([ser1,ser2],keys=['Alpha','Beta'])
dframe

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64

In [59]:
# SeriesをDataFrameに変換
dframe_sr = dframe.unstack()
dframe_sr

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [60]:
# DataFrameをSeriesに変換(null値は自動で除外される)
dframe_sr.stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [61]:
# null値を残して変換
dframe_sr.stack(dropna='False')

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

# Lec34_ピボットテーブルの作り方

In [62]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### ここから下二つは意味不明なので後々解析

In [63]:
import pandas.util.testing as tm
tm.N = 3

In [64]:
def unpivot(frame):
    N, K = frame.shape
    data = {'value'   : frame.values.ravel('F'),
            'variable': np.asarray(frame.columns).repeat(N),
            'date'    : np.tile(np.asarray(frame.index), K)}
    return DataFrame(data, columns=['date','variable','value'])

dframe = unpivot(tm.makeTimeDataFrame())
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,1.528267
1,2000-01-04,A,0.682174
2,2000-01-05,A,-0.744592
3,2000-01-03,B,0.764126
4,2000-01-04,B,-0.752905
5,2000-01-05,B,0.031851
6,2000-01-03,C,-1.348011
7,2000-01-04,C,-2.135506
8,2000-01-05,C,-1.101521
9,2000-01-03,D,1.069246


In [65]:
# 行をdate、列をvriableに指定し、valueで値を埋める
dframe_piv = dframe.pivot('date','variable','value')
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,1.528267,0.764126,-1.348011,1.069246
2000-01-04,0.682174,-0.752905,-2.135506,-1.686734
2000-01-05,-0.744592,0.031851,-1.101521,0.43862


# Lec35_重複したデータの処理

In [66]:
import numpy as np
import pandas as pd
from pandas import DataFrame

In [67]:
dframe = DataFrame({'key1':['A'] * 2 + ['B'] * 3,
                    'key2':[2,2,2,3,3]})
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [68]:
# 重複有無の確認
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [69]:
# 重複行の削除
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [70]:
# 重複排除するkey列を指定可能(key2は、最初の値をとってくる)
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [71]:
# key2の値を最後を取るように指定する
dframe.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


# Lec36_マッピングを使った列の追加

In [72]:
from pandas import DataFrame

In [73]:
dframe = DataFrame({'city':['Alma', 'Brian Head', 'Fox Park'],
                    'altitude':[3158,3000,2762]})
dframe

Unnamed: 0,altitude,city
0,3158,Alma
1,3000,Brian Head
2,2762,Fox Park


In [74]:
# マッピングに使用する辞書を作成
state_map = {'Alma':'Colorado','Brian Head':'Utah','Fox Park':'Wyoming'}

In [75]:
# mapメソッドでマッピングする
dframe['state'] = dframe['city'].map(state_map)
dframe

Unnamed: 0,altitude,city,state
0,3158,Alma,Colorado
1,3000,Brian Head,Utah
2,2762,Fox Park,Wyoming


# Lec37_置換

In [76]:
import numpy as np
from pandas import Series

In [77]:
ser1 = Series([1,2,3,4,1,2,3,4])
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [78]:
# replaceメソッドを使って、データを置換する
ser1.replace(1, np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [79]:
# listを渡して、複数項目を同時に置換
ser1.replace([1,4],[100,400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [80]:
# 辞書型を渡しても置換可能
ser1.replace({1:np.nan,4:np.nan})

0    NaN
1    2.0
2    3.0
3    NaN
4    NaN
5    2.0
6    3.0
7    NaN
dtype: float64

# Lec38_indexの変更

In [81]:
import numpy as np
from pandas import DataFrame

In [82]:
dframe = DataFrame(np.arange(12).reshape(3,4),
                   index=['NY','LA','SF'],
                   columns=['A','B','C','D'])
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [83]:
# 文字列を小文字化する
str.lower('A')

'a'

In [84]:
# DataFrameの文字列を小文字化する
dframe.index.map(str.lower)

Index(['ny', 'la', 'sf'], dtype='object')

In [85]:
# DataFrameのindexを小文字化する
dframe.index = dframe.index.map(str.lower)
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [86]:
# 単語の最初の文字を大文字化する
str.title('have a nice day')

'Have A Nice Day'

In [87]:
# renameメソッドで行と列の名前を書き換える
dframe.rename(index=str.title, columns=str.lower)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [88]:
# renameは辞書型を渡すことも可能
dframe.rename(index={'ny':'New York','la':'Los Angels','sf':'San Francisco'},
              columns={'A':'1','B':'2','C':'3','D':'4'})

Unnamed: 0,1,2,3,4
New York,0,1,2,3
Los Angels,4,5,6,7
San Francisco,8,9,10,11


In [89]:
dframe.rename(index={'ny':'new york','la':'los angels','sf':'san francisco'},
              columns={'A':'1','B':'2','C':'3','D':'4'}, inplace=True)
dframe

Unnamed: 0,1,2,3,4
new york,0,1,2,3
los angels,4,5,6,7
san francisco,8,9,10,11


In [90]:
dframe.index = dframe.index.map(str.title)
dframe

Unnamed: 0,1,2,3,4
New York,0,1,2,3
Los Angels,4,5,6,7
San Francisco,8,9,10,11


# Lec39_ビニング(Binning)

In [91]:
import pandas as pd

In [92]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]

In [93]:
# 10年ごとにまとめるためのlistを作成しておく
decade_bins = [1960,1970,1980,1990,2000,2010,2020]

In [94]:
# cutメソッドでカテゴリ分けをする
decade_cat = pd.cut(years, decade_bins)
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [95]:
# カテゴリそのものを参照する
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]]
              closed='right',
              dtype='interval[int64]')

In [96]:
# カテゴリごとの分布を見る
pd.value_counts(decade_cat)

(2010, 2020]    3
(1990, 2000]    3
(2000, 2010]    2
(1980, 1990]    2
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [97]:
# カテゴリを２グループに分ける
pd.cut(years,2)

[(1968.954, 1992.0], (1968.954, 1992.0], (1968.954, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], ..., (1968.954, 1992.0], (1968.954, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], (1992.0, 2015.0]]
Length: 11
Categories (2, interval[float64]): [(1968.954, 1992.0] < (1992.0, 2015.0]]

# Lec40_外れ値

In [98]:
import numpy as np
from pandas import DataFrame

In [99]:
# seed値を設定して、乱数を発生させる
np.random.seed(12345)
dframe = DataFrame(np.random.randn(1000,4))
dframe.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [100]:
dframe.tail()

Unnamed: 0,0,1,2,3
995,1.089085,0.251232,-1.451985,1.653126
996,-0.478509,-0.010663,-1.060881,-1.50287
997,-1.946267,1.013592,0.037333,0.133304
998,-1.293122,-0.322542,-0.78296,-0.30334
999,0.089987,0.292291,1.177706,0.882755


In [101]:
# データの特徴を確認する
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [102]:
# 最初の列をSeriesとして取り出す
col = dframe[0]
col.head()

0   -0.204708
1    1.965781
2    0.769023
3    0.274992
4   -2.001637
Name: 0, dtype: float64

In [103]:
# 絶対値が３より大きい要素を取り出す
col[np.abs(col) > 3]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [104]:
# DataFrameの全要素で絶対値が３より大きい要素を取り出す
dframe[(np.abs(dframe)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [105]:
# DataFrameの要素の正負を参照する
np.sign(dframe)

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,1.0,1.0,1.0,1.0
4,-1.0,-1.0,1.0,-1.0
5,-1.0,1.0,1.0,-1.0
6,-1.0,1.0,1.0,1.0
7,1.0,1.0,-1.0,-1.0
8,-1.0,-1.0,-1.0,1.0
9,-1.0,1.0,-1.0,1.0


In [106]:
# 絶対値が３を超える要素を、絶対値が３になるように置き換える
dframe[np.abs(dframe)>3] = np.sign(dframe) *3
dframe.describe()
# 最大値、最小値が±3以内になった

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


# Lec41_行列の置換

In [107]:
import numpy as np
from pandas import DataFrame

In [108]:
dframe = DataFrame(np.arange(4*4).reshape((4,4)))
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [109]:
blender = np.random.permutation(4)
blender

array([1, 3, 2, 0])

In [110]:
# blenderに従って、行を入れ替える
dframe.take(blender)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


#### 母集団から取り出しては元に戻す動作を繰り返す

In [111]:
# 箱の中に、大吉,中吉,小吉の札があるとする
box = np.array(['大吉','中吉','小吉'])
box

array(['大吉', '中吉', '小吉'], 
      dtype='<U2')

In [112]:
shaker = np.random.randint(0, len(box), size=10)
shaker

array([2, 0, 2, 0, 2, 0, 2, 0, 2, 2])

In [113]:
# 札を取り出しては戻す動作を10回繰り返す
hand_grabs = box.take(shaker)
hand_grabs

array(['小吉', '大吉', '小吉', '大吉', '小吉', '大吉', '小吉', '大吉', '小吉', '小吉'], 
      dtype='<U2')