In [1]:
import numpy as np
import pandas as pd

In [2]:
ser = pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'])
ser

a   -2.462334
b   -1.152108
c    0.165155
d   -1.370438
dtype: float64

In [3]:
ser.values

array([-2.46233357, -1.15210769,  0.16515482, -1.37043821])

In [4]:
ser.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [5]:
da = {'seoul' : 2000, 'busan' : 2500, 'daejeon' : 3000}

In [6]:
pd.Series(da)

seoul      2000
busan      2500
daejeon    3000
dtype: int64

In [7]:
da = {'a' : 0., 'b' : 1., 'c' : 2.}

In [8]:
pd.Series(da)

a    0.0
b    1.0
c    2.0
dtype: float64

In [9]:
pd.Series(da, index=['b', 'c', 'd', 'a'])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [10]:
pd.Series(7., index=['a', 'b', 'c'])

a    7.0
b    7.0
c    7.0
dtype: float64

In [11]:
ser[0]

-2.4623335680922405

In [12]:
ser[:3]

a   -2.462334
b   -1.152108
c    0.165155
dtype: float64

In [13]:
np.exp(ser)

a    0.085236
b    0.315970
c    1.179576
d    0.253996
dtype: float64

In [14]:
ser['a']

-2.4623335680922405

In [15]:
ser['d'] = 7.

In [16]:
ser

a   -2.462334
b   -1.152108
c    0.165155
d    7.000000
dtype: float64

In [17]:
ser + ser

a    -4.924667
b    -2.304215
c     0.330310
d    14.000000
dtype: float64

In [18]:
ser * 2

a    -4.924667
b    -2.304215
c     0.330310
d    14.000000
dtype: float64

In [19]:
ser[1:] + ser[:-1]

a         NaN
b   -2.304215
c    0.330310
d         NaN
dtype: float64

In [20]:
ser = pd.Series(np.random.randn(5), name='seoul')
ser

0    0.928059
1    1.353727
2   -0.340133
3    0.069274
4   -0.816523
Name: seoul, dtype: float64

In [21]:
ser1 = ser.rename("busan")

In [22]:
ser1.name

'busan'

In [23]:
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [24]:
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     4 non-null      float64
dtypes: float64(2)
memory usage: 96.0+ bytes


In [26]:
df.dtypes

one    float64
two    float64
dtype: object

In [27]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [28]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [29]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [30]:
df.columns

Index(['one', 'two'], dtype='object')

In [31]:
d = {'one': [1., 2., 3.], 'two': [3., 2., 1.]}

In [32]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,3.0
1,2.0,2.0
2,3.0,1.0


In [33]:
pd.DataFrame(d, index=['a', 'b', 'c'])

Unnamed: 0,one,two
a,1.0,3.0
b,2.0,2.0
c,3.0,1.0


In [34]:
arr = np.zeros((2,), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')])

In [35]:
arr[:] = [(1, 2., 'Hello'), (2, 3., 'World')]

In [36]:
arr

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [37]:
pd.DataFrame(arr)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [38]:
pd.DataFrame(arr, index=['first', 'second'])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [39]:
pd.DataFrame(arr, columns=['C', 'A', 'B'])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [40]:
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

In [41]:
pd.DataFrame(data)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [42]:
pd.DataFrame(data, columns=['a', 'b'])

Unnamed: 0,a,b
0,1,2
1,5,10


In [43]:
pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [44]:
dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])

{'A': [1, 2, 3], 'B': [4, 5, 6]}

In [45]:
pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [46]:
pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), orient='index',
columns=['one', 'two', 'three'])

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [47]:
arr

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [48]:
pd.DataFrame.from_records(arr, index='C')

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


In [49]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [50]:
df['two']

a    1.0
b    2.0
c    3.0
d    4.0
Name: two, dtype: float64

In [51]:
df['three'] = df['one'] * df['two']
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [52]:
del df['two']
df.pop('three')

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [53]:
df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [54]:
df['ha'] = 'hiho'
df

Unnamed: 0,one,flag,ha
a,1.0,False,hiho
b,2.0,False,hiho
c,3.0,True,hiho
d,,False,hiho


In [55]:
df['trunced_one'] = df['one'][:2]
df

Unnamed: 0,one,flag,ha,trunced_one
a,1.0,False,hiho,1.0
b,2.0,False,hiho,2.0
c,3.0,True,hiho,
d,,False,hiho,


In [56]:
df.insert(1, 'hi', df['one'])
df

Unnamed: 0,one,hi,flag,ha,trunced_one
a,1.0,1.0,False,hiho,1.0
b,2.0,2.0,False,hiho,2.0
c,3.0,3.0,True,hiho,
d,,,False,hiho,


In [57]:
ser = pd.Series(data=np.arange(3), index=['A', 'B', 'C'])
ser

A    0
B    1
C    2
dtype: int32

In [58]:
ser.drop(labels=['B', 'C'])

A    0
dtype: int32

In [59]:
df1= pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
df1

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [60]:
df1.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [61]:
df1.drop([0, 1])

Unnamed: 0,A,B,C,D
2,8,9,10,11


In [62]:
df.loc['b']

one                2
hi                 2
flag           False
ha              hiho
trunced_one        2
Name: b, dtype: object

In [63]:
df.iloc[2]

one               3
hi                3
flag           True
ha             hiho
trunced_one     NaN
Name: c, dtype: object

In [64]:
df = pd.DataFrame(np.random.randn(5, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
df+df2

Unnamed: 0,A,B,C,D
0,0.270188,-0.396434,0.794156,
1,1.672591,0.129014,0.597811,
2,2.625965,-1.629937,1.775333,
3,,,,
4,,,,


In [65]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,2.1219,0.860163,-0.326609,1.056635
2,1.574823,-0.28811,-0.440263,0.188779
3,2.029266,0.058574,-0.440267,1.572943
4,-0.876828,1.343589,-0.865059,0.259325


In [66]:
df * 10 + 2

Unnamed: 0,A,B,C,D
0,-6.355615,-4.289723,10.621515,-5.160652
1,14.863381,4.311908,7.355422,5.405699
2,9.39261,-7.170825,6.218886,-3.272861
3,13.937043,-3.703985,6.218843,10.568777
4,-15.123891,9.146163,1.970928,-2.567407


In [67]:
df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool)
df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool)
df1

Unnamed: 0,a,b
0,True,False
1,False,True
2,True,True


In [68]:
df2

Unnamed: 0,a,b
0,False,True
1,True,True
2,True,False


In [69]:
df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [70]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [71]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [72]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


In [73]:
df[:2]

Unnamed: 0,A,B,C,D
0,-0.835562,-0.628972,0.862151,-0.716065
1,1.286338,0.231191,0.535542,0.34057


In [74]:
df[:2].T

Unnamed: 0,0,1
A,-0.835562,1.286338
B,-0.628972,0.231191
C,0.862151,0.535542
D,-0.716065,0.34057


In [75]:
df3 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
np.exp(df3)

Unnamed: 0,A,B,C,D
0,1.0,2.718282,7.389056,20.085537
1,54.59815,148.413159,403.428793,1096.633158
2,2980.957987,8103.083928,22026.465795,59874.141715


In [76]:
np.asarray(df3)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [None]:
ind=pd.Index([1,3,5,7,9,11])
ind
ind[::2]
ind.ndim, ind.shape
ind[1]=6

In [None]:
pd.Index([1,2,3])
pd.Index(list('abc'))

In [None]:
df=pd.DataFrame(np.arnage(12).reshape(2,6), columns=list('ABCDEF'))
df
df.index

In [None]:
ser=pd.Series(['ha', 'hi']*1000)
ser
ser.nbytes
ser.astype('category')
ser.astype('category').nbytes

In [None]:
s1=pd.Categorical([1,2,3,1,2,3])
s1
type(s1)
s1.dtype
s2=pd.Categorical(['a, 'b', 'c', 'a', 'b', 'c'])
s2

In [None]:
s3=pd.Categorical(['a', 'b', 'c', 'a', 'b,', 'c'], ordered=True, categories=['c', 'b', 'a'])
s3
s3.min(), s3.max()

In [77]:
ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
ser

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [78]:
df = pd.DataFrame({'A': ['a', 'b', 'c', 'a']})
df['B'] = df['A'].astype('category')
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [79]:
df.dtypes

A      object
B    category
dtype: object

In [80]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype='category')
df.dtypes

A    category
B    category
dtype: object

In [81]:
df['A']

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [82]:
df['B']

0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (3, object): ['b', 'c', 'd']

In [83]:
df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
df_cat = df.astype('category')
df_cat.dtypes

A    category
B    category
dtype: object

In [84]:
arr = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
pd.MultiIndex.from_arrays(arr, names=('number', 'color'))

MultiIndex([(1,  'red'),
            (1, 'blue'),
            (2,  'red'),
            (2, 'blue')],
           names=['number', 'color'])

In [85]:
arr=[['ha', 'ha', 'hi', 'hi', 'ho', 'ho',], ['one', 'two', 'one', 'two', 'one', 'two']]
tuples = list(zip(*arr))
tuples

[('ha', 'one'),
 ('ha', 'two'),
 ('hi', 'one'),
 ('hi', 'two'),
 ('ho', 'one'),
 ('ho', 'two')]

In [86]:
ind = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
ind

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           names=['first', 'second'])

In [87]:
ser = pd.Series(np.random.randn(6), index=ind)
ser

first  second
ha     one      -2.190807
       two       0.842013
hi     one      -1.916039
       two       0.450694
ho     one      -0.501797
       two      -1.042649
dtype: float64

In [88]:
arr = [np.array(['ha', 'ha', 'hi', 'hi', 'ho', 'ho']),
np.array(['one','two','one','two','one','two'])]
ser = pd.Series(np.random.randn(6), index=arr)
ser

ha  one   -0.772656
    two   -0.222008
hi  one    1.022798
    two    0.345339
ho  one    1.194641
    two    2.200740
dtype: float64

In [89]:
df = pd.DataFrame(np.random.randn(6, 4), index=arr)
df

Unnamed: 0,Unnamed: 1,0,1,2,3
ha,one,0.333032,0.286146,-1.358321,-0.235383
ha,two,0.4357,-1.360301,-1.552782,0.641645
hi,one,-1.019732,0.820501,-0.494307,0.502008
hi,two,-1.710146,-0.739259,0.735056,1.551888
ho,one,-1.272207,0.633182,-0.217229,0.866496
ho,two,-0.408724,0.665195,-0.422924,-0.626811


In [90]:
df.index

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           )

In [91]:
df = pd.DataFrame(np.random.randn(3, 6), index=['A', 'B', 'C'], columns=ind)
df

first,ha,ha,hi,hi,ho,ho
second,one,two,one,two,one,two
A,-0.199263,1.156855,0.782776,-0.477027,-0.312097,-1.291807
B,-0.143447,0.746046,0.098691,0.873773,0.05075,1.379966
C,-0.373708,0.006994,0.292504,-0.991027,-1.262709,-1.583805


In [92]:
pd.DataFrame(np.random.randn(3, 4), index=ind[:3], columns=ind[:4])

Unnamed: 0_level_0,first,ha,ha,hi,hi
Unnamed: 0_level_1,second,one,two,one,two
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
ha,one,0.750292,1.743748,0.48732,-0.622411
ha,two,1.0578,0.206822,-0.881181,-0.838981
hi,one,-0.130969,-0.730972,0.606049,2.038428


In [93]:
pd.Series(np.random.randn(6), index=tuples)

(ha, one)    0.511563
(ha, two)    0.962255
(hi, one)    1.821547
(hi, two)    0.458133
(ho, one)   -0.268066
(ho, two)    0.799290
dtype: float64

In [94]:
ser = pd.Series(np.random.randn(1000))
ser.head()

0    0.443884
1   -0.193001
2   -0.723272
3   -0.978074
4   -0.694863
dtype: float64

In [95]:
ser.tail(3)

997    0.121917
998   -1.024379
999    0.792218
dtype: float64

In [96]:
ind = pd.date_range('1/1/2021', periods=5)
ser = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(5, 3), index=ind, columns=['A', 'B', 'C'])
df[:2]

Unnamed: 0,A,B,C
2021-01-01,-0.223117,0.686255,-0.206477
2021-01-02,-0.463381,-0.221826,0.60735


In [97]:
df = pd.DataFrame({'one': pd.Series(np.random.randn(2), index=['a','b']),
'two': pd.Series(np.random.randn(3), index=['a','b','c']),
'three': pd.Series(np.random.randn(2), index=['b','c'])})
df

Unnamed: 0,one,two,three
a,-0.285103,0.605265,
b,0.033046,1.532221,1.019538
c,,0.78058,0.905003


In [98]:
df.iloc[1]


one      0.033046
two      1.532221
three    1.019538
Name: b, dtype: float64

In [99]:
df['two']

a    0.605265
b    1.532221
c    0.780580
Name: two, dtype: float64

In [100]:
row = df.iloc[1]
col = df['two']

In [101]:
df.sub(row, axis='columns')

Unnamed: 0,one,two,three
a,-0.318149,-0.926956,
b,0.0,0.0,0.0
c,,-0.75164,-0.114536


In [102]:
df.sub(col, axis=0)

Unnamed: 0,one,two,three
a,-0.890368,0.0,
b,-1.499174,0.0,-0.512682
c,,0.0,0.124423


In [103]:
d = {'one': [1., 2., np.nan], 'two': [3., 2., 1.], 'three': [np.nan, 1., 1.]}
df = pd.DataFrame(d, index=list('abc'))
df

Unnamed: 0,one,two,three
a,1.0,3.0,
b,2.0,2.0,1.0
c,,1.0,1.0


In [104]:
d1 = {'one': pd.Series([1., 2.], index=['a', 'b']),
'two': pd.Series([1., 1., 1.], index=['a', 'b', 'c']),
'three': pd.Series([2., 2., 2.], index=['a', 'b', 'c'])}
df1 = pd.DataFrame(d1)
df1

Unnamed: 0,one,two,three
a,1.0,1.0,2.0
b,2.0,1.0,2.0
c,,1.0,2.0


In [105]:
df + df1

Unnamed: 0,one,two,three
a,2.0,4.0,
b,4.0,3.0,3.0
c,,2.0,3.0


In [106]:
df.add(df1, fill_value=0)

Unnamed: 0,one,two,three
a,2.0,4.0,2.0
b,4.0,3.0,3.0
c,,2.0,3.0


In [107]:
df = pd.DataFrame({'angles': [0, 3, 4], 'degrees': [360, 180, 360]},
index=['circle', 'triangle', 'rectangle'])
df
df+1
df - [1, 2]
df.sub([1, 2], axis='columns')
df1 = df.sub(pd.Series([1, 2, 3], index=['circle', 'triangle', 'rectangle']), axis='index')
df1
df

Unnamed: 0,one,two,three
a,1.0,3.0,
b,2.0,2.0,1.0
c,,1.0,1.0


In [108]:
df.mean(0)

one      1.5
two      2.0
three    1.0
dtype: float64

In [109]:
df.mean(1)

a    2.000000
b    1.666667
c    1.000000
dtype: float64

In [110]:
df.sum(0, skipna=False)

one      NaN
two      6.0
three    NaN
dtype: float64

In [111]:
df.sum(1, skipna=True)

a    4.0
b    5.0
c    2.0
dtype: float64

In [112]:
df.std()

one      0.707107
two      1.000000
three    0.000000
dtype: float64

In [113]:
df.std(axis=1)

a    1.414214
b    0.577350
c    0.000000
dtype: float64

In [114]:
np.std(df, axis=1)

a    1.000000
b    0.471405
c    0.000000
dtype: float64

In [115]:
np.std(df, ddof=1, axis=1)

a    1.414214
b    0.577350
c    0.000000
dtype: float64

In [116]:
df[['one', 'two', 'three']].std()

one      0.707107
two      1.000000
three    0.000000
dtype: float64

In [117]:
df.cumsum()

Unnamed: 0,one,two,three
a,1.0,3.0,
b,3.0,5.0,1.0
c,,6.0,2.0


In [118]:
np.mean(df['one'])

1.5

In [119]:
ser = pd.Series(np.random.randn(500))

In [120]:
ser[20:500] = np.nan
ser[10:20] = 5
ser.nunique()

11

In [121]:
ser = pd.Series(np.random.randn(1000))
ser[::2] = np.nan
ser.describe()

count    500.000000
mean      -0.025883
std        0.967017
min       -2.859892
25%       -0.712638
50%       -0.004027
75%        0.654425
max        2.678931
dtype: float64

In [122]:
df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
df.iloc[::2] = np.nan
df.describe()

Unnamed: 0,a,b,c,d
count,500.0,500.0,500.0,500.0
mean,0.051854,0.059304,0.094345,-0.04421
std,1.01667,1.025659,0.986368,0.948177
min,-4.006729,-2.466983,-3.004304,-3.124887
25%,-0.685083,-0.607831,-0.511271,-0.616326
50%,0.047578,0.046677,0.046629,-0.059434
75%,0.751916,0.755081,0.689319,0.522909
max,2.785636,3.058291,3.395941,2.80858


In [123]:
ser.describe(percentiles=[0.05, 0.25, .75, .95])

count    500.000000
mean      -0.025883
std        0.967017
min       -2.859892
5%        -1.539012
25%       -0.712638
50%       -0.004027
75%        0.654425
95%        1.573196
max        2.678931
dtype: float64

In [124]:
ser = pd.Series(['a','a','b','c','c',np.nan, 'c', 'd'])
ser.describe()

count     7
unique    4
top       c
freq      3
dtype: object

In [125]:
df = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)})
df.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [126]:
df.describe(include=['object'])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [127]:
df.describe(include=['number'])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [128]:
df.describe(include='all')

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


In [129]:
ser = pd.Series(np.random.randn(5))
ser

0   -0.201720
1    0.702317
2   -0.036248
3    0.410143
4   -0.446333
dtype: float64

In [130]:
ser.idxmin(), ser.idxmax()

(4, 1)

In [131]:
df = pd.DataFrame(np.random.randn(4, 3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,1.720478,0.775014,2.61962
1,-1.280321,0.686845,-1.300439
2,0.873629,-0.403519,-0.141074
3,-2.176719,-1.850905,-0.939048


In [132]:
df.idxmin(axis=0)

A    3
B    3
C    1
dtype: int64

In [133]:
df.idxmin()

A    3
B    3
C    1
dtype: int64

In [134]:
df.idxmax(axis=1)

0    C
1    B
2    A
3    C
dtype: object

In [135]:
df1 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba'))
df1

Unnamed: 0,A
e,2.0
d,1.0
c,1.0
b,3.0
a,


In [136]:
df1['A'].idxmin()

'd'

In [137]:
data = np.random.randint(0, 7, size=30)
data

array([6, 2, 2, 4, 5, 1, 6, 0, 6, 0, 4, 3, 5, 5, 5, 4, 2, 0, 1, 0, 0, 5,
       4, 3, 4, 5, 0, 4, 1, 4])

In [138]:
ser1 = pd.Series(data)
ser1.value_counts()
pd.value_counts(data)

4    7
5    6
0    6
6    3
2    3
1    3
3    2
dtype: int64

In [139]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [140]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)

([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))

In [141]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, labels=['bad', 'medium', 'good'])

['bad', 'good', 'medium', 'medium', 'good', 'bad']
Categories (3, object): ['bad' < 'medium' < 'good']

In [142]:
pd.cut([0, 1, 1, 2], bins=4, labels=False)

array([0, 1, 1, 3], dtype=int64)

In [143]:
ser = pd.Series(np.array([2, 4, 6, 8, 10]), index=['a', 'b', 'c', 'd', 'e'])

In [144]:
pd.cut(ser, 3)

a    (1.992, 4.667]
b    (1.992, 4.667]
c    (4.667, 7.333]
d     (7.333, 10.0]
e     (7.333, 10.0]
dtype: category
Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, 7.333] < (7.333, 10.0]]

In [145]:
pd.qcut(range(5), 4)

[(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 3.0] < (3.0, 4.0]]

In [146]:
pd.qcut(range(5), 3, labels=['good', 'medium', 'bad'])

['good', 'good', 'medium', 'bad', 'bad']
Categories (3, object): ['good' < 'medium' < 'bad']

In [147]:
pd.qcut(range(5), 4, labels=False)

array([0, 0, 1, 2, 3], dtype=int64)

In [148]:
pd.cut(np.random.randn(25), 5).value_counts()

(-2.283, -1.478]    3
(-1.478, -0.677]    6
(-0.677, 0.123]     6
(0.123, 0.924]      8
(0.924, 1.725]      2
dtype: int64

In [149]:
pd.qcut(np.random.randn(25), 5).value_counts()

(-2.2079999999999997, -0.925]    5
(-0.925, -0.0671]                5
(-0.0671, 0.289]                 5
(0.289, 0.657]                   5
(0.657, 1.418]                   5
dtype: int64

In [150]:
data = pd.DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], index=['A', 'B', 'C'],
columns=['one', 'two', 'three'])
data

Unnamed: 0,one,two,three
A,1,1,1
B,2,2,2
C,3,3,3


In [151]:
def add(data, arg1):
    data1 = data + arg1
    return data1
def div(data1, arg2):
    data2 = data1/arg2
    return data2
def mul(data2, arg3):
    data3 = data2 * arg3
    return data3
def sub(data3, arg4):
    data4 = data3 - arg4
    return data4
(data.pipe(add, arg1=2) # data에 2를 더하기
.pipe(div, arg2=3) # data에 2를 더한 결과에 3으로 나누기
.pipe(mul, arg3=5) # 위 결과를 3으로 나눈 후 5를 곱하기
.pipe(sub, arg4=1)) # 위 결과에서 1을 빼기

Unnamed: 0,one,two,three
A,4.0,4.0,4.0
B,5.666667,5.666667,5.666667
C,7.333333,7.333333,7.333333


In [152]:
df = pd.DataFrame()
df['name'] = ['jsun', 'jin', 'ujung', 'naeun', 'suho']
df['sex'] = ['female', 'male', 'female', 'female', 'male']
df['age'] = [84, 58, 53, 27, 18]
df

Unnamed: 0,name,sex,age
0,jsun,female,84
1,jin,male,58
2,ujung,female,53
3,naeun,female,27
4,suho,male,18


In [153]:
def mean_age_group(dataframe, col):
    return dataframe.groupby(col).mean()
def cap_column(dataframe):
    dataframe.columns = dataframe.columns.str.upper()
    return dataframe

In [154]:
(df.pipe(mean_age_group, col='sex')
.pipe(cap_column))

Unnamed: 0_level_0,AGE
sex,Unnamed: 1_level_1
female,54.666667
male,38.0


In [155]:
data = [{'one': 1.0, 'two': 1.2}, {'one': 0.5, 'two': 1.1, 'three': 0.7},
{'one': 0.7, 'two': 0.9, 'three': -1.6}, {'two': 1.4, 'three': -1.2}]
df = pd.DataFrame(data, index=list('abcd'))
df

Unnamed: 0,one,two,three
a,1.0,1.2,
b,0.5,1.1,0.7
c,0.7,0.9,-1.6
d,,1.4,-1.2


In [156]:
df.apply(np.mean)

one      0.733333
two      1.150000
three   -0.700000
dtype: float64

In [157]:
df.apply(np.mean, axis=1)

a    1.100000
b    0.766667
c    0.000000
d    0.100000
dtype: float64

In [158]:
df.apply(lambda x: x.max() - x.min())

one      0.5
two      0.5
three    2.3
dtype: float64

In [159]:
df.apply(np.cumsum)

Unnamed: 0,one,two,three
a,1.0,1.2,
b,1.5,2.3,0.7
c,2.2,3.2,-0.9
d,,4.6,-2.1


In [160]:
df.apply(np.exp)

Unnamed: 0,one,two,three
a,2.718282,3.320117,
b,1.648721,3.004166,2.013753
c,2.013753,2.459603,0.201897
d,,4.0552,0.301194


In [161]:
df.apply('mean')

one      0.733333
two      1.150000
three   -0.700000
dtype: float64

In [162]:
df.apply('mean', axis=1)

a    1.100000
b    0.766667
c    0.000000
d    0.100000
dtype: float64

In [None]:
df = pd.DataFrame([[1, 2, 3],
[4, 5, 6],
[7, 8, 9],
[np.nan, np.nan, np.nan]], columns=['A', 'B', 'C'])
df
df.agg(['sum', 'min'])
df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
df.agg("mean", axis="columns")

In [163]:
adf = pd.DataFrame(np.random.randn(6, 3), columns=['A','B','C'],
index=pd.date_range('7/1/2021', periods=6))

In [164]:
adf.iloc[2:4] = np.nan
adf

Unnamed: 0,A,B,C
2021-07-01,0.483135,-0.464654,0.040032
2021-07-02,-0.663777,0.622539,1.75253
2021-07-03,,,
2021-07-04,,,
2021-07-05,-0.984351,1.075634,-0.602753
2021-07-06,-0.201708,-1.572522,-0.478173


In [165]:
adf.agg(np.sum)

A   -1.366700
B   -0.339003
C    0.711636
dtype: float64

In [166]:
adf.agg('sum')

A   -1.366700
B   -0.339003
C    0.711636
dtype: float64

In [167]:
adf.sum()

A   -1.366700
B   -0.339003
C    0.711636
dtype: float64

In [168]:
adf.A.agg('sum')

-1.366700316487332

In [169]:
adf.agg(['sum'])

Unnamed: 0,A,B,C
sum,-1.3667,-0.339003,0.711636


In [170]:
adf.agg(['sum', 'mean'])

Unnamed: 0,A,B,C
sum,-1.3667,-0.339003,0.711636
mean,-0.341675,-0.084751,0.177909


In [171]:
adf.A.agg(['sum', lambda x: x.mean()])

sum        -1.366700
<lambda>   -0.341675
Name: A, dtype: float64

In [172]:
def mymean(x):
    return x.mean()

adf.A.agg(['sum', mymean])

sum      -1.366700
mymean   -0.341675
Name: A, dtype: float64

In [173]:
adf.agg({'A': 'mean', 'B': 'sum'})

A   -0.341675
B   -0.339003
dtype: float64

In [174]:
adf.agg({'A': ['mean', 'min'], 'B': 'sum'})

Unnamed: 0,A,B
mean,-0.341675,
min,-0.984351,
sum,,-0.339003


In [None]:
df = pd.DataFrame({'A': range(3), 'B': range(1, 4)})

In [175]:
adf.transform(np.abs)

Unnamed: 0,A,B,C
2021-07-01,0.483135,0.464654,0.040032
2021-07-02,0.663777,0.622539,1.75253
2021-07-03,,,
2021-07-04,,,
2021-07-05,0.984351,1.075634,0.602753
2021-07-06,0.201708,1.572522,0.478173


In [176]:
adf.A.transform(np.abs)

2021-07-01    0.483135
2021-07-02    0.663777
2021-07-03         NaN
2021-07-04         NaN
2021-07-05    0.984351
2021-07-06    0.201708
Freq: D, Name: A, dtype: float64

In [177]:
adf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2021-07-01,0.483135,1.483135,0.464654,0.535346,0.040032,1.040032
2021-07-02,0.663777,0.336223,0.622539,1.622539,1.75253,2.75253
2021-07-03,,,,,,
2021-07-04,,,,,,
2021-07-05,0.984351,0.015649,1.075634,2.075634,0.602753,0.397247
2021-07-06,0.201708,0.798292,1.572522,-0.572522,0.478173,0.521827


In [178]:
adf.A.transform([np.abs, lambda x: x + 1])

Unnamed: 0,absolute,<lambda>
2021-07-01,0.483135,1.483135
2021-07-02,0.663777,0.336223
2021-07-03,,
2021-07-04,,
2021-07-05,0.984351,0.015649
2021-07-06,0.201708,0.798292


In [179]:
adf.transform({'A': np.abs, 'B': lambda x: x + 1})

Unnamed: 0,A,B
2021-07-01,0.483135,0.535346
2021-07-02,0.663777,1.622539
2021-07-03,,
2021-07-04,,
2021-07-05,0.984351,2.075634
2021-07-06,0.201708,-0.572522


In [180]:
adf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']})

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,sqrt
2021-07-01,0.483135,0.535346,
2021-07-02,0.663777,1.622539,0.789012
2021-07-03,,,
2021-07-04,,,
2021-07-05,0.984351,2.075634,1.037128
2021-07-06,0.201708,-0.572522,


In [181]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
df

Unnamed: 0,0,1
0,1.0,2.12
1,3.356,4.567


In [182]:
df.applymap(lambda x: len(str(x)))

Unnamed: 0,0,1
0,3,4
1,5,5


In [183]:
df.applymap(lambda x: x**2)

Unnamed: 0,0,1
0,1.0,4.4944
1,11.262736,20.857489


In [184]:
df ** 2

Unnamed: 0,0,1
0,1.0,4.4944
1,11.262736,20.857489


In [185]:
ser = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
ser

0       cat
1       dog
2       NaN
3    rabbit
dtype: object

In [186]:
ser.map({'cat': 'kitten', 'dog': 'puppy'})

0    kitten
1     puppy
2       NaN
3       NaN
dtype: object

In [187]:
ser.map('I am a {}'.format)

0       I am a cat
1       I am a dog
2       I am a nan
3    I am a rabbit
dtype: object

In [188]:
ser.map('I am a {}'.format, na_action='ignore')

0       I am a cat
1       I am a dog
2              NaN
3    I am a rabbit
dtype: object

In [189]:
df1 = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'),
index=pd.date_range('20210701', periods=5))
df1

Unnamed: 0,A,B,C,D
2021-07-01,-0.524846,0.185653,-1.166374,0.991508
2021-07-02,-1.427521,0.398024,0.223145,0.335193
2021-07-03,-1.21352,-0.61419,0.034336,0.662748
2021-07-04,0.432688,-0.352535,-1.766213,0.459002
2021-07-05,0.074163,1.129233,-1.349782,0.588014


In [190]:
df1.loc['20210702':'20210703']

Unnamed: 0,A,B,C,D
2021-07-02,-1.427521,0.398024,0.223145,0.335193
2021-07-03,-1.21352,-0.61419,0.034336,0.662748


In [191]:
ser1 = pd.Series(np.random.randn(4), index=list('abcd'))
ser1

a   -1.413671
b    0.136665
c    1.520398
d   -0.408562
dtype: float64

In [192]:
ser1.loc['c':]

c    1.520398
d   -0.408562
dtype: float64

In [193]:
ser1.loc['b']

0.1366648525726898

In [194]:
ser1.loc['c':] = 0
ser1

a   -1.413671
b    0.136665
c    0.000000
d    0.000000
dtype: float64

In [195]:
df1 = pd.DataFrame(np.random.randn(5, 4), index=list('abcde'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-1.892066,1.093842,-0.389955,1.146634
b,0.374786,-0.992391,0.944316,0.528259
c,-0.869031,0.550556,0.587877,0.511641
d,-0.2394,-0.511048,-0.231562,0.94928
e,-0.679697,-1.314385,-1.287576,-0.600074


In [196]:
df1.loc[['a', 'b', 'd'], :]

Unnamed: 0,A,B,C,D
a,-1.892066,1.093842,-0.389955,1.146634
b,0.374786,-0.992391,0.944316,0.528259
d,-0.2394,-0.511048,-0.231562,0.94928


In [197]:
df1.loc['c':, 'A':'C']

Unnamed: 0,A,B,C
c,-0.869031,0.550556,0.587877
d,-0.2394,-0.511048,-0.231562
e,-0.679697,-1.314385,-1.287576


In [198]:
df1.loc['a']

A   -1.892066
B    1.093842
C   -0.389955
D    1.146634
Name: a, dtype: float64

In [199]:
ser = pd.Series(list('abcde'), index=[0, 3, 2, 5, 4])
ser

0    a
3    b
2    c
5    d
4    e
dtype: object

In [200]:
ser.loc[3:5]

3    b
2    c
5    d
dtype: object

In [201]:
ser.sort_index()

0    a
2    c
3    b
4    e
5    d
dtype: object

In [202]:
ser.sort_index().loc[1:6]

2    c
3    b
4    e
5    d
dtype: object

In [203]:
ser1 = pd.Series(np.random.randn(5), index=list(range(0, 10, 2)))
ser1

0   -1.058447
2    0.157270
4   -0.821298
6   -0.432927
8   -0.937127
dtype: float64

In [204]:
ser1.iloc[:3]

0   -1.058447
2    0.157270
4   -0.821298
dtype: float64

In [205]:
ser1.iloc[3]

-0.4329271060148354

In [206]:
df1 = pd.DataFrame( np.random.randn(5, 4), index=list(range(0, 10, 2)),
columns=list(range(0, 8, 2)))
df1

Unnamed: 0,0,2,4,6
0,-0.958493,-0.588519,-0.274708,0.031265
2,0.087331,0.190744,0.48798,-0.596437
4,-0.178229,-0.028932,0.667267,0.41436
6,0.035945,-0.782341,-0.553934,-0.684415
8,0.303719,-0.286138,0.326047,-0.274953


In [207]:
df1.iloc[:2]

Unnamed: 0,0,2,4,6
0,-0.958493,-0.588519,-0.274708,0.031265
2,0.087331,0.190744,0.48798,-0.596437


In [208]:
df1.iloc[1:3, 0:3]

Unnamed: 0,0,2,4
2,0.087331,0.190744,0.48798
4,-0.178229,-0.028932,0.667267


In [209]:
df1.iloc[[0, 2, 3], [1, 3]]

Unnamed: 0,2,6
0,-0.588519,0.031265
4,-0.028932,0.41436
6,-0.782341,-0.684415


In [210]:
df1.iloc[1]

0    0.087331
2    0.190744
4    0.487980
6   -0.596437
Name: 2, dtype: float64

In [211]:
df1 = pd.DataFrame(np.random.randn(5, 4), index=list('abcde'), columns=list('ABCD'))
df1

Unnamed: 0,A,B,C,D
a,-0.96458,-1.142675,1.104248,-1.803831
b,0.786939,-0.218394,-2.782687,0.116504
c,0.351728,-0.693599,1.600629,-0.777265
d,2.155382,-0.89643,0.110466,-1.030414
e,0.338592,-0.309232,-0.309803,-0.630078


In [212]:
df1.loc[lambda df: df.A>0, :]

Unnamed: 0,A,B,C,D
b,0.786939,-0.218394,-2.782687,0.116504
c,0.351728,-0.693599,1.600629,-0.777265
d,2.155382,-0.89643,0.110466,-1.030414
e,0.338592,-0.309232,-0.309803,-0.630078


In [213]:
df1.loc[:, lambda df: ['A', 'B']]

Unnamed: 0,A,B
a,-0.96458,-1.142675
b,0.786939,-0.218394
c,0.351728,-0.693599
d,2.155382,-0.89643
e,0.338592,-0.309232


In [214]:
df1.iloc[:, lambda df: [0, 1]]

Unnamed: 0,A,B
a,-0.96458,-1.142675
b,0.786939,-0.218394
c,0.351728,-0.693599
d,2.155382,-0.89643
e,0.338592,-0.309232


In [215]:
df1[lambda df: df.columns[0]]

a   -0.964580
b    0.786939
c    0.351728
d    2.155382
e    0.338592
Name: A, dtype: float64

In [216]:
df1.A.loc[lambda ser: ser > 0]

b    0.786939
c    0.351728
d    2.155382
e    0.338592
Name: A, dtype: float64

In [217]:
ser = pd.Series(np.arange(3))
ser

0    0
1    1
2    2
dtype: int32

In [218]:
ser[5] = 7
ser

0    0
1    1
2    2
5    7
dtype: int64

In [219]:
df = pd.DataFrame(np.arange(9).reshape(3, 3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,0,1,2
1,3,4,5
2,6,7,8


In [220]:
df.loc[:, 'D'] = df.loc[:, 'A']

In [221]:
df

Unnamed: 0,A,B,C,D
0,0,1,2,0
1,3,4,5,3
2,6,7,8,6


In [222]:
df.loc[3] = 7
df

Unnamed: 0,A,B,C,D
0,0,1,2,0
1,3,4,5,3
2,6,7,8,6
3,7,7,7,7


In [223]:
ser.iat[3]
ser.at[5]
df.at[3, 'E'] = 7
df.iat[3, 0] = 2
df

Unnamed: 0,A,B,C,D,E
0,0,1,2,0,
1,3,4,5,3,
2,6,7,8,6,
3,2,7,7,7,7.0


In [224]:
ser = pd.Series(range(-3, 3))
ser

0   -3
1   -2
2   -1
3    0
4    1
5    2
dtype: int64

In [225]:
ser[ser > 0]

4    1
5    2
dtype: int64

In [226]:
ser[(ser < -1) | (ser > 1)]

0   -3
1   -2
5    2
dtype: int64

In [227]:
ser[~(ser < 2)]

5    2
dtype: int64

In [228]:
df[df['A'] < 3]

Unnamed: 0,A,B,C,D,E
0,0,1,2,0,
3,2,7,7,7,7.0


In [229]:
ser[::-1].isin([-3, -1, 2])

5     True
4    False
3    False
2     True
1    False
0     True
dtype: bool

In [230]:
ser[ser[::-1].isin([-3, -1, 2])]

0   -3
2   -1
5    2
dtype: int64

In [231]:
ser.index.isin([2, 4, 6])

array([False, False,  True, False,  True, False])

In [232]:
ser[ser.index.isin([2, 4, 6])]

2   -1
4    1
dtype: int64

In [233]:
df = pd.DataFrame({'no': [1, 2, 3], 'ha': ['a', 'b', 'c'], 'hi': ['m', 'n', 'o']})
val = ['a', 'n', 1, 3]
df

Unnamed: 0,no,ha,hi
0,1,a,m
1,2,b,n
2,3,c,o


In [234]:
df.isin(val)

Unnamed: 0,no,ha,hi
0,True,True,False
1,False,False,True
2,True,False,False


In [235]:
val = {'ha': ['a', 'c'], 'no': [1, 2]}
df.isin(val)

Unnamed: 0,no,ha,hi
0,True,True,False
1,True,False,False
2,False,True,False


In [236]:
val = {'ha': ['a', 'c'], 'hi': ['m', 'o'], 'no': [1, 2]}
mask = df.isin(val).all(1)

In [237]:
df[mask]

Unnamed: 0,no,ha,hi
0,1,a,m


In [238]:
index = pd.Index(np.random.randint(0, 1000, 6))
index

Int64Index([702, 485, 470, 493, 220, 442], dtype='int64')

In [239]:
positions = [0, 2, 5]
index[positions]


Int64Index([702, 470, 442], dtype='int64')

In [240]:
index.take(positions)

Int64Index([702, 470, 442], dtype='int64')

In [241]:
ser = pd.Series(np.random.randn(10))
ser.iloc[positions]

0   -1.800911
2    1.351300
5    1.461708
dtype: float64

In [242]:
ser.take(positions)

0   -1.800911
2    1.351300
5    1.461708
dtype: float64

In [243]:
df = pd.DataFrame(np.random.randn(5, 3))
df.take([1, 4, 3])

Unnamed: 0,0,1,2
1,-0.383912,-1.395507,-0.347213
4,-0.154255,0.612329,1.00643
3,-0.867623,-0.985242,-1.134114


In [244]:
df.take([0, 2], axis=1)

Unnamed: 0,0,2
0,2.097273,-0.998531
1,-0.383912,-0.347213
2,-0.309686,0.106538
3,-0.867623,-1.134114
4,-0.154255,1.00643


In [245]:
d = {'one': [1.5, 2.2, -3.0], 'two': [1.0, -1.2, 5.0], 'three': [-1.1, 2.0, 4.0]}
df = pd.DataFrame(d, index = ['a', 'c', 'f'])
df['four'] = 'ha'
df['five'] = df['one'] > 0
df

Unnamed: 0,one,two,three,four,five
a,1.5,1.0,-1.1,ha,True
c,2.2,-1.2,2.0,ha,True
f,-3.0,5.0,4.0,ha,False


In [246]:
df1 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f'])
df1

Unnamed: 0,one,two,three,four,five
a,1.5,1.0,-1.1,ha,True
b,,,,,
c,2.2,-1.2,2.0,ha,True
d,,,,,
e,,,,,
f,-3.0,5.0,4.0,ha,False


In [247]:
df1['one']

a    1.5
b    NaN
c    2.2
d    NaN
e    NaN
f   -3.0
Name: one, dtype: float64

In [248]:
pd.isna(df1['one'])

a    False
b     True
c    False
d     True
e     True
f    False
Name: one, dtype: bool

In [249]:
df1['four'].notna()

a     True
b    False
c     True
d    False
e    False
f     True
Name: four, dtype: bool

In [250]:
None == None

True

In [251]:
np.nan == np.nan

False

In [252]:
d1 = {'one': [1.0, 2.0, 3.0], 'two': [4.0, 5.0, 6.0]}

In [253]:
df1 = pd.DataFrame(d1, index = ['a', 'b', 'c'])
df2 = df1.copy()
df2.loc['d'] = np.nan
df2['three'] = 2.0
df2.iloc[1:2, 1:2] = np.nan
df1

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,5.0
c,3.0,6.0


In [254]:
df2

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,,2.0
c,3.0,6.0,2.0
d,,,2.0


In [255]:
df1 + df2

Unnamed: 0,one,three,two
a,2.0,,8.0
b,4.0,,
c,6.0,,12.0
d,,,


In [256]:
pd.Series([np.nan]).sum()

0.0

In [257]:
pd.Series([], dtype=object).sum()

0

In [258]:
pd.Series([np.nan]).prod()

1.0

In [259]:
pd.Series([], dtype=object).prod()

1

In [260]:
df2

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,,2.0
c,3.0,6.0,2.0
d,,,2.0


In [261]:
df2.groupby('two').mean()

Unnamed: 0_level_0,one,three
two,Unnamed: 1_level_1,Unnamed: 2_level_1
4.0,1.0,2.0
6.0,3.0,2.0


In [262]:
df2.fillna(0)

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,0.0,2.0
c,3.0,6.0,2.0
d,0.0,0.0,2.0


In [263]:
df2['one'].fillna('missing')

a          1
b          2
c          3
d    missing
Name: one, dtype: object

In [264]:
df2

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,,2.0
c,3.0,6.0,2.0
d,,,2.0


In [265]:
df2.fillna(method='pad')

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,4.0,2.0
c,3.0,6.0,2.0
d,3.0,6.0,2.0


In [266]:
df2.loc['c', 'three'] = np.nan
df2

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,,2.0
c,3.0,6.0,
d,,,2.0


In [267]:
df2.mean()

one      2.0
two      5.0
three    2.0
dtype: float64

In [268]:
df2.fillna(df2.mean())

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,5.0,2.0
c,3.0,6.0,2.0
d,2.0,5.0,2.0


In [None]:
df = pd.DataFrame([ [np.nan, 2, 0, np.nan], [3, 4, np.nan, 1], [np.nan, 5, np.nan, 2],
[np.nan, 1, 2, 3]], columns=list('ABCD'))
df
df.fillna(0)
df.fillna(method='ffill')
val = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=val)
df.fillna(value=val, limit=1)

In [269]:
df2.iloc[2:3, 2:3] = 2.0
df2

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
b,2.0,,2.0
c,3.0,6.0,2.0
d,,,2.0


In [270]:
df2.dropna(axis=0)

Unnamed: 0,one,two,three
a,1.0,4.0,2.0
c,3.0,6.0,2.0


In [271]:
df2.dropna(axis=1)

Unnamed: 0,three
a,2.0
b,2.0
c,2.0
d,2.0


In [272]:
df2['two'].dropna()

a    4.0
c    6.0
Name: two, dtype: float64

In [None]:
df = pd.DataFrame({'name': ['haena', 'suho', 'naeun'],
'hobby': ['jogging', 'reading', np.nan],
'born': [pd.NaT, pd.Timestamp('2001-01-01'), pd.NaT]})
df
df.dropna()
df.dropna(axis='columns')
df.drpopna(how='all')
df.dropna(thresh=2)
df.dropna(subset=['name', 'born'])
df.dropna(subset=['hobby'])
df.dropna(inplace=True)
df

In [273]:
ser = pd.Series([0, np.nan, 2, 3, 5])
ser

0    0.0
1    NaN
2    2.0
3    3.0
4    5.0
dtype: float64

In [274]:
ser.replace(np.nan, 1.0)

0    0.0
1    1.0
2    2.0
3    3.0
4    5.0
dtype: float64

In [275]:
ser.replace({np.nan: 1, 5: 4})

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [276]:
ser.replace([0, 2], 1)

0    1.0
1    NaN
2    1.0
3    3.0
4    5.0
dtype: float64

In [277]:
ser.replace([np.nan, 5], [1, np.nan])

0    0.0
1    1.0
2    2.0
3    3.0
4    NaN
dtype: float64

In [278]:
df = pd.DataFrame({'A': [0, 1, np.nan], 'B': [3, 4, 5]})
df.replace({'A': np.nan, 'B': 3}, 10)

Unnamed: 0,A,B
0,0.0,10
1,1.0,4
2,10.0,5


In [279]:
######################################### 4.3.4.

li = [['ha', 'ha', 'hi', 'hi', 'ho', 'ho'], ['one', 'two', 'one', 'two', 'one', 'two']]
li1 = list(zip(*li))
li1

[('ha', 'one'),
 ('ha', 'two'),
 ('hi', 'one'),
 ('hi', 'two'),
 ('ho', 'one'),
 ('ho', 'two')]

In [280]:
ind = pd.MultiIndex.from_tuples(li1, names=['1st', '2nd'])
ind

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           names=['1st', '2nd'])

In [281]:
ser = pd.Series(np.random.randn(6), index=ind)
ser

1st  2nd
ha   one   -0.339264
     two   -0.615008
hi   one   -0.269377
     two    0.187064
ho   one   -0.652559
     two    0.178951
dtype: float64

In [282]:
iter = [['ha', 'hi', 'ho'], ['one', 'two']]
pd.MultiIndex.from_product(iter, names=['1st', '2nd'])

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('hi', 'one'),
            ('hi', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           names=['1st', '2nd'])

In [283]:
df = pd.DataFrame([['ha', 'one'], ['ha', 'two'], ['ho', 'one'], ['ho', 'two']],
columns=['1st', '2nd'])
pd.MultiIndex.from_frame(df)

MultiIndex([('ha', 'one'),
            ('ha', 'two'),
            ('ho', 'one'),
            ('ho', 'two')],
           names=['1st', '2nd'])

In [284]:
arr = [ np.array(['ha', 'ha', 'hi', 'hi', 'ho', 'ho']), np.array(['one', 'two', 'one',
'two', 'one', 'two'])]
ser = pd.Series(np.random.randn(6), index=arr)
ser

ha  one    1.338737
    two    1.077762
hi  one    1.539513
    two   -0.106579
ho  one   -1.090840
    two    0.370244
dtype: float64

In [285]:
df = pd.DataFrame(np.random.randn(6, 3), index=arr)
df

Unnamed: 0,Unnamed: 1,0,1,2
ha,one,0.650021,0.349582,-0.85582
ha,two,-1.101569,-0.276073,1.517586
hi,one,0.376528,0.211025,-0.864834
hi,two,-0.446479,-0.196813,-2.008206
ho,one,0.27292,-0.146606,-0.949356
ho,two,1.864464,0.6721,0.680391


In [286]:
df = pd.DataFrame(np.random.randn(3, 6), index=['A', 'B', 'C'], columns=ind)
df

1st,ha,ha,hi,hi,ho,ho
2nd,one,two,one,two,one,two
A,1.588027,-1.258368,-0.097677,0.00288,0.769906,-0.791949
B,-2.613225,-1.097652,1.599259,-0.319328,-1.237801,1.710869
C,-1.317432,0.636064,-1.46188,-0.765002,1.195946,0.103484


In [287]:
df['ha']

2nd,one,two
A,1.588027,-1.258368
B,-2.613225,-1.097652
C,-1.317432,0.636064


In [288]:
df['ha']['one']

A    1.588027
B   -2.613225
C   -1.317432
Name: one, dtype: float64

In [289]:
ser.reindex(ind[:3])

1st  2nd
ha   one    1.338737
     two    1.077762
hi   one    1.539513
dtype: float64

In [290]:
ser.reindex([('ho', 'one'), ('ha', 'two')])

ho  one   -1.090840
ha  two    1.077762
dtype: float64

In [291]:
df = df.T
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ha,one,1.588027,-2.613225,-1.317432
ha,two,-1.258368,-1.097652,0.636064
hi,one,-0.097677,1.599259,-1.46188
hi,two,0.00288,-0.319328,-0.765002
ho,one,0.769906,-1.237801,1.195946
ho,two,-0.791949,1.710869,0.103484


In [292]:
df.loc[('ha', 'two')]

A   -1.258368
B   -1.097652
C    0.636064
Name: (ha, two), dtype: float64

In [293]:
df.loc[('ha', 'two'), 'A']
df.loc['ha']

Unnamed: 0_level_0,A,B,C
2nd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1.588027,-2.613225,-1.317432
two,-1.258368,-1.097652,0.636064


In [294]:
df.loc['ha':'hi']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ha,one,1.588027,-2.613225,-1.317432
ha,two,-1.258368,-1.097652,0.636064
hi,one,-0.097677,1.599259,-1.46188
hi,two,0.00288,-0.319328,-0.765002


In [295]:
df.loc[('hi', 'two'):('ho', 'one')]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hi,two,0.00288,-0.319328,-0.765002
ho,one,0.769906,-1.237801,1.195946


In [296]:
df.loc[('hi', 'two'):'ho']

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hi,two,0.00288,-0.319328,-0.765002
ho,one,0.769906,-1.237801,1.195946
ho,two,-0.791949,1.710869,0.103484


In [297]:
df.loc[[('ha', 'two'), ('ho', 'one')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ha,two,-1.258368,-1.097652,0.636064
ho,one,0.769906,-1.237801,1.195946


In [298]:
li1

[('ha', 'one'),
 ('ha', 'two'),
 ('hi', 'one'),
 ('hi', 'two'),
 ('ho', 'one'),
 ('ho', 'two')]

In [299]:
np.random.shuffle(li1)
li1

[('ho', 'one'),
 ('ha', 'two'),
 ('ho', 'two'),
 ('hi', 'one'),
 ('ha', 'one'),
 ('hi', 'two')]

In [300]:
ser = pd.Series(np.random.randn(6), index=pd.MultiIndex.from_tuples(li1))
ser


ho  one    1.249362
ha  two    0.409203
ho  two    1.100604
hi  one   -0.441275
ha  one    0.384636
hi  two    2.083151
dtype: float64

In [301]:
ser.sort_index()

ha  one    0.384636
    two    0.409203
hi  one   -0.441275
    two    2.083151
ho  one    1.249362
    two    1.100604
dtype: float64

In [302]:
ser.sort_index(level=0)

ha  one    0.384636
    two    0.409203
hi  one   -0.441275
    two    2.083151
ho  one    1.249362
    two    1.100604
dtype: float64

In [303]:
ser.sort_index(level=1)

ha  one    0.384636
hi  one   -0.441275
ho  one    1.249362
ha  two    0.409203
hi  two    2.083151
ho  two    1.100604
dtype: float64

In [304]:
ser.index.set_names(['1st', '2nd'], inplace=True)
ser.sort_index(level='1st')

1st  2nd
ha   one    0.384636
     two    0.409203
hi   one   -0.441275
     two    2.083151
ho   one    1.249362
     two    1.100604
dtype: float64

In [305]:
ser.sort_index(level='2nd')

1st  2nd
ha   one    0.384636
hi   one   -0.441275
ho   one    1.249362
ha   two    0.409203
hi   two    2.083151
ho   two    1.100604
dtype: float64

In [306]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
1st,2nd,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ha,one,1.588027,-2.613225,-1.317432
ha,two,-1.258368,-1.097652,0.636064
hi,one,-0.097677,1.599259,-1.46188
hi,two,0.00288,-0.319328,-0.765002
ho,one,0.769906,-1.237801,1.195946
ho,two,-0.791949,1.710869,0.103484


In [307]:
df.T.sort_index(level=1, axis=1)

1st,ha,hi,ho,ha,hi,ho
2nd,one,one,one,two,two,two
A,1.588027,-0.097677,0.769906,-1.258368,0.00288,-0.791949
B,-2.613225,1.599259,-1.237801,-1.097652,-0.319328,1.710869
C,-1.317432,-1.46188,1.195946,0.636064,-0.765002,0.103484


In [308]:
###############################################################################


# 4.4.1 ##
###


In [309]:
data = {'name': ['haena', 'naeun', 'una', 'bum', 'suho'],
'age': [30, 27, 28, 23, 18],'address': ['dogok', 'suwon', 'mapo', 'ilsan', 'yeoyi'],
'grade': ['A', 'B', 'C', 'B', 'A'],
'score': [100, 88, 73, 83, 95]}
df = pd.DataFrame(data, columns=['name', 'age', 'address', 'score', 'grade'])
df

Unnamed: 0,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [310]:
df.to_csv('student_grade.csv')

In [311]:
!type student_grade.csv

,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [312]:
df1=pd.read_csv('student_grade.csv')
df1

Unnamed: 0.1,Unnamed: 0,name,age,address,score,grade
0,0,haena,30,dogok,100,A
1,1,naeun,27,suwon,88,B
2,2,una,28,mapo,73,C
3,3,bum,23,ilsan,83,B
4,4,suho,18,yeoyi,95,A


In [313]:
df1 = df1.iloc[0:5, 1:6]
df1

Unnamed: 0,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [314]:
df.to_csv('student_grade.csv')
df2 = pd.read_csv('student_grade.csv', header=None, nrows=3)
df2

Unnamed: 0,0,1,2,3,4,5
0,,name,age,address,score,grade
1,0.0,haena,30,dogok,100,A
2,1.0,naeun,27,suwon,88,B


In [315]:
df2 = pd.read_csv('student_grade.csv', index_col=0)
df2

Unnamed: 0,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [316]:
df2 = pd.read_csv('student_grade.csv', index_col=['name'])
df2

Unnamed: 0_level_0,Unnamed: 0,age,address,score,grade
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
haena,0,30,dogok,100,A
naeun,1,27,suwon,88,B
una,2,28,mapo,73,C
bum,3,23,ilsan,83,B
suho,4,18,yeoyi,95,A


In [317]:
df2 = pd.read_csv('student_grade.csv', names=['No', 'name', 'age', 'address', 'score',
'grade'], nrows=3)
df2

Unnamed: 0,No,name,age,address,score,grade
0,,name,age,address,score,grade
1,0.0,haena,30,dogok,100,A
2,1.0,naeun,27,suwon,88,B


In [318]:
to_na = {'address': ['mapo', 'NA'], 'score': [83]}
df2 = pd.read_csv('student_grade.csv', na_values=to_na)
df2

Unnamed: 0.1,Unnamed: 0,name,age,address,score,grade
0,0,haena,30,dogok,100.0,A
1,1,naeun,27,suwon,88.0,B
2,2,una,28,,73.0,C
3,3,bum,23,ilsan,,B
4,4,suho,18,yeoyi,95.0,A


In [319]:
df2 = pd.read_csv('student_grade.csv', skiprows=3)
df2

Unnamed: 0,2,una,28,mapo,73,C
0,3,bum,23,ilsan,83,B
1,4,suho,18,yeoyi,95,A


In [320]:
!type student_grade1.csv

name|age|address|score|grade
haena|30|dogok|100|A
naeun|27|suwon|88|B
una|28|mapo|73|C
bum|23|ilsan|83|B
suho|18|yeoyi|95|A


In [321]:
pd.read_csv('student_grade1.csv', sep='|')

Unnamed: 0,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [322]:
dfj = pd.DataFrame([[ 'a', 'b'], ['c', 'd']], index=['row1', 'row2'], columns=['col1',
'col2'])
dfj.to_json()

'{"col1":{"row1":"a","row2":"c"},"col2":{"row1":"b","row2":"d"}}'

In [323]:
dfj.to_json(orient='split')

'{"columns":["col1","col2"],"index":["row1","row2"],"data":[["a","b"],["c","d"]]}'

In [324]:
dfj.to_json(orient='records')

'[{"col1":"a","col2":"b"},{"col1":"c","col2":"d"}]'

In [325]:
dfj.to_json(orient='index')

'{"row1":{"col1":"a","col2":"b"},"row2":{"col1":"c","col2":"d"}}'

In [326]:
dfj.to_json(orient='columns')

'{"col1":{"row1":"a","row2":"c"},"col2":{"row1":"b","row2":"d"}}'

In [327]:
dfj.to_json(orient='values')

'[["a","b"],["c","d"]]'

In [328]:
dfj.to_json(orient='table')

'{"schema":{"fields":[{"name":"index","type":"string"},{"name":"col1","type":"string"},{"name":"col2","type":"string"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":"row1","col1":"a","col2":"b"},{"index":"row2","col1":"c","col2":"d"}]}'

In [329]:
df = pd.DataFrame(data, columns=['name', 'age', 'address', 'score', 'grade'])
df.to_json('student_grade.json')

In [330]:
pd.read_json('student_grade.json')

Unnamed: 0,name,age,address,score,grade
0,haena,30,dogok,100,A
1,naeun,27,suwon,88,B
2,una,28,mapo,73,C
3,bum,23,ilsan,83,B
4,suho,18,yeoyi,95,A


In [331]:
df = pd.DataFrame({'ha': [1, 2, 3, 4],
'hi': ['a', 'b', 'c', 'd'],
'ho': pd.date_range('2021-09-01', freq='d', periods=4),
                   'hu': pd.Categorical(['a', 'b', 'c', 'd'])},
index=pd.Index(range(4), name='ind'));df

Unnamed: 0_level_0,ha,hi,ho,hu
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,a,2021-09-01,a
1,2,b,2021-09-02,b
2,3,c,2021-09-03,c
3,4,d,2021-09-04,d


In [332]:
df.dtypes

ha             int64
hi            object
ho    datetime64[ns]
hu          category
dtype: object

In [333]:
df.to_json('hello.json', orient='table')
dfj = pd.read_json('hello.json', orient='table')
dfj

Unnamed: 0_level_0,ha,hi,ho,hu
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,a,2021-09-01,a
1,2,b,2021-09-02,b
2,3,c,2021-09-03,c
3,4,d,2021-09-04,d


In [334]:
dfj.dtypes

ha             int64
hi            object
ho    datetime64[ns]
hu          category
dtype: object

In [335]:
url = 'https://finance.yahoo.com/quote/NNOX/profile?p=NNOX'
dfh = pd.read_html(url)
dfh

[                             Name                     Title  Pay  Exercised  \
 0               Mr. Ran Poliakine   Founder, Chairman & CEO  NaN        NaN   
 1               Mr. Itzhak Maayan   Chief Financial Officer  NaN        NaN   
 2               Mr. James M. Dara   Chief Operating Officer  NaN        NaN   
 3                  Mr. Ofir Koren  Chief Technology Officer  NaN        NaN   
 4                   Mr. Tal Shank        VP of Corp. Devel.  NaN        NaN   
 5                 Ms. Anat Kaphan   VP of Product Marketing  NaN        NaN   
 6          Ms. Tamar Aharon Cohen   Chief Marketing Officer  NaN        NaN   
 7  Ms. Shirly Kaufman-Kirshenbaum                  VP of HR  NaN        NaN   
 8                  Mr. Gilad Yron        Chief Bus. Officer  NaN        NaN   
 9               Ms. Lydia Edwards          Pres of Nanox US  NaN        NaN   
 
    Year Born  
 0     1968.0  
 1     1966.0  
 2     1970.0  
 3     1970.0  
 4     1978.0  
 5     1970.0  
 6    

In [336]:
df = pd.DataFrame(np.random.randn(2, 2))
df

Unnamed: 0,0,1
0,1.098443,-3.070569
1,0.206129,0.921344


In [337]:
print(df.to_html())

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>1.098443</td>
      <td>-3.070569</td>
    </tr>
    <tr>
      <th>1</th>
      <td>0.206129</td>
      <td>0.921344</td>
    </tr>
  </tbody>
</table>


In [338]:
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
df.to_clipboard(sep=',', index=False)
pd.read_clipboard()


Unnamed: 0,"A,B,C"
0,123
1,456


In [339]:
df1 = pd.read_excel('shoppingcenter.xlsx')
df2 = pd.read_excel('shoppingcenter.xlsx')
df = pd.read_excel('shoppingcenter.xlsx')
df

Unnamed: 0,광역시도,시군구,업종대분류,업종중분류,1년미만,1~2년,2~3년,3~5년,5년 이상
0,서울특별시,종로구,관광/여가/오락,연극/영화/극장,1.0,2.0,42.0,39.0,16.0
1,서울특별시,종로구,관광/여가/오락,전시/관람,0.0,6.0,18.0,35.0,44.0
2,서울특별시,종로구,관광/여가/오락,PC/오락/당구/볼링등,0.0,5.0,12.0,86.0,20.0
3,서울특별시,종로구,관광/여가/오락,경마/경륜/성인오락,0.0,1.0,1.0,1.0,2.0
4,서울특별시,종로구,관광/여가/오락,스포츠/운동,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
20620,제주특별자치도,서귀포시,학문/교육,학원-예능취미체육,5.0,1.0,1.0,0.0,22.0
20621,제주특별자치도,서귀포시,학문/교육,학원-보습교습입시,6.0,2.0,3.0,47.0,18.0
20622,제주특별자치도,서귀포시,학문/교육,학원기타,8.0,1.0,17.0,53.0,28.0
20623,제주특별자치도,서귀포시,학문/교육,유아교육,0.0,1.0,7.0,5.0,115.0


In [340]:
df1 = pd.read_excel('shoppingcenter.xlsx', sheet_name='2018년 하반기 업력현황')
df1

Unnamed: 0,광역시도,시군구,업종대분류,업종중분류,1년미만,1~2년,2~3년,3~5년,5년 이상
0,서울특별시,종로구,관광/여가/오락,연극/영화/극장,1,1,16,64,19
1,서울특별시,종로구,관광/여가/오락,전시/관람,0,0,13,45,46
2,서울특별시,종로구,관광/여가/오락,PC/오락/당구/볼링등,0,3,9,68,43
3,서울특별시,종로구,관광/여가/오락,경마/경륜/성인오락,0,1,2,0,3
4,서울특별시,종로구,관광/여가/오락,스포츠/운동,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
20620,제주특별자치도,서귀포시,학문/교육,학원-예능취미체육,2,3,1,1,22
20621,제주특별자치도,서귀포시,학문/교육,학원-보습교습입시,3,4,3,47,18
20622,제주특별자치도,서귀포시,학문/교육,학원기타,4,4,6,64,28
20623,제주특별자치도,서귀포시,학문/교육,유아교육,0,0,3,9,115


In [341]:
df.to_excel('file_with_path.xlsx', sheet_name='Sheet1')

In [342]:
hfs = pd.HDFStore('store.h5')
hfs

<class 'pandas.io.pytables.HDFStore'>
File path: store.h5

In [343]:
ind = pd.date_range('1/1/2021', periods=8)
ser = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(8, 3), index=ind, columns=['A', 'B', 'C'])
country = ['KOR', 'US', 'ITALY']
mind = pd.MultiIndex.from_product([country, ind])
col = ["item_%d" % i for i in range(1, 4)]
data = np.random.randn(24, 3)

In [344]:
df1 = pd.DataFrame(data, index=mind, columns=col)
df1

Unnamed: 0,Unnamed: 1,item_1,item_2,item_3
KOR,2021-01-01,0.386394,-0.977768,-1.993946
KOR,2021-01-02,-0.19989,-0.243835,0.337446
KOR,2021-01-03,0.009669,2.778919,-0.134262
KOR,2021-01-04,0.482102,0.159601,-0.350513
KOR,2021-01-05,0.646647,2.08411,-0.215803
KOR,2021-01-06,-1.624177,-1.189844,-1.365084
KOR,2021-01-07,-1.364114,0.373186,-0.097014
KOR,2021-01-08,1.044384,1.811946,0.02979
US,2021-01-01,-0.06219,-0.26236,-0.000571
US,2021-01-02,-0.429171,-0.11471,-1.153265


In [345]:
hfs['ser'] = ser
hfs['df'] = df
hfs['df1'] = df1
hfs

<class 'pandas.io.pytables.HDFStore'>
File path: store.h5

In [346]:
hfs['df']

Unnamed: 0,A,B,C
2021-01-01,-2.194225,1.334529,1.079093
2021-01-02,-1.240425,0.607491,1.107732
2021-01-03,-1.845978,1.199316,0.230913
2021-01-04,-0.57374,-0.970969,0.504786
2021-01-05,0.46787,1.713269,-1.246676
2021-01-06,1.256595,-0.912099,-0.682193
2021-01-07,-0.002263,-1.743175,-1.043452
2021-01-08,-0.655988,-0.127559,-0.572072


In [347]:
del hfs['df1']

In [348]:
#hfs['df1']

In [349]:
hfs.close()

In [350]:
#hfs['ser']
#hfs.df

In [351]:
df_s = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))})
df_s.to_hdf('hfs_s.h5', 'table', append=True)
pd.read_hdf('hfs_s.h5', 'table', where=['index>2'])

Unnamed: 0,A,B
3,3,3
4,4,4
3,3,3
4,4,4


In [352]:
#from sqlalchemy import create_engine
#engine = create_engine('postgresql://scott:tiger@localhost:5432/mydb')
# sqlite://<nohostname>/<path>
# where <path> is relative:
#engine = create_engine('sqlite:///jin.db')
#engine = create_engine('sqlite://')
#engine = create_engine('sqlite:///:memory:')

In [359]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')

In [360]:
df2  = pd.read_csv('student_grade.csv', index_col=0) 
df2.to_sql('data_db', engine)
df2.to_sql('data_db1', engine, chunksize=1000)

In [361]:
pd.read_sql_table('data_db', engine)

Unnamed: 0,index,name,age,address,score,grade
0,0,haena,30,dogok,100,A
1,1,naeun,27,suwon,88,B
2,2,una,28,mapo,73,C
3,3,bum,23,ilsan,83,B
4,4,suho,18,yeoyi,95,A


In [362]:
pd.read_sql_table('data_db', engine, index_col='name')

Unnamed: 0_level_0,index,age,address,score,grade
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
haena,0,30,dogok,100,A
naeun,1,27,suwon,88,B
una,2,28,mapo,73,C
bum,3,23,ilsan,83,B
suho,4,18,yeoyi,95,A


In [363]:
pd.read_sql_table('data_db', engine, columns=['name', 'grade'])

Unnamed: 0,name,grade
0,haena,A
1,naeun,B
2,una,C
3,bum,B
4,suho,A


In [364]:
pd.read_sql_query('SELECT * FROM data_db', engine)

Unnamed: 0,index,name,age,address,score,grade
0,0,haena,30,dogok,100,A
1,1,naeun,27,suwon,88,B
2,2,una,28,mapo,73,C
3,3,bum,23,ilsan,83,B
4,4,suho,18,yeoyi,95,A


In [365]:
pd.read_sql_query("SELECT name, address FROM data_db WHERE grade='A';", engine)

Unnamed: 0,name,address
0,haena,dogok
1,suho,yeoyi


In [366]:
dfc = pd.DataFrame(np.random.randn(9, 3), columns=list('abc'))
dfc.to_sql('data_ck', engine, index=False)
for chunk in pd.read_sql_query("SELECT * FROM data_ck", engine, chunksize=3):
    print(chunk)

          a         b         c
0 -0.733973 -0.919681 -0.116925
1  0.351232 -0.376313 -0.983268
2 -0.199314 -1.068167 -0.264015
          a         b         c
0  0.230098  0.676113 -0.046839
1  0.129894  0.307725  0.971459
2  1.460186  1.152541 -0.906164
          a         b         c
0  1.872970  0.699920 -1.106343
1  0.545566  1.396238 -1.129041
2 -0.451837  0.448178 -0.203647


In [367]:
df = pd.read_csv('air_test.csv')
df

Unnamed: 0,site_code,test_no,average
0,104,1,0.008
1,102,8,101.000
2,102,9,64.000
3,102,6,0.071
4,102,1,0.005
...,...,...,...
291595,116,3,0.021
291596,116,8,19.000
291597,124,6,0.014
291598,110,9,11.000


In [372]:
df_chunk = pd.read_csv('air_test.csv', chunksize=10000)
df_chunk

<pandas.io.parsers.TextFileReader at 0x2171db746a0>

In [369]:
#for chunk in df_chunk:
#    print(chunk)

      site_code  test_no  average
0           104        1    0.008
1           102        8  101.000
2           102        9   64.000
3           102        6    0.071
4           102        1    0.005
...         ...      ...      ...
9995        107        3    0.065
9996        125        9   98.000
9997        125        8  137.000
9998        125        6    0.029
9999        125        5    0.900

[10000 rows x 3 columns]
       site_code  test_no  average
10000        125        1    0.005
10001        125        3    0.050
10002        109        9   84.000
10003        109        8  114.000
10004        109        6    0.026
...          ...      ...      ...
19995        106        6    0.005
19996        106        5    0.800
19997        106        1    0.005
19998        106        3    0.058
19999        110        9   38.000

[10000 rows x 3 columns]
       site_code  test_no  average
20000        110        8   51.000
20001        110        6    0.005
20002        11

In [370]:
type(chunk)

pandas.core.frame.DataFrame

In [373]:
pd.concat(df_chunk, ignore_index=True)

Unnamed: 0,site_code,test_no,average
0,104,1,0.008
1,102,8,101.000
2,102,9,64.000
3,102,6,0.071
4,102,1,0.005
...,...,...,...
291595,116,3,0.021
291596,116,8,19.000
291597,124,6,0.014
291598,110,9,11.000
