### 1. Series

In [1]:
import numpy as np
import pandas as pd
s1 = pd.Series(np.random.randint(10,size=5))
s1

0    0
1    8
2    3
3    8
4    2
dtype: int64

In [2]:
type(s1)

pandas.core.series.Series

In [3]:
s1[0]

0

In [4]:
s2 = pd.Series(np.random.randint(10,size=5), index=['a','b','c','d','e'])
s2

a    8
b    2
c    7
d    3
e    5
dtype: int64

In [5]:
s2[0]

8

In [6]:
s2['a']

8

In [7]:
s3 = pd.Series({'a':21,'b':31,'c':45,'d':42,'e':12})
s3

a    21
b    31
c    45
d    42
e    12
dtype: int64

In [8]:
s3['b']

31

In [9]:
s4 = pd.Series(3, index=[0,1,2,3,4,5])
s4

0    3
1    3
2    3
3    3
4    3
5    3
dtype: int64

In [12]:
s3['b':'c']

b    31
c    45
dtype: int64

In [14]:
s5 = pd.Series(np.random.randint(10,size=10), 
               index=['hong', 'hoa', 'hue', 'tham','tuoi','nhat','nheo','tam','chin','muoi'])
s5

hong    0
hoa     0
hue     9
tham    3
tuoi    6
nhat    3
nheo    8
tam     5
chin    9
muoi    2
dtype: int64

In [15]:
s5['tham':'tam']

tham    3
tuoi    6
nhat    3
nheo    8
tam     5
dtype: int64

In [17]:
s5[3:8]

tham    3
tuoi    6
nhat    3
nheo    8
tam     5
dtype: int64

In [18]:
s5[['hoa','tham','tam']]

hoa     0
tham    3
tam     5
dtype: int64

In [19]:
s5.median()

4.0

In [20]:
s5[s5>4.0]

hue     9
tuoi    6
nheo    8
tam     5
chin    9
dtype: int64

In [21]:
s5[s5%2==0]

hong    0
hoa     0
tuoi    6
nheo    8
muoi    2
dtype: int64

In [22]:
s5.max()

9

In [23]:
t = s5>s5.median()
t

hong    False
hoa     False
hue      True
tham    False
tuoi     True
nhat    False
nheo     True
tam      True
chin     True
muoi    False
dtype: bool

In [27]:
s5[s5<s5.median()]

hong    0
hoa     0
tham    3
nhat    3
muoi    2
dtype: int64

In [28]:
'hoa' in s5

True

In [29]:
0 in s5

False

In [30]:
s5.head(3)

hong    0
hoa     0
hue     9
dtype: int64

In [31]:
s5.tail(3)

tam     5
chin    9
muoi    2
dtype: int64

In [32]:
s5.head(-3)

hong    0
hoa     0
hue     9
tham    3
tuoi    6
nhat    3
nheo    8
dtype: int64

In [33]:
s5.tail(-3)

tham    3
tuoi    6
nhat    3
nheo    8
tam     5
chin    9
muoi    2
dtype: int64

In [46]:
sd = pd.Series({'mon':21,'tue':31,'wed':42,'thu':12,'fri':52,'sat':45,'sun':75})
sd

mon    21
tue    31
wed    42
thu    12
fri    52
sat    45
sun    75
dtype: int64

In [35]:
sd1 = sd[:'fri']
sd1

mon    21
tue    31
wed    42
thu    12
fri    52
dtype: int64

In [36]:
sd2 = sd['wed':]
sd2

wed    42
thu    12
fri    52
sat    45
sun    14
dtype: int64

In [38]:
sd3 = sd1 * sd2
sd3

fri    2704.0
mon       NaN
sat       NaN
sun       NaN
thu     144.0
tue       NaN
wed    1764.0
dtype: float64

In [44]:
# filter: boolean expression 
sd[(sd<25) | (sd>50)]

mon    21
thu    12
fri    52
sun    14
dtype: int64

In [45]:
sd[(sd>25) & (sd<50)]

tue    31
wed    42
sat    45
dtype: int64

In [50]:
sd[(sd>65) & (sd>50) | (sd < 25)]

mon    21
thu    12
sun    75
dtype: int64

In [51]:
sd3.median()

1764.0

In [52]:
sd3.dropna()

fri    2704.0
thu     144.0
wed    1764.0
dtype: float64

In [53]:
sd3.ffill()

fri    2704.0
mon    2704.0
sat    2704.0
sun    2704.0
thu     144.0
tue     144.0
wed    1764.0
dtype: float64

In [54]:
sd3.fillna(0.1)

fri    2704.0
mon       0.1
sat       0.1
sun       0.1
thu     144.0
tue       0.1
wed    1764.0
dtype: float64

In [55]:
sd.cumsum()

mon     21
tue     52
wed     94
thu    106
fri    158
sat    203
sun    278
dtype: int64

In [59]:
sd.quantile()

42.0

In [65]:
n = np.random.randint(20,size=15)
n.sort()
n

array([ 0,  2,  2,  4,  5,  9,  9, 10, 11, 12, 15, 16, 17, 18, 18])

In [67]:
s6 = pd.Series(n)
s6

0      0
1      2
2      2
3      4
4      5
5      9
6      9
7     10
8     11
9     12
10    15
11    16
12    17
13    18
14    18
dtype: int64

In [72]:
s6.quantile(.3)

5.800000000000001

In [71]:
s6.describe()

count    15.000000
mean      9.866667
std       6.174448
min       0.000000
25%       4.500000
50%      10.000000
75%      15.500000
max      18.000000
dtype: float64

### 2. DataFrame

In [73]:
d = {'one':pd.Series([1., 2., 3.], index=['a','b','c']),
    'two':pd.Series([1., 2., 3., 4.], index=['a','b','c','d'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [74]:
d1 = {'one' : [1., 2., 3., 4.], #dict of list/darray
    'two' : [4., 3., 2., 1.]}
d2 = [(1, 2.,'Hello'), (2, 3.,'World')] #structure
df1 = pd.DataFrame(d1) # index and columns argument
df2 = pd.DataFrame(d2)
print(df1)
print(df2)

   one  two
0  1.0  4.0
1  2.0  3.0
2  3.0  2.0
3  4.0  1.0
   0    1      2
0  1  2.0  Hello
1  2  3.0  World


In [75]:
df1['one']

0    1.0
1    2.0
2    3.0
3    4.0
Name: one, dtype: float64

In [79]:
df2[0][1]

2

In [80]:
df3 = pd.DataFrame(d2, columns=['A','B','C'])
df3

Unnamed: 0,A,B,C
0,1,2.0,Hello
1,2,3.0,World


In [81]:
df3['A']

0    1
1    2
Name: A, dtype: int64

In [84]:
d3 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df4 = pd.DataFrame(d3, index=['r_1st','r_2nd'])
df5 = pd.DataFrame(d3, columns=['a','b'])
print(df4)
print(df5)

       a   b     c
r_1st  1   2   NaN
r_2nd  5  10  20.0
   a   b
0  1   2
1  5  10


In [86]:
df4.shape

(2, 3)

In [87]:
df4.index

Index(['r_1st', 'r_2nd'], dtype='object')

In [88]:
df4.columns

Index(['a', 'b', 'c'], dtype='object')

In [90]:
dfi = pd.read_csv('iris.csv')
dfi.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [93]:
dfi['sepal_length'].describe()

count    150.000000
mean       5.843333
std        0.828066
min        4.300000
25%        5.100000
50%        5.800000
75%        6.400000
max        7.900000
Name: sepal_length, dtype: float64

In [94]:
dfi[['sepal_length','sepal_width']].head(10)

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [95]:
dfi.loc[1]

sepal_length            4.9
sepal_width               3
petal_length            1.4
petal_width             0.2
label           Iris-setosa
Name: 1, dtype: object

In [98]:
dfi.iloc[1][:'petal_width'].median()

2.2

In [102]:
dfi[['sepal_length','sepal_width']][:10]

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [107]:
dfi[(dfi.sepal_length > 5) & (dfi.sepal_width > 4)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
15,5.7,4.4,1.5,0.4,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
33,5.5,4.2,1.4,0.2,Iris-setosa


In [109]:
dfi[dfi.label == 'Iris-setosa']

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [110]:
sn = dfi.sepal_length + dfi.sepal_width
sn

0       8.6
1       7.9
2       7.9
3       7.7
4       8.6
5       9.3
6       8.0
7       8.4
8       7.3
9       8.0
10      9.1
11      8.2
12      7.8
13      7.3
14      9.8
15     10.1
16      9.3
17      8.6
18      9.5
19      8.9
20      8.8
21      8.8
22      8.2
23      8.4
24      8.2
25      8.0
26      8.4
27      8.7
28      8.6
29      7.9
       ... 
120    10.1
121     8.4
122    10.5
123     9.0
124    10.0
125    10.4
126     9.0
127     9.1
128     9.2
129    10.2
130    10.2
131    11.7
132     9.2
133     9.1
134     8.7
135    10.7
136     9.7
137     9.5
138     9.0
139    10.0
140     9.8
141    10.0
142     8.5
143    10.0
144    10.0
145     9.7
146     8.8
147     9.5
148     9.6
149     8.9
Length: 150, dtype: float64

In [111]:
dfi[2:5] + dfi[3:6]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
2,,,,,
3,9.2,6.2,3.0,0.4,Iris-setosaIris-setosa
4,10.0,7.2,2.8,0.4,Iris-setosaIris-setosa
5,,,,,


In [112]:
dfi['newc'] = dfi.sepal_length + dfi.sepal_width
dfi.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,newc
0,5.1,3.5,1.4,0.2,Iris-setosa,8.6
1,4.9,3.0,1.4,0.2,Iris-setosa,7.9
2,4.7,3.2,1.3,0.2,Iris-setosa,7.9
3,4.6,3.1,1.5,0.2,Iris-setosa,7.7
4,5.0,3.6,1.4,0.2,Iris-setosa,8.6
5,5.4,3.9,1.7,0.4,Iris-setosa,9.3
6,4.6,3.4,1.4,0.3,Iris-setosa,8.0
7,5.0,3.4,1.5,0.2,Iris-setosa,8.4
8,4.4,2.9,1.4,0.2,Iris-setosa,7.3
9,4.9,3.1,1.5,0.1,Iris-setosa,8.0


In [113]:
dfi['newc1'] = dfi.sepal_length + dfi.sepal_width

In [114]:
dfi.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,newc,newc1
0,5.1,3.5,1.4,0.2,Iris-setosa,8.6,8.6
1,4.9,3.0,1.4,0.2,Iris-setosa,7.9,7.9
2,4.7,3.2,1.3,0.2,Iris-setosa,7.9,7.9
3,4.6,3.1,1.5,0.2,Iris-setosa,7.7,7.7
4,5.0,3.6,1.4,0.2,Iris-setosa,8.6,8.6


In [115]:
dfi.pop('newc')
dfi.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,newc1
0,5.1,3.5,1.4,0.2,Iris-setosa,8.6
1,4.9,3.0,1.4,0.2,Iris-setosa,7.9
2,4.7,3.2,1.3,0.2,Iris-setosa,7.9
3,4.6,3.1,1.5,0.2,Iris-setosa,7.7
4,5.0,3.6,1.4,0.2,Iris-setosa,8.6


In [119]:
dfin = dfi.drop(columns=['newc1'])
dfin.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [120]:
del(dfi['newc1'])
dfi.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [122]:
dfi.head().assign(ratio = dfi['sepal_length']/dfi['sepal_width'])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,ratio
0,5.1,3.5,1.4,0.2,Iris-setosa,1.457143
1,4.9,3.0,1.4,0.2,Iris-setosa,1.633333
2,4.7,3.2,1.3,0.2,Iris-setosa,1.46875
3,4.6,3.1,1.5,0.2,Iris-setosa,1.483871
4,5.0,3.6,1.4,0.2,Iris-setosa,1.388889


In [126]:
df = dfi.head()
df['ratio'] = df['sepal_length']/df['sepal_width']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label,ratio
0,5.1,3.5,1.4,0.2,Iris-setosa,1.457143
1,4.9,3.0,1.4,0.2,Iris-setosa,1.633333
2,4.7,3.2,1.3,0.2,Iris-setosa,1.46875
3,4.6,3.1,1.5,0.2,Iris-setosa,1.483871
4,5.0,3.6,1.4,0.2,Iris-setosa,1.388889


In [127]:
dfi.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5
