In [2]:
import pandas as pd
import numpy as np

In [3]:
# create a small dictionary with different data types

dft = pd.DataFrame(dict(A = np.random.rand(3),
                        B = 1,
                        C = 'foo',
                        D = pd.Timestamp('20010102'),
                        E = pd.Series([1.0]*3).astype('float32'),
                                F = False,
                                G = pd.Series([1]*3,dtype='int8')))

dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.128434,1,foo,2001-01-02,1.0,False,1
1,0.960969,1,foo,2001-01-02,1.0,False,1
2,0.703165,1,foo,2001-01-02,1.0,False,1


In [4]:
# There is a really easy way to see what kind of dtypes 
# are in each column. 

dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [5]:
# If a pandas object contains data multiple dtypes IN A 
# SINGLE COLUMN, the dtype of the column will be chosen 
# to accommodate all of the data types (object is the 
# most general).
# these ints are coerced to floats

pd.Series([1, 2, 3, 4, 5, 6.])

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
dtype: float64

In [15]:
# string data forces an ``object`` dtype

test = pd.Series([1, 2, 3, 6., 'foo'])
test


0      1
1      2
2      3
3      6
4    foo
dtype: object

In [17]:
test[4]

'foo'

In [7]:
# The method get_dtype_counts() will return the number 
# of columns of each type in a DataFrame:

dft.get_dtype_counts()

bool              1
datetime64[ns]    1
float32           1
float64           1
int64             1
int8              1
object            1
dtype: int64

In [19]:
# create a small data frame. 

df = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
df

Unnamed: 0,a,b,c,d
0,-0.478906,0.914618,0.941338,1.569319
1,0.00627,0.46835,0.483795,0.118471
2,1.463132,0.773577,-0.589309,1.401843
3,-1.177397,1.948374,-1.844425,-1.448602
4,0.235092,0.006146,-1.045874,0.819477


In [21]:
# Use df.apply to find the square root of all the values. 
# NaN means not a number

dfnan = df.apply(np.sqrt)
dfnan

Unnamed: 0,a,b,c,d
0,,0.956357,0.970226,1.252725
1,0.079184,0.684361,0.695553,0.344196
2,1.2096,0.879532,,1.183994
3,,1.395842,,
4,0.484863,0.078397,,0.90525


In [29]:
# find the mean of all of the columns, axis =0 is for columns

df.apply(np.mean, axis=0)

a    0.009638
b    0.822213
c   -0.410895
d    0.492102
dtype: float64

In [28]:
#this fills the NaN values in 'a' column
dfnan["a"] = dfnan.fillna(np.mean(dfnan['a']))
dfnan

Unnamed: 0,a,b,c,d
0,0.009638,0.956357,0.009638,1.252725
1,0.079184,0.684361,0.079184,0.344196
2,1.2096,0.879532,1.2096,1.183994
3,0.009638,1.395842,0.009638,
4,0.484863,0.078397,0.484863,0.90525


In [11]:
# find the mean of all of the rows-axis 1 is for rows

df.apply(np.mean, axis=1)

0    0.699858
1   -0.197451
2   -1.522729
3    0.552612
4    0.005904
dtype: float64

In [33]:
df.fillna()

ValueError: must specify a fill method or value

In [4]:
import numpy as np
import pandas as pd
# Let's create a random array with 50 numbers, ranging 
# from 0 to 7.

data = np.random.randint(0, 7, size = 50)
data

array([6, 5, 1, 2, 0, 3, 3, 2, 6, 3, 3, 3, 5, 5, 3, 3, 4, 3, 4, 1, 1, 1, 1,
       3, 2, 1, 6, 4, 5, 1, 1, 2, 4, 6, 4, 4, 5, 2, 5, 5, 2, 0, 6, 6, 4, 0,
       2, 0, 0, 2])

In [7]:
# convert the array into a series

s = pd.Series(data)
s


0     6
1     5
2     1
3     2
4     0
5     3
6     3
7     2
8     6
9     3
10    3
11    3
12    5
13    5
14    3
15    3
16    4
17    3
18    4
19    1
20    1
21    1
22    1
23    3
24    2
25    1
26    6
27    4
28    5
29    1
30    1
31    2
32    4
33    6
34    4
35    4
36    5
37    2
38    5
39    5
40    2
41    0
42    6
43    6
44    4
45    0
46    2
47    0
48    0
49    2
dtype: int64

In [14]:
# How many of each number is there in the series? Enter 
# value_counts()

pd.value_counts(s)

6    10
0     9
4     8
2     8
1     8
5     5
3     2
dtype: int64