# 10 MINUTES TO PANDAS

# SERIES AND DATAFRAMES

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Build a Series from a plain list; pandas assigns a default integer index.
# The np.nan entry is a missing value (the recorded output shows dtype float64).
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [15]:
# Six consecutive timestamps starting at 2013-01-01
# (the recorded output shows a daily frequency, freq='D').
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# 6x4 frame of draws from np.random.randn, labelled by `dates` (rows)
# and the letters A-D (columns).
# NOTE(review): no seed is set in the visible cells, so a re-run will not
# reproduce the numbers recorded in the outputs below.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.216231,-0.781546,-2.281249,-0.364239
2013-01-02,-0.199891,0.764357,-0.303624,0.735805
2013-01-03,1.484778,-1.634738,-0.382943,0.003341
2013-01-04,0.806115,1.658027,0.195311,0.879748
2013-01-05,-1.256429,1.023362,-0.839977,1.13446
2013-01-06,-0.072067,-0.341988,0.623587,0.176876


In [17]:
# Build a frame from a dict: each key becomes a column. The scalar entries
# (A, B, F) are repeated on every row, and the length-4 Series / array /
# Categorical entries give the frame its four rows (see the output below).
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [18]:
# Per-column dtypes: each column of df2 keeps its own type (see output below).
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# VIEWING DATA

In [20]:
# Top of the frame; called without an argument, the output below shows the first 5 rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.216231,-0.781546,-2.281249,-0.364239
2013-01-02,-0.199891,0.764357,-0.303624,0.735805
2013-01-03,1.484778,-1.634738,-0.382943,0.003341
2013-01-04,0.806115,1.658027,0.195311,0.879748
2013-01-05,-1.256429,1.023362,-0.839977,1.13446


In [21]:
# Bottom of the frame: the last 3 rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.806115,1.658027,0.195311,0.879748
2013-01-05,-1.256429,1.023362,-0.839977,1.13446
2013-01-06,-0.072067,-0.341988,0.623587,0.176876


In [30]:
# Display the index, the columns, and the underlying NumPy data

In [31]:
# Row labels: the DatetimeIndex that was passed as `index=dates` when df was built.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [32]:
# Column labels.
# NOTE(review): the u'...' prefixes in the recorded output suggest this was run on a
# Python 2 kernel; a Python 3 kernel prints plain 'A', 'B', ... instead.
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [33]:
# The underlying data as a 2-D NumPy array; the row and column labels are not part of it.
# NOTE(review): newer pandas releases document DataFrame.to_numpy() as the preferred
# spelling -- confirm the pandas version in use before switching.
df.values

array([[-0.21623058, -0.78154627, -2.28124938, -0.36423922],
       [-0.19989093,  0.76435739, -0.30362414,  0.73580452],
       [ 1.48477829, -1.63473797, -0.38294301,  0.00334058],
       [ 0.80611525,  1.65802746,  0.19531144,  0.87974805],
       [-1.25642924,  1.02336203, -0.83997669,  1.13445977],
       [-0.07206715, -0.34198834,  0.62358748,  0.17687575]])

In [34]:
# describe() gives a quick statistical summary of each column
# (count, mean, std, min, quartiles, max -- see the output below).
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.091046,0.114579,-0.498149,0.427665
std,0.946063,1.241123,1.008621,0.577745
min,-1.256429,-1.634738,-2.281249,-0.364239
25%,-0.212146,-0.671657,-0.725718,0.046724
50%,-0.135979,0.211185,-0.343284,0.45634
75%,0.58657,0.958611,0.070578,0.843762
max,1.484778,1.658027,0.623587,1.13446


In [35]:
# .T returns the transpose of the frame: the column labels A-D become the rows
# and the dates become the columns.
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,-0.216231,-0.199891,1.484778,0.806115,-1.256429,-0.072067
B,-0.781546,0.764357,-1.634738,1.658027,1.023362,-0.341988
C,-2.281249,-0.303624,-0.382943,0.195311,-0.839977,0.623587
D,-0.364239,0.735805,0.003341,0.879748,1.13446,0.176876


In [37]:
# sort_index sorts by the labels along an axis; axis=1 with ascending=False
# orders the column labels in descending order (D, C, B, A).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.364239,-2.281249,-0.781546,-0.216231
2013-01-02,0.735805,-0.303624,0.764357,-0.199891
2013-01-03,0.003341,-0.382943,-1.634738,1.484778
2013-01-04,0.879748,0.195311,1.658027,0.806115
2013-01-05,1.13446,-0.839977,1.023362,-1.256429
2013-01-06,0.176876,0.623587,-0.341988,-0.072067


In [38]:
# sort_values sorts the rows by the values in a column: here column 'B',
# smallest first (see the output below).
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,1.484778,-1.634738,-0.382943,0.003341
2013-01-01,-0.216231,-0.781546,-2.281249,-0.364239
2013-01-06,-0.072067,-0.341988,0.623587,0.176876
2013-01-02,-0.199891,0.764357,-0.303624,0.735805
2013-01-05,-1.256429,1.023362,-0.839977,1.13446
2013-01-04,0.806115,1.658027,0.195311,0.879748


# SELECTION

In [43]:
# Selecting a single column by its label yields a Series named after that column.
df['A']

2013-01-01   -0.216231
2013-01-02   -0.199891
2013-01-03    1.484778
2013-01-04    0.806115
2013-01-05   -1.256429
2013-01-06   -0.072067
Freq: D, Name: A, dtype: float64

In [44]:
# Slicing with [] and integers selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.216231,-0.781546,-2.281249,-0.364239
2013-01-02,-0.199891,0.764357,-0.303624,0.735805
2013-01-03,1.484778,-1.634738,-0.382943,0.003341


In [45]:
# Slicing with date strings selects rows by index label. Unlike the positional
# slice, both endpoints are included in the output (2013-01-02 through 2013-01-04).
df['20130102' : '20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.199891,0.764357,-0.303624,0.735805
2013-01-03,1.484778,-1.634738,-0.382943,0.003341
2013-01-04,0.806115,1.658027,0.195311,0.879748


In [46]:
# Label-based selection with .loc: the row whose index label is the first date.
# The result is a Series indexed by the column labels A-D.
df.loc[dates[0]]

A   -0.216231
B   -0.781546
C   -2.281249
D   -0.364239
Name: 2013-01-01 00:00:00, dtype: float64