<a href="https://www.kaggle.com/code/sagorkumarmitra/pandas-walkthrough?scriptVersionId=149125577" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

# Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
dates=pd.date_range("20130101",periods=6)

In [4]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))

In [6]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


# Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [7]:
df2=pd.DataFrame(
    {
        "A":1.0,
        "B":pd.Timestamp("20130102"),
        "C":pd.Series(1,index=list(range(4)),dtype="float32"),
        "D":np.array([3]*4,dtype="int32"),
        "E":pd.Categorical(["test","train","test","train"]),
        "F":"foo",
    }
)

In [8]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df # previous DataFrame

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [10]:
df.to_numpy()

array([[ 1.03316381,  1.77015011,  0.34226134,  0.05418675],
       [ 0.3429475 , -2.35343423,  2.02270922,  0.27573848],
       [-1.09631955, -0.16425971, -0.73846507,  1.48570976],
       [-0.07184585,  0.4199159 ,  0.21326684, -0.80118373],
       [ 0.08153428, -1.56132181,  2.04378964,  0.0911516 ],
       [-1.09378012, -0.72040914, -0.80557233, -0.75941803]])

In [11]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [12]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.054187,0.342261,1.77015,1.033164
2013-01-02,0.275738,2.022709,-2.353434,0.342947
2013-01-03,1.48571,-0.738465,-0.16426,-1.09632
2013-01-04,-0.801184,0.213267,0.419916,-0.071846
2013-01-05,0.091152,2.04379,-1.561322,0.081534
2013-01-06,-0.759418,-0.805572,-0.720409,-1.09378


**DataFrame.loc**

In [13]:
df.loc['2013-01-01']

A    1.033164
B    1.770150
C    0.342261
D    0.054187
Name: 2013-01-01 00:00:00, dtype: float64

In [14]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [15]:
df.loc[dates[0]]

A    1.033164
B    1.770150
C    0.342261
D    0.054187
Name: 2013-01-01 00:00:00, dtype: float64

In [16]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.342947,-2.353434
2013-01-03,-1.09632,-0.16426
2013-01-04,-0.071846,0.419916


In [17]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,1.033164,1.77015
2013-01-02,0.342947,-2.353434
2013-01-03,-1.09632,-0.16426
2013-01-04,-0.071846,0.419916
2013-01-05,0.081534,-1.561322
2013-01-06,-1.09378,-0.720409


In [18]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.342947,-2.353434
2013-01-03,-1.09632,-0.16426
2013-01-04,-0.071846,0.419916


In [19]:
df.loc["20130102", ["A", "B"]]

A    0.342947
B   -2.353434
Name: 2013-01-02 00:00:00, dtype: float64

In [20]:
df.loc[dates[0], "A"]

1.0331638104171987

In [21]:
df.at[dates[0], "A"]

1.0331638104171987

In [22]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.071846,0.419916
2013-01-05,0.081534,-1.561322


In [23]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.342947,2.022709
2013-01-03,-1.09632,-0.738465
2013-01-05,0.081534,2.04379


In [24]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [25]:
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-05,0.081534,-1.561322,2.04379,0.091152


In [26]:
df2=df.copy()

In [27]:
df2["E"]=["one","one","two","three","four","three"]

In [28]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,1.033164,1.77015,0.342261,0.054187,one
2013-01-02,0.342947,-2.353434,2.022709,0.275738,one
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571,two
2013-01-04,-0.071846,0.419916,0.213267,-0.801184,three
2013-01-05,0.081534,-1.561322,2.04379,0.091152,four
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418,three


In [29]:
df2[df2["E"].isin(["two","four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571,two
2013-01-05,0.081534,-1.561322,2.04379,0.091152,four


In [30]:
df

Unnamed: 0,A,B,C,D
2013-01-01,1.033164,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [31]:
df.at[dates[0],"A"]=0

In [32]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,1.77015,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [33]:
df.iat[0, 1] = 0

In [34]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.342261,0.054187
2013-01-02,0.342947,-2.353434,2.022709,0.275738
2013-01-03,-1.09632,-0.16426,-0.738465,1.48571
2013-01-04,-0.071846,0.419916,0.213267,-0.801184
2013-01-05,0.081534,-1.561322,2.04379,0.091152
2013-01-06,-1.09378,-0.720409,-0.805572,-0.759418


In [35]:
df.loc[:,"D"]=np.array([5] * len(df))

  df.loc[:,"D"]=np.array([5] * len(df))


In [36]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,0.342261,5
2013-01-02,0.342947,-2.353434,2.022709,5
2013-01-03,-1.09632,-0.16426,-0.738465,5
2013-01-04,-0.071846,0.419916,0.213267,5
2013-01-05,0.081534,-1.561322,2.04379,5
2013-01-06,-1.09378,-0.720409,-0.805572,5


In [37]:
df2=df.copy()

In [38]:
df2[df2>0]=-df2

In [39]:
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.342261,-5
2013-01-02,-0.342947,-2.353434,-2.022709,-5
2013-01-03,-1.09632,-0.16426,-0.738465,-5
2013-01-04,-0.071846,-0.419916,-0.213267,-5
2013-01-05,-0.081534,-1.561322,-2.04379,-5
2013-01-06,-1.09378,-0.720409,-0.805572,-5


In [40]:
-df2

Unnamed: 0,A,B,C,D
2013-01-01,-0.0,-0.0,0.342261,5
2013-01-02,0.342947,2.353434,2.022709,5
2013-01-03,1.09632,0.16426,0.738465,5
2013-01-04,0.071846,0.419916,0.213267,5
2013-01-05,0.081534,1.561322,2.04379,5
2013-01-06,1.09378,0.720409,0.805572,5


In [41]:
import pandas as pd
import numpy as np

s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [42]:
df = pd.DataFrame(np.random.randn(10, 4))

In [43]:
df

Unnamed: 0,0,1,2,3
0,0.883798,-0.734552,-0.255279,-1.145458
1,-1.818027,-1.676989,0.479719,1.35279
2,-0.975617,0.309102,-0.922664,2.178866
3,-1.562412,-0.709633,0.031665,1.12982
4,-0.338633,0.124751,0.164523,-0.228128
5,0.557428,-0.246826,-0.248562,0.160703
6,0.812187,-2.214218,-0.616473,-0.042274
7,-0.241355,1.304137,0.116589,1.138737
8,-0.379015,-1.182839,0.773423,-0.481455
9,0.318634,-0.760937,0.306291,-1.525037


In [44]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [45]:
df = pd.DataFrame(
     {
         "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
         "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
         "C": np.random.randn(8),
         "D": np.random.randn(8),
     }
 )
 

In [46]:
df

Unnamed: 0,A,B,C,D
0,foo,one,1.666191,-0.828884
1,bar,one,1.233109,1.913421
2,foo,two,-1.954985,0.03119
3,bar,three,0.004694,-1.291613
4,foo,two,-0.165479,-0.682585
5,bar,two,-0.261697,-1.154581
6,foo,one,-0.653582,0.236788
7,foo,three,0.672182,-1.050119


In [47]:
df.groupby("A")[["C","D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.976105,-0.532773
foo,-0.435672,-2.29361


In [48]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.233109,1.913421
bar,three,0.004694,-1.291613
bar,two,-0.261697,-1.154581
foo,one,1.012609,-0.592096
foo,three,0.672182,-1.050119
foo,two,-2.120463,-0.651395


# Stack

In [49]:
tuples=list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )
)

In [50]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [51]:
zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )

<zip at 0x7c7f4473ce40>

In [52]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [53]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [54]:
arrays=[
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
]

In [55]:
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [56]:
list(zip(*arrays))

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [57]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [58]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [59]:
s = pd.Series(np.random.randn(8), index=index)

In [60]:
s

first  second
bar    one       0.034913
       two       1.608566
baz    one      -1.196574
       two      -0.116082
foo    one      -0.449305
       two      -0.119083
qux    one      -2.375527
       two       1.137415
dtype: float64

In [61]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])

In [62]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.108315,-2.235064
bar,two,-0.100466,-0.721393
baz,one,-0.751018,0.154739
baz,two,-1.526633,1.07714
foo,one,-0.079263,-0.75534
foo,two,-1.399374,-0.720533
qux,one,0.691208,1.155086
qux,two,-1.611064,0.889171


In [63]:
df2=df[:4]

In [64]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.108315,-2.235064
bar,two,-0.100466,-0.721393
baz,one,-0.751018,0.154739
baz,two,-1.526633,1.07714


In [65]:
stacked=df2.stack()

In [66]:
stacked

first  second   
bar    one     A    0.108315
               B   -2.235064
       two     A   -0.100466
               B   -0.721393
baz    one     A   -0.751018
               B    0.154739
       two     A   -1.526633
               B    1.077140
dtype: float64

In [67]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.108315,-2.235064
bar,two,-0.100466,-0.721393
baz,one,-0.751018,0.154739
baz,two,-1.526633,1.07714


In [68]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.108315,-0.100466
bar,B,-2.235064,-0.721393
baz,A,-0.751018,-1.526633
baz,B,0.154739,1.07714


In [69]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.108315,-0.751018
one,B,-2.235064,0.154739
two,A,-0.100466,-1.526633
two,B,-0.721393,1.07714


# Pivot Tables

In [70]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)

In [71]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.60492,0.173996
1,one,B,foo,-0.762342,-0.423677
2,two,C,foo,1.010396,-0.635711
3,three,A,bar,-0.266559,1.175046
4,one,B,bar,-0.414316,-0.427779
5,one,C,bar,-0.742574,-0.700377
6,two,A,foo,0.927181,1.347522
7,three,B,foo,-0.474591,0.918487
8,one,C,foo,-0.933497,-0.471965
9,one,A,bar,-0.583051,-0.050272


In [72]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.583051,-0.60492
one,B,-0.414316,-0.762342
one,C,-0.742574,-0.933497
three,A,-0.266559,
three,B,,-0.474591
three,C,0.347256,
two,A,,0.927181
two,B,1.855374,
two,C,,1.010396


In [73]:
arrays = [
   ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ["one", "two", "one", "two", "one", "two", "one", "two"],
]
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [74]:
import numpy as np
import pandas as pd
index=pd.MultiIndex.from_arrays(arrays,names=["first","second"])

In [75]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [76]:
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=["A","B"])

In [77]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.941362,0.737096
bar,two,-1.256819,0.657966
baz,one,0.591443,-0.698156
baz,two,-0.118981,0.186346
foo,one,-2.713973,1.1555
foo,two,-0.195862,0.578664
qux,one,-0.69064,0.558462
qux,two,-0.204393,-0.405058


In [78]:
df2=df[:4]

In [79]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.941362,0.737096
bar,two,-1.256819,0.657966
baz,one,0.591443,-0.698156
baz,two,-0.118981,0.186346


In [80]:
stacked = df2.stack()

In [81]:
stacked

first  second   
bar    one     A    0.941362
               B    0.737096
       two     A   -1.256819
               B    0.657966
baz    one     A    0.591443
               B   -0.698156
       two     A   -0.118981
               B    0.186346
dtype: float64

In [82]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,1.044343,-0.250256
1,one,B,foo,-0.648968,1.446932
2,two,C,foo,0.330859,-0.972489
3,three,A,bar,-0.277884,-0.000551
4,one,B,bar,0.388493,0.352952
5,one,C,bar,-0.266253,0.340092
6,two,A,foo,0.307687,0.274314
7,three,B,foo,-0.975462,-0.349713
8,one,C,foo,-0.850637,-1.499469
9,one,A,bar,-0.385186,0.541642


In [83]:
pd.pivot_table(df,values="D",index=["A","B"],columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.385186,1.044343
one,B,0.388493,-0.648968
one,C,-0.266253,-0.850637
three,A,-0.277884,
three,B,,-0.975462
three,C,1.059962,
two,A,,0.307687
two,B,-0.065333,
two,C,,0.330859


# Time series

In [84]:
rng=pd.date_range("1/1/2012",periods=100,freq="S")
ts=pd.Series(np.random.randint(0,500,len(rng)),index=rng)

In [85]:
ts

2012-01-01 00:00:00    179
2012-01-01 00:00:01    240
2012-01-01 00:00:02    175
2012-01-01 00:00:03    200
2012-01-01 00:00:04    417
                      ... 
2012-01-01 00:01:35    274
2012-01-01 00:01:36    336
2012-01-01 00:01:37     49
2012-01-01 00:01:38    406
2012-01-01 00:01:39    439
Freq: S, Length: 100, dtype: int64

In [86]:
ts.resample("5Min").sum()

2012-01-01    26031
Freq: 5T, dtype: int64

In [87]:
ts

2012-01-01 00:00:00    179
2012-01-01 00:00:01    240
2012-01-01 00:00:02    175
2012-01-01 00:00:03    200
2012-01-01 00:00:04    417
                      ... 
2012-01-01 00:01:35    274
2012-01-01 00:01:36    336
2012-01-01 00:01:37     49
2012-01-01 00:01:38    406
2012-01-01 00:01:39    439
Freq: S, Length: 100, dtype: int64

In [88]:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")

In [89]:
ts = pd.Series(np.random.randn(len(rng)), rng)

In [90]:
ts

2012-03-06   -0.778081
2012-03-07    1.000824
2012-03-08    1.012013
2012-03-09   -0.897392
2012-03-10    0.941231
Freq: D, dtype: float64

In [91]:
ts_utc = ts.tz_localize("UTC")

In [92]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00   -0.778081
2012-03-06 19:00:00-05:00    1.000824
2012-03-07 19:00:00-05:00    1.012013
2012-03-08 19:00:00-05:00   -0.897392
2012-03-09 19:00:00-05:00    0.941231
Freq: D, dtype: float64

In [93]:
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [94]:
# rng+pd.offsets.Businessdasy(5)

 # Categoricals

In [95]:
df = pd.DataFrame(
    {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df["grade"] = df["raw_grade"].astype("category")
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [96]:
pd.DataFrame(np.random.randint(0, 5, (10, 5)))

Unnamed: 0,0,1,2,3,4
0,1,1,1,4,1
1,4,1,1,0,2
2,4,2,4,4,2
3,1,2,2,0,1
4,3,2,2,3,3
5,3,4,0,3,0
6,3,1,1,3,3
7,4,3,3,2,4
8,1,3,2,0,4
9,2,0,3,3,2


# Intro to data structures

In [97]:
import numpy as np
import pandas as pd

**Series**

In [98]:
s=pd.Series(np.random.randn(5),index=["a","b","c","d","e"])

In [99]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [100]:
d = {"b": 1, "a": 0, "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

# Time series frequencies

In [101]:
# Alias    Description
# B        business day frequency
# C        custom business day frequency
# D        calendar day frequency
# W        weekly frequency
# M        month end frequency
# SM       semi-month end frequency (15th and end of month)
# BM       business month end frequency
# CBM      custom business month end frequency
# MS       month start frequency
# SMS      semi-month start frequency (1st and 15th)
# BMS      business month start frequency
# CBMS     custom business month start frequency
# Q        quarter end frequency
# BQ       business quarter end frequency
# QS       quarter start frequency
# BQS      business quarter start frequency
# A, Y     year end frequency
# BA, BY   business year end frequency
# AS, YS   year start frequency
# BAS, BYS business year start frequency
# BH       business hour frequency
# H        hourly frequency
# T, min   minutely frequency
# S        secondly frequency
# L, ms    milliseconds
# U, us    microseconds
# N        nanoseconds

# Cleaning Empty Cells

In [102]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# new_df = df.dropna()

# print(new_df.to_string())

In [103]:
import numpy as np
import pandas as pd

from io import StringIO

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"
df = pd.read_csv(StringIO(data), dtype=object)

In [104]:
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [105]:
df=pd.read_csv(StringIO(data),dtype={"b":object,"c":np.float64,"d":"Int64"})

In [106]:
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [107]:
df=pd.read_csv(StringIO(data),converters={"col_1":str})

In [108]:
# df["col_1"].apply(type).value_counts()

In [109]:
col_1 = list(range(500000)) + ["a", "b"] + list(range(500000))

In [110]:
df = pd.DataFrame({"col_1": col_1})

In [111]:
df

Unnamed: 0,col_1
0,0
1,1
2,2
3,3
4,4
...,...
999997,499995
999998,499996
999999,499997
1000000,499998


In [112]:
df.to_csv("foo.csv")

In [113]:
mixed_df=pd.read_csv("foo.csv")

  mixed_df=pd.read_csv("foo.csv")


In [114]:
mixed_df

Unnamed: 0.1,Unnamed: 0,col_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
999997,999997,499995
999998,999998,499996
999999,999999,499997
1000000,1000000,499998


In [115]:
mixed_df["col_1"].apply(type).value_counts()

<class 'int'>    737858
<class 'str'>    262144
Name: col_1, dtype: int64

# GroupBy

**Groupby iterator.**

In [116]:
import numpy as np
import pandas as pd

lst=['a','a','b']
ser=pd.Series([1,2,3], index=lst)

for x,y in ser.groupby(level=0):
    print(f'{x}\n{y}\n')

a
a    1
a    2
dtype: int64

b
b    3
dtype: int64



In [117]:
data = [[1, 2, 3], [1, 5, 6], [7, 8, 9]]
df = pd.DataFrame(data, columns=["a", "b", "c"])

for x,y in df.groupby(by=["a"]):
    print(f'{x}\n{y}\n')

1
   a  b  c
0  1  2  3
1  1  5  6

7
   a  b  c
2  7  8  9



  for x,y in df.groupby(by=["a"]):


# Group by: split-apply-combine

**By “group by” we are referring to a process involving one or more of the following steps:**

* Splitting the data into groups based on some criteria.

* Applying a function to each group independently.

* Combining the results into a data structure.

**Out of these, the split step is the most straightforward. In fact, in many situations we may wish to split the data set into groups and do something with those groups. In the apply step, we might wish to do one of the following:**

**Aggregation: compute a summary statistic (or statistics) for each group. Some examples:**

* Compute group sums or means.

* Compute group sizes / counts.

**Transformation: perform some group-specific computations and return a like-indexed object. Some examples:**

* Standardize data (zscore) within a group.

* Filling NAs within groups with a value derived from each group.

**Filtration: discard some groups, according to a group-wise computation that evaluates to True or False. Some examples:**

* Discard data that belong to groups with only a few members.

* Filter out data based on the group sum or mean.

***Splitting an object into groups***

In [118]:

import numpy as np
import pandas as pd

speeds=pd.DataFrame(
    [
        ("bird", "Falconiformes", 389.0),
        ("bird", "Psittaciformes", 24.0),
        ("mammal", "Carnivora", 80.2),
        ("mammal", "Primates", np.nan),
        ("mammal", "Carnivora", 58),
    ],
    index=["falcon", "parrot", "lion", "monkey", "leopard"],
    columns=("class", "order", "max_speed"),
)

In [119]:
speeds

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [120]:
speeds.groupby("class").head()

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [121]:
speeds.groupby(["class", "order"]).head()

Unnamed: 0,class,order,max_speed
falcon,bird,Falconiformes,389.0
parrot,bird,Psittaciformes,24.0
lion,mammal,Carnivora,80.2
monkey,mammal,Primates,
leopard,mammal,Carnivora,58.0


In [122]:
df=pd.DataFrame({
    "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
    "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
    "C": np.random.randn(8),
    "D": np.random.randn(8),
})

In [123]:
df

Unnamed: 0,A,B,C,D
0,foo,one,1.157019,-1.550396
1,bar,one,-1.222328,-0.937805
2,foo,two,-0.291582,-0.001866
3,bar,three,0.462042,-0.613906
4,foo,two,1.877184,1.507131
5,bar,two,0.013912,0.436249
6,foo,one,-1.166284,-0.220298
7,foo,three,-0.206771,0.501958


In [124]:
df.groupby("A").head()

Unnamed: 0,A,B,C,D
0,foo,one,1.157019,-1.550396
1,bar,one,-1.222328,-0.937805
2,foo,two,-0.291582,-0.001866
3,bar,three,0.462042,-0.613906
4,foo,two,1.877184,1.507131
5,bar,two,0.013912,0.436249
6,foo,one,-1.166284,-0.220298
7,foo,three,-0.206771,0.501958


In [125]:
df.groupby(["A","B"]).head()

Unnamed: 0,A,B,C,D
0,foo,one,1.157019,-1.550396
1,bar,one,-1.222328,-0.937805
2,foo,two,-0.291582,-0.001866
3,bar,three,0.462042,-0.613906
4,foo,two,1.877184,1.507131
5,bar,two,0.013912,0.436249
6,foo,one,-1.166284,-0.220298
7,foo,three,-0.206771,0.501958


In [126]:
np.random.normal(0, 1.5, 200)

array([-0.92695772, -0.6015403 ,  0.61917037, -1.31798392,  0.1280348 ,
        1.05541874,  1.55259975, -0.24340718,  0.62754974, -1.41555956,
        0.82179728,  0.47941624, -0.45294451, -0.57943491, -0.47184005,
        3.22138932, -0.4388413 ,  1.86680822,  3.20923993,  0.20870024,
       -0.83921139, -3.75447584, -0.15681297,  1.3047252 , -0.99135784,
        1.30612104, -2.23501126,  0.80724537, -0.22761553, -2.22972172,
        0.38865841,  0.19700782,  1.32572342,  0.08324667,  0.053352  ,
        2.1518703 ,  0.76968778,  2.79194569, -3.22844388,  0.75093267,
        1.44367732,  2.06472918, -1.03510404,  1.38711533,  0.19664822,
        2.15379298,  1.31308747, -0.59367388, -0.60778958, -0.11305045,
        1.79299955, -0.93483199, -1.06198051,  1.77655699,  0.18458354,
       -0.78639972,  0.31085672,  1.64388225,  1.58063684, -0.29362682,
       -1.63968259, -0.19126256,  2.04013976, -0.01607975, -3.0842114 ,
        0.430366  , -1.82451537,  1.68923929, -0.13968364,  0.08

# Pandas datetime

In [127]:
import pandas as pd
import numpy as np

In [128]:
pd.date_range('2020-01-01',periods=7,freq='D')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07'],
              dtype='datetime64[ns]', freq='D')

In [129]:
pd.date_range('Jan 01, 2018', periods=7, freq='D')

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07'],
              dtype='datetime64[ns]', freq='D')

In [130]:
pd.to_datetime(['2--1--2018','3--1--2018'],format='%d--%m--%Y')

DatetimeIndex(['2018-01-02', '2018-01-03'], dtype='datetime64[ns]', freq=None)

In [131]:
data = np.random.randn(3,2)
cols = ['A','B']
print(data)

[[-1.24222198  1.72685285]
 [ 1.35305153  0.47338506]
 [-0.07846003  1.40927691]]


In [132]:
idx = pd.date_range('2020-01-01',periods=3,freq='D')

df=pd.DataFrame(data,index=idx, columns=cols)

In [133]:
df

Unnamed: 0,A,B
2020-01-01,-1.242222,1.726853
2020-01-02,1.353052,0.473385
2020-01-03,-0.07846,1.409277


In [134]:
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq='D')

In [135]:
df.index.max()

Timestamp('2020-01-03 00:00:00', freq='D')

In [136]:
df.index.argmax()

2