<a href="https://www.kaggle.com/code/sagorkumarmitra/pandas-walkthrough?scriptVersionId=145335034" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 10 minutes to pandas

In [1]:
import numpy as np
import pandas as pd

# Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [3]:
dates=pd.date_range("20130101",periods=6)

In [4]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list("ABCD"))

In [6]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


# Creating a DataFrame by passing a dictionary of objects that can be converted into a series-like structure:

In [7]:
df2=pd.DataFrame(
    {
        "A":1.0,
        "B":pd.Timestamp("20130102"),
        "C":pd.Series(1,index=list(range(4)),dtype="float32"),
        "D":np.array([3]*4,dtype="int32"),
        "E":pd.Categorical(["test","train","test","train"]),
        "F":"foo",
    }
)

In [8]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [9]:
df # previous DataFrame

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [10]:
df.to_numpy()

array([[ 0.86014703, -1.93363417, -0.47983027, -0.83605256],
       [-0.77280737, -0.40776648, -0.48190224, -0.05611372],
       [-1.39902558, -1.44611363, -0.89216704,  1.48915572],
       [-1.97074457, -0.55111047,  1.09046731, -0.9711388 ],
       [-0.40137488,  1.44159968,  0.03335179,  1.22998448],
       [-0.53485884, -0.82384911,  0.68420011, -0.93837615]])

In [11]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [12]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.836053,-0.47983,-1.933634,0.860147
2013-01-02,-0.056114,-0.481902,-0.407766,-0.772807
2013-01-03,1.489156,-0.892167,-1.446114,-1.399026
2013-01-04,-0.971139,1.090467,-0.55111,-1.970745
2013-01-05,1.229984,0.033352,1.4416,-0.401375
2013-01-06,-0.938376,0.6842,-0.823849,-0.534859


**DataFrame.loc**

In [13]:
df.loc['2013-01-01']

A    0.860147
B   -1.933634
C   -0.479830
D   -0.836053
Name: 2013-01-01 00:00:00, dtype: float64

In [14]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [15]:
df.loc[dates[0]]

A    0.860147
B   -1.933634
C   -0.479830
D   -0.836053
Name: 2013-01-01 00:00:00, dtype: float64

In [16]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.772807,-0.407766
2013-01-03,-1.399026,-1.446114
2013-01-04,-1.970745,-0.55111


In [17]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.860147,-1.933634
2013-01-02,-0.772807,-0.407766
2013-01-03,-1.399026,-1.446114
2013-01-04,-1.970745,-0.55111
2013-01-05,-0.401375,1.4416
2013-01-06,-0.534859,-0.823849


In [18]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,-0.772807,-0.407766
2013-01-03,-1.399026,-1.446114
2013-01-04,-1.970745,-0.55111


In [19]:
df.loc["20130102", ["A", "B"]]

A   -0.772807
B   -0.407766
Name: 2013-01-02 00:00:00, dtype: float64

In [20]:
df.loc[dates[0], "A"]

0.8601470318417227

In [21]:
df.at[dates[0], "A"]

0.8601470318417227

In [22]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.970745,-0.55111
2013-01-05,-0.401375,1.4416


In [23]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.772807,-0.481902
2013-01-03,-1.399026,-0.892167
2013-01-05,-0.401375,0.033352


In [24]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [25]:
df[df["A"]>0]

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053


In [26]:
df2=df.copy()

In [27]:
df2["E"]=["one","one","two","three","four","three"]

In [28]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053,one
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114,one
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156,two
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139,three
2013-01-05,-0.401375,1.4416,0.033352,1.229984,four
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376,three


In [29]:
df2[df2["E"].isin(["two","four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156,two
2013-01-05,-0.401375,1.4416,0.033352,1.229984,four


In [30]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.860147,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [31]:
df.at[dates[0],"A"]=0

In [32]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,-1.933634,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [33]:
df.iat[0, 1] = 0

In [34]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.47983,-0.836053
2013-01-02,-0.772807,-0.407766,-0.481902,-0.056114
2013-01-03,-1.399026,-1.446114,-0.892167,1.489156
2013-01-04,-1.970745,-0.55111,1.090467,-0.971139
2013-01-05,-0.401375,1.4416,0.033352,1.229984
2013-01-06,-0.534859,-0.823849,0.6842,-0.938376


In [35]:
df.loc[:,"D"]=np.array([5] * len(df))

  df.loc[:,"D"]=np.array([5] * len(df))


In [36]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.47983,5
2013-01-02,-0.772807,-0.407766,-0.481902,5
2013-01-03,-1.399026,-1.446114,-0.892167,5
2013-01-04,-1.970745,-0.55111,1.090467,5
2013-01-05,-0.401375,1.4416,0.033352,5
2013-01-06,-0.534859,-0.823849,0.6842,5


In [37]:
df2=df.copy()

In [38]:
df2[df2>0]=-df2

In [39]:
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.47983,-5
2013-01-02,-0.772807,-0.407766,-0.481902,-5
2013-01-03,-1.399026,-1.446114,-0.892167,-5
2013-01-04,-1.970745,-0.55111,-1.090467,-5
2013-01-05,-0.401375,-1.4416,-0.033352,-5
2013-01-06,-0.534859,-0.823849,-0.6842,-5


In [40]:
-df2

Unnamed: 0,A,B,C,D
2013-01-01,-0.0,-0.0,0.47983,5
2013-01-02,0.772807,0.407766,0.481902,5
2013-01-03,1.399026,1.446114,0.892167,5
2013-01-04,1.970745,0.55111,1.090467,5
2013-01-05,0.401375,1.4416,0.033352,5
2013-01-06,0.534859,0.823849,0.6842,5


In [41]:
import pandas as pd
import numpy as np

s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"])
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [42]:
df = pd.DataFrame(np.random.randn(10, 4))

In [43]:
df

Unnamed: 0,0,1,2,3
0,-1.633571,-0.01679,1.198109,-0.540091
1,-0.360663,-0.915368,-0.594786,0.167495
2,-0.321762,0.142117,-1.343312,0.631665
3,0.433198,0.402008,-1.106269,-0.4563
4,-0.648203,1.147896,2.009484,0.347734
5,0.811997,1.150661,2.176003,1.040776
6,-0.594183,-0.654127,0.201597,0.600299
7,0.575565,-0.823939,-1.036429,1.271397
8,0.303185,0.185887,-0.169357,0.784936
9,-1.164668,0.730173,0.391492,-0.733102


In [44]:
left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]})
right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]})
pd.merge(left, right, on="key")

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


In [45]:
df = pd.DataFrame(
     {
         "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
         "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
         "C": np.random.randn(8),
         "D": np.random.randn(8),
     }
 )
 

In [46]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.949009,-0.391814
1,bar,one,-0.175666,-1.910016
2,foo,two,-0.73032,2.666664
3,bar,three,0.173587,-0.840697
4,foo,two,0.349091,1.138259
5,bar,two,-0.643931,-1.456664
6,foo,one,-0.33848,-1.699517
7,foo,three,1.521962,-0.033256


In [47]:
df.groupby("A")[["C","D"]].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.64601,-4.207377
foo,1.751262,1.680337


In [48]:
df.groupby(["A", "B"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.175666,-1.910016
bar,three,0.173587,-0.840697
bar,two,-0.643931,-1.456664
foo,one,0.610529,-2.091331
foo,three,1.521962,-0.033256
foo,two,-0.381229,3.804923


# Stack

In [49]:
tuples=list(
    zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )
)

In [50]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [51]:
zip(
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
    )

<zip at 0x7a5538a12040>

In [52]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [53]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [54]:
arrays=[
        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
        ["one", "two", "one", "two", "one", "two", "one", "two"],
]

In [55]:
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [56]:
list(zip(*arrays))

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [57]:
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

In [58]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [59]:
s = pd.Series(np.random.randn(8), index=index)

In [60]:
s

first  second
bar    one      -0.938663
       two       0.241603
baz    one      -0.645996
       two      -0.392015
foo    one      -0.025045
       two      -0.002669
qux    one      -0.094438
       two      -1.797596
dtype: float64

In [61]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])

In [62]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.694574,-0.849773
bar,two,-0.185691,0.298765
baz,one,0.286282,-0.707001
baz,two,-0.697431,-1.049139
foo,one,-1.665758,0.853795
foo,two,2.823737,-0.953395
qux,one,-1.211285,-0.360972
qux,two,1.38092,-0.463413


In [63]:
df2=df[:4]

In [64]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.694574,-0.849773
bar,two,-0.185691,0.298765
baz,one,0.286282,-0.707001
baz,two,-0.697431,-1.049139


In [65]:
stacked=df2.stack()

In [66]:
stacked

first  second   
bar    one     A   -0.694574
               B   -0.849773
       two     A   -0.185691
               B    0.298765
baz    one     A    0.286282
               B   -0.707001
       two     A   -0.697431
               B   -1.049139
dtype: float64

In [67]:
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.694574,-0.849773
bar,two,-0.185691,0.298765
baz,one,0.286282,-0.707001
baz,two,-0.697431,-1.049139


In [68]:
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.694574,-0.185691
bar,B,-0.849773,0.298765
baz,A,0.286282,-0.697431
baz,B,-0.707001,-1.049139


In [69]:
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.694574,0.286282
one,B,-0.849773,-0.707001
two,A,-0.185691,-0.697431
two,B,0.298765,-1.049139


# Pivot Tables

In [70]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)

In [71]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-1.248174,0.408096
1,one,B,foo,1.573161,0.571375
2,two,C,foo,-0.522075,0.17168
3,three,A,bar,0.759574,-1.218231
4,one,B,bar,-0.577737,0.231107
5,one,C,bar,-1.491902,-1.423369
6,two,A,foo,0.190556,0.654786
7,three,B,foo,1.27316,-1.446386
8,one,C,foo,-0.040131,-1.239096
9,one,A,bar,0.794275,0.687174


In [72]:
pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.794275,-1.248174
one,B,-0.577737,1.573161
one,C,-1.491902,-0.040131
three,A,0.759574,
three,B,,1.27316
three,C,-1.009025,
two,A,,0.190556
two,B,1.857932,
two,C,,-0.522075


In [73]:
arrays = [
   ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
   ["one", "two", "one", "two", "one", "two", "one", "two"],
]
arrays

[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
 ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [74]:
import numpy as np
import pandas as pd
index=pd.MultiIndex.from_arrays(arrays,names=["first","second"])

In [75]:
index

MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])

In [76]:
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=["A","B"])

In [77]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.256255,0.49499
bar,two,-0.348,-1.809996
baz,one,0.982005,-0.612843
baz,two,0.462446,-0.268282
foo,one,-2.491033,-1.52183
foo,two,-0.824925,-0.254274
qux,one,-1.267682,-1.132781
qux,two,1.697986,1.272514


In [78]:
df2=df[:4]

In [79]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.256255,0.49499
bar,two,-0.348,-1.809996
baz,one,0.982005,-0.612843
baz,two,0.462446,-0.268282


In [80]:
stacked = df2.stack()

In [81]:
stacked

first  second   
bar    one     A   -0.256255
               B    0.494990
       two     A   -0.348000
               B   -1.809996
baz    one     A    0.982005
               B   -0.612843
       two     A    0.462446
               B   -0.268282
dtype: float64

In [82]:
df = pd.DataFrame(
    {
        "A": ["one", "one", "two", "three"] * 3,
        "B": ["A", "B", "C"] * 4,
        "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,0.595291,-0.772828
1,one,B,foo,1.1557,-1.187484
2,two,C,foo,-1.076652,-0.051279
3,three,A,bar,1.272819,-0.268071
4,one,B,bar,-0.281873,-2.966412
5,one,C,bar,0.516983,-0.282756
6,two,A,foo,1.840931,0.355233
7,three,B,foo,-0.807529,0.940504
8,one,C,foo,-0.835728,1.374761
9,one,A,bar,-0.214748,-0.479428


In [83]:
pd.pivot_table(df,values="D",index=["A","B"],columns=["C"])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.214748,0.595291
one,B,-0.281873,1.1557
one,C,0.516983,-0.835728
three,A,1.272819,
three,B,,-0.807529
three,C,1.284301,
two,A,,1.840931
two,B,1.239337,
two,C,,-1.076652


# Time series

In [84]:
rng=pd.date_range("1/1/2012",periods=100,freq="S")
ts=pd.Series(np.random.randint(0,500,len(rng)),index=rng)

In [85]:
ts

2012-01-01 00:00:00    338
2012-01-01 00:00:01    445
2012-01-01 00:00:02    440
2012-01-01 00:00:03    102
2012-01-01 00:00:04    211
                      ... 
2012-01-01 00:01:35     20
2012-01-01 00:01:36    182
2012-01-01 00:01:37    204
2012-01-01 00:01:38    369
2012-01-01 00:01:39    374
Freq: S, Length: 100, dtype: int64

In [86]:
ts.resample("5Min").sum()

2012-01-01    25087
Freq: 5T, dtype: int64

In [87]:
ts

2012-01-01 00:00:00    338
2012-01-01 00:00:01    445
2012-01-01 00:00:02    440
2012-01-01 00:00:03    102
2012-01-01 00:00:04    211
                      ... 
2012-01-01 00:01:35     20
2012-01-01 00:01:36    182
2012-01-01 00:01:37    204
2012-01-01 00:01:38    369
2012-01-01 00:01:39    374
Freq: S, Length: 100, dtype: int64

In [88]:
rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D")

In [89]:
ts = pd.Series(np.random.randn(len(rng)), rng)

In [90]:
ts

2012-03-06   -1.429559
2012-03-07   -1.731505
2012-03-08   -0.059579
2012-03-09   -0.047880
2012-03-10    1.491555
Freq: D, dtype: float64

In [91]:
ts_utc = ts.tz_localize("UTC")

In [92]:
ts_utc.tz_convert("US/Eastern")

2012-03-05 19:00:00-05:00   -1.429559
2012-03-06 19:00:00-05:00   -1.731505
2012-03-07 19:00:00-05:00   -0.059579
2012-03-08 19:00:00-05:00   -0.047880
2012-03-09 19:00:00-05:00    1.491555
Freq: D, dtype: float64

In [93]:
rng

DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
               '2012-03-10'],
              dtype='datetime64[ns]', freq='D')

In [94]:
# rng+pd.offsets.Businessdasy(5)

 # Categoricals

In [95]:
df = pd.DataFrame(
    {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
)
df["grade"] = df["raw_grade"].astype("category")
df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [96]:
pd.DataFrame(np.random.randint(0, 5, (10, 5)))

Unnamed: 0,0,1,2,3,4
0,4,0,4,4,1
1,1,1,1,0,1
2,0,4,3,1,1
3,1,4,2,0,1
4,2,1,0,0,2
5,1,2,0,0,0
6,0,4,3,2,0
7,4,0,4,4,3
8,3,1,3,4,2
9,0,4,1,3,1


# Intro to data structures

In [97]:
import numpy as np
import pandas as pd

**Series**

In [98]:
s=pd.Series(np.random.randn(5),index=["a","b","c","d","e"])

In [99]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [100]:
d = {"b": 1, "a": 0, "c": 2}

pd.Series(d)

b    1
a    0
c    2
dtype: int64

# Time series frequencies

In [101]:
# Alias    Description
# B        business day frequency
# C        custom business day frequency
# D        calendar day frequency
# W        weekly frequency
# M        month end frequency
# SM       semi-month end frequency (15th and end of month)
# BM       business month end frequency
# CBM      custom business month end frequency
# MS       month start frequency
# SMS      semi-month start frequency (1st and 15th)
# BMS      business month start frequency
# CBMS     custom business month start frequency
# Q        quarter end frequency
# BQ       business quarter end frequency
# QS       quarter start frequency
# BQS      business quarter start frequency
# A, Y     year end frequency
# BA, BY   business year end frequency
# AS, YS   year start frequency
# BAS, BYS business year start frequency
# BH       business hour frequency
# H        hourly frequency
# T, min   minutely frequency
# S        secondly frequency
# L, ms    milliseconds
# U, us    microseconds
# N        nanoseconds

# Cleaning Empty Cells

In [102]:
# import pandas as pd

# df = pd.read_csv('data.csv')

# new_df = df.dropna()

# print(new_df.to_string())

In [103]:
import numpy as np
import pandas as pd

from io import StringIO

data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11"
df = pd.read_csv(StringIO(data), dtype=object)

In [104]:
df

Unnamed: 0,a,b,c,d
0,1,2,3,4.0
1,5,6,7,8.0
2,9,10,11,


In [105]:
df=pd.read_csv(StringIO(data),dtype={"b":object,"c":np.float64,"d":"Int64"})

In [106]:
df.dtypes

a      int64
b     object
c    float64
d      Int64
dtype: object

In [107]:
df=pd.read_csv(StringIO(data),converters={"col_1":str})

In [108]:
# df["col_1"].apply(type).value_counts()

In [109]:
col_1 = list(range(500000)) + ["a", "b"] + list(range(500000))

In [110]:
df = pd.DataFrame({"col_1": col_1})

In [111]:
df

Unnamed: 0,col_1
0,0
1,1
2,2
3,3
4,4
...,...
999997,499995
999998,499996
999999,499997
1000000,499998


In [112]:
df.to_csv("foo.csv")

In [113]:
mixed_df=pd.read_csv("foo.csv")

  mixed_df=pd.read_csv("foo.csv")


In [114]:
mixed_df

Unnamed: 0.1,Unnamed: 0,col_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4
...,...,...
999997,999997,499995
999998,999998,499996
999999,999999,499997
1000000,1000000,499998


In [115]:
mixed_df["col_1"].apply(type).value_counts()

<class 'int'>    737858
<class 'str'>    262144
Name: col_1, dtype: int64