**Object creation**

In [1]:
#https://pandas.pydata.org/docs/user_guide/10min.html

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Build a frame from a dict of column specs. Scalars ("A", "B") are broadcast
# to the length implied by the array-like columns (four rows).
df2 = pd.DataFrame(
    {
        "A": 1.0,                                   # float scalar
        "B": pd.Timestamp("20230101"),              # timestamp scalar
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": pd.array([3] * 4, dtype="float32"),
        "F": pd.Categorical(["Alpha", "Beta", "Gamma", "Delta"]),
    }
)
df2

Unnamed: 0,A,B,C,D,F
0,1.0,2023-01-01,1.0,3.0,Alpha
1,1.0,2023-01-01,1.0,3.0,Beta
2,1.0,2023-01-01,1.0,3.0,Gamma
3,1.0,2023-01-01,1.0,3.0,Delta


In [4]:

# Each column of df2 carries its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D           float32
F          category
dtype: object

In [5]:
# Six consecutive daily timestamps starting 2023-01-01; used as the row index.
dates = pd.date_range("20230101", periods=6)

# 6x4 frame of random floats. NOTE: no seed is set, so the values differ on
# every fresh kernel run and will not match the outputs recorded here.
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-06,0.492675,0.524191,0.629641,0.253977


**Viewing data**

In [6]:
# First five rows (head() is called without an explicit row count).
df.head()

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417


In [7]:
# Last five rows (tail() is called without an explicit row count).
df.tail()

Unnamed: 0,A,B,C,D
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-06,0.492675,0.524191,0.629641,0.253977


In [8]:
# Row labels: the DatetimeIndex that was passed in as `dates`.
df.index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

**DataFrame.to_numpy()** gives a NumPy representation of the underlying data. Note that this can be an expensive operation when your DataFrame has columns with different data types, which comes down to a fundamental difference between pandas and NumPy: NumPy arrays have one dtype for the entire array, while pandas DataFrames have one dtype per column. When you call `DataFrame.to_numpy()`, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being `object`, which requires casting every value to a Python object.

In [10]:
# df2 mixes floats, timestamps and categoricals, so the array comes back with
# dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3.0, 'Alpha'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3.0, 'Beta'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3.0, 'Gamma'],
       [1.0, Timestamp('2023-01-01 00:00:00'), 1.0, 3.0, 'Delta']],
      dtype=object)

In [11]:
# Per-column summary: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.595728,0.509315,0.545725,0.518839
std,0.159619,0.243078,0.348293,0.283126
min,0.451705,0.147405,0.127148,0.253977
25%,0.496844,0.392462,0.274263,0.313449
50%,0.518199,0.530152,0.531062,0.414807
75%,0.700328,0.617746,0.807957,0.729919
max,0.8355,0.854673,0.996157,0.914823


In [12]:
# Transpose: the dates become the columns and A-D become the rows.
df.T

Unnamed: 0,2023-01-01,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06
A,0.758088,0.509351,0.451705,0.8355,0.527048,0.492675
B,0.854673,0.348552,0.536114,0.644957,0.147405,0.524191
C,0.432484,0.221522,0.996157,0.127148,0.867396,0.629641
D,0.914823,0.830087,0.400197,0.284533,0.429417,0.253977


In [13]:
# Sort along axis=1 (the column labels) in descending order: D, C, B, A.
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2023-01-01,0.914823,0.432484,0.854673,0.758088
2023-01-02,0.830087,0.221522,0.348552,0.509351
2023-01-03,0.400197,0.996157,0.536114,0.451705
2023-01-04,0.284533,0.127148,0.644957,0.8355
2023-01-05,0.429417,0.867396,0.147405,0.527048
2023-01-06,0.253977,0.629641,0.524191,0.492675


In [14]:
# Reorder the rows by ascending value of column B.
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-06,0.492675,0.524191,0.629641,0.253977
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-01,0.758088,0.854673,0.432484,0.914823


__Selection__

In [15]:
# Select a single column; the result is a Series named "A".
df['A']

2023-01-01    0.758088
2023-01-02    0.509351
2023-01-03    0.451705
2023-01-04    0.835500
2023-01-05    0.527048
2023-01-06    0.492675
Freq: D, Name: A, dtype: float64

In [16]:
# Positional row slice: rows 0, 1 and 2 (the stop position is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197


In [17]:
# Label slice on the date index; unlike a positional slice, both endpoints
# are included.
df['2023-01-01':'2023-01-03']

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197


<b>Selection by label</b>

In [18]:
# Row for the first date, returned as a Series indexed by column name.
df.loc[dates[0]]

A    0.758088
B    0.854673
C    0.432484
D    0.914823
Name: 2023-01-01 00:00:00, dtype: float64

In [19]:
# All rows, columns A and B only.
df.loc[:,["A","B"]]

Unnamed: 0,A,B
2023-01-01,0.758088,0.854673
2023-01-02,0.509351,0.348552
2023-01-03,0.451705,0.536114
2023-01-04,0.8355,0.644957
2023-01-05,0.527048,0.147405
2023-01-06,0.492675,0.524191


In [20]:
# Label slice of rows (both endpoints included), restricted to columns A and B.
df.loc["2023-01-01":"2023-01-03",["A","B"]]

Unnamed: 0,A,B
2023-01-01,0.758088,0.854673
2023-01-02,0.509351,0.348552
2023-01-03,0.451705,0.536114


In [21]:
# A single row label combined with a column list yields a Series.
df.loc["2023-01-01",["A","B"]]

A    0.758088
B    0.854673
Name: 2023-01-01 00:00:00, dtype: float64

In [22]:
# One row label plus one column label yields a scalar.
df.loc[dates[0],"A"]

0.7580884379270217

In [23]:
# The "Selection by position" heading was typed into this code cell by mistake
# (it is not valid Python); the heading lives in the markdown cell that follows.

<b>Selection by position</b>

In [24]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    0.835500
B    0.644957
C    0.127148
D    0.284533
Name: 2023-01-04 00:00:00, dtype: float64

In [25]:
# Rows at positions 3-4 and columns at positions 0-1; stop positions are excluded.
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2023-01-04,0.8355,0.644957
2023-01-05,0.527048,0.147405


In [26]:
# Explicit lists of row positions and column positions.
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2023-01-02,0.509351,0.221522
2023-01-03,0.451705,0.996157
2023-01-05,0.527048,0.867396


In [27]:
# Every row, columns at positions 1 and 2 (B and C).
df.iloc[:,1:3]

Unnamed: 0,B,C
2023-01-01,0.854673,0.432484
2023-01-02,0.348552,0.221522
2023-01-03,0.536114,0.996157
2023-01-04,0.644957,0.127148
2023-01-05,0.147405,0.867396
2023-01-06,0.524191,0.629641


In [28]:
# Redisplay the full frame for reference.
df

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-06,0.492675,0.524191,0.629641,0.253977


In [29]:
# Rows at positions 1 and 2, every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197


In [30]:
# Same selection as the earlier df.iloc[:,1:3] cell: all rows, columns B and C.
df.iloc[:,1:3]

Unnamed: 0,B,C
2023-01-01,0.854673,0.432484
2023-01-02,0.348552,0.221522
2023-01-03,0.536114,0.996157
2023-01-04,0.644957,0.127148
2023-01-05,0.147405,0.867396
2023-01-06,0.524191,0.629641


In [31]:
# Get a single scalar by integer position: row 1, column 1.
df.iloc[1,1]

0.34855220608683746

In [32]:
# Fast scalar access by integer position; returns the same value as
# df.iloc[1,1] in the previous cell.
df.iat[1,1]

0.34855220608683746

In [33]:
#diff between loc vs iloc vs iat
#https://stackoverflow.com/questions/28757389/pandas-loc-vs-iloc-vs-at-vs-iat

__Boolean indexing__

In [34]:
# Keep the rows where column A is positive; all six rows qualify here.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-06,0.492675,0.524191,0.629641,0.253977


In [35]:
# Element-wise mask: entries failing the test would be shown as NaN; none do
# in the recorded output.
df[df>0]

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,0.509351,0.348552,0.221522,0.830087
2023-01-03,0.451705,0.536114,0.996157,0.400197
2023-01-04,0.8355,0.644957,0.127148,0.284533
2023-01-05,0.527048,0.147405,0.867396,0.429417
2023-01-06,0.492675,0.524191,0.629641,0.253977


In [36]:
# Labels used as the single row of a small demo frame (spellings kept as-is).
df_g = ["Alpha", "Beta", "Gamma", "Deleta", "Eplison", "Zeta"]

# Wrapping the list in another list makes it one row with six columns.
df4 = pd.DataFrame([df_g])

# None of the labels equals "two" or "three", so the isin() mask is False
# everywhere and every cell is displayed as NaN.
df4[df4.isin(["two", "three"])]

Unnamed: 0,0,1,2,3,4,5
0,,,,,,


In [37]:
# Only the cell equal to "Alpha" survives the mask; the rest are shown as NaN.
df4[df4.isin(["Alpha"])]

Unnamed: 0,0,1,2,3,4,5
0,Alpha,,,,,


In [38]:
# Work on a copy so that df itself does not gain the extra column.
df5 = df.copy()
# One label per row (six rows, six labels).
df5["E"] =["Alpha","Beta","Gamma","Deleta","Eplison","Zeta"]

In [39]:
# df5 is df plus the new string column E.
df5

Unnamed: 0,A,B,C,D,E
2023-01-01,0.758088,0.854673,0.432484,0.914823,Alpha
2023-01-02,0.509351,0.348552,0.221522,0.830087,Beta
2023-01-03,0.451705,0.536114,0.996157,0.400197,Gamma
2023-01-04,0.8355,0.644957,0.127148,0.284533,Deleta
2023-01-05,0.527048,0.147405,0.867396,0.429417,Eplison
2023-01-06,0.492675,0.524191,0.629641,0.253977,Zeta


In [40]:
# Keep only the rows whose E value appears in the given list.
df5[df5["E"].isin(["Zeta"])]

Unnamed: 0,A,B,C,D,E
2023-01-06,0.492675,0.524191,0.629641,0.253977,Zeta


__Setting__    

Setting a new column automatically aligns the data by the indexes:

In [41]:
# Mean of each column (one value per column label).
df.mean()

A    0.595728
B    0.509315
C    0.545725
D    0.518839
dtype: float64

In [42]:
# Rebuild the same six-day index (identical to the earlier `dates`) and draw a
# fresh, unseeded 6x4 block of random floats for the statistics examples.
dates = pd.date_range("20230101", periods=6)
df7 = pd.DataFrame(
    np.random.rand(6, 4),
    columns=["A", "B", "C", "D"],
    index=dates,
)
df7

Unnamed: 0,A,B,C,D
2023-01-01,0.398727,0.014947,0.509706,0.757375
2023-01-02,0.956574,0.374508,0.145201,0.236831
2023-01-03,0.57193,0.005769,0.902947,0.062408
2023-01-04,0.676427,0.821988,0.152398,0.941422
2023-01-05,0.995931,0.821382,0.70748,0.432666
2023-01-06,0.85422,0.243216,0.087409,0.816946


In [43]:
# Mean of each column of df7.
df7.mean()

A    0.742301
B    0.380302
C    0.417524
D    0.541275
dtype: float64

In [44]:
# Mean across the columns: one value per date (row).
df7.mean(axis=1)

2023-01-01    0.420189
2023-01-02    0.428278
2023-01-03    0.385763
2023-01-04    0.648059
2023-01-05    0.739365
2023-01-06    0.500448
Freq: D, dtype: float64

In [47]:
# Date-indexed Series whose values are shifted two rows down: the first two
# dates become NaN, the original NaN lands on the last date, and the trailing
# values 6 and 8 fall off the end.
# (The method is `shift`; the previous spelling `shiftt` raises AttributeError.)
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
s

2023-01-01    NaN
2023-01-02    NaN
2023-01-03    1.0
2023-01-04    3.0
2023-01-05    5.0
2023-01-06    NaN
Freq: D, dtype: float64

In [48]:
# Subtract s from every column, matching on the row index; rows where s is NaN
# come out as NaN.
df.sub(s,axis="index")

Unnamed: 0,A,B,C,D
2023-01-01,,,,
2023-01-02,,,,
2023-01-03,-0.548295,-0.463886,-0.003843,-0.599803
2023-01-04,-2.1645,-2.355043,-2.872852,-2.715467
2023-01-05,-4.472952,-4.852595,-4.132604,-4.570583
2023-01-06,,,,


In [50]:
# Apply np.cumsum to each column: running totals down the rows.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2023-01-01,0.758088,0.854673,0.432484,0.914823
2023-01-02,1.267439,1.203226,0.654006,1.744909
2023-01-03,1.719145,1.739339,1.650163,2.145107
2023-01-04,2.554644,2.384296,1.777311,2.42964
2023-01-05,3.081692,2.531701,2.644707,2.859056
2023-01-06,3.574368,3.055892,3.274348,3.113033


In [51]:
# Range (max minus min) of each column.
df.apply(lambda x : x.max()- x.min())


A    0.383794
B    0.707268
C    0.869009
D    0.660846
dtype: float64

__Histogramming__

In [54]:
# Ten random integers in 0..6 (the upper bound 7 is exclusive); unseeded, so
# the values change on every fresh run.
s2 = pd.Series(np.random.randint(low=0, high=7, size=10))
s2

0    1
1    4
2    5
3    4
4    1
5    0
6    4
7    2
8    5
9    3
dtype: int32

In [55]:
s.value_counts()

1.0    1
3.0    1
5.0    1
Name: count, dtype: int64

__String Methods__

Series is equipped with a set of string processing methods in the str attribute that make it easy to operate on each element of the array, as in the code snippet below. Note that pattern-matching in str generally uses regular expressions by default (and in some cases always uses them). See more at Vectorized String Methods.

In [56]:
# Mixed-case strings plus one missing value, used to demonstrate the .str
# accessor.
s5 = pd.Series(
    ["A", "B", "C", "AaBa", "Baca", np.nan, "CABA", "dog", "cat"]
)
s5

0       A
1       B
2       C
3    AaBa
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [57]:
# Lower-case every string; the missing entry stays NaN.
s5.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [59]:
# Upper-case every string; the missing entry stays NaN.
s5.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

In [60]:
# The "Merge" heading was typed into this code cell by mistake (as Python it
# is an undefined name); the heading lives in the markdown cell that follows.

__Merge__