In [22]:
import pandas as pd
import numpy as np

# Series

In [23]:
# Build a Series from a plain list; np.nan stands in for a missing value.
s = pd.Series(data=[1, 3, 4, np.nan, 9])

In [24]:
# Display the Series: the integers are shown as floats (dtype float64)
# alongside the NaN entry.
s

0    1.0
1    3.0
2    4.0
3    NaN
4    9.0
dtype: float64

# Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [25]:
# Five consecutive calendar days starting on 2020-01-01, used below as a row index.
dates = pd.date_range(start="20200101", periods=5, freq="D")

In [26]:
# Display the DatetimeIndex created above (daily frequency, five entries).
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05'],
              dtype='datetime64[ns]', freq='D')

# Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [27]:
# Every dict key becomes a column. "D" and "E" are array-likes with four
# entries each; the scalar values are repeated on every row.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": 2,
        "D": np.array([3, 3, 3, 3]),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [28]:
# Display the frame: the scalar inputs (A, B, C, F) appear on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,2,3,test,foo
1,1.0,2013-01-02,2,3,train,foo
2,1.0,2013-01-02,2,3,test,foo
3,1.0,2013-01-02,2,3,train,foo


In [29]:
# Each column carries its own dtype; the columns of df2 do not share one.
df2.dtypes

A           float64
B    datetime64[ns]
C             int64
D             int64
E          category
F            object
dtype: object

In [30]:
# Convert the whole frame to a single NumPy array. Because the columns have
# different dtypes, the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 2, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 2, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 2, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 2, 3, 'train', 'foo']],
      dtype=object)

# describe() shows a quick statistical summary of your data:

In [32]:
# 5x4 frame of standard-normal samples, indexed by `dates`, columns A-D.
# A seeded local generator keeps the values (and every output derived from
# them) identical on each Restart & Run All; an unseeded draw would change
# on every run. Outputs captured from an earlier unseeded run will differ
# until the notebook is re-executed.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((5, 4)),
    index=dates,
    columns=list("ABCD"),
)

In [33]:
# Display the 5x4 frame of random values indexed by `dates`.
df

Unnamed: 0,A,B,C,D
2020-01-01,0.24582,0.365968,0.200425,0.327773
2020-01-02,-1.086055,-0.072459,-0.337405,0.252292
2020-01-03,0.810356,0.149699,1.480366,-1.21615
2020-01-04,1.148903,-0.414203,2.571014,-1.920743
2020-01-05,-0.567064,-2.221353,-0.304009,-0.160939


In [34]:
# Per-column summary: count, mean, std, min, quartiles (25%/50%/75%) and max.
df.describe()

Unnamed: 0,A,B,C,D
count,5.0,5.0,5.0,5.0
mean,0.110392,-0.43847,0.722078,-0.543553
std,0.932366,1.03758,1.268961,0.985902
min,-1.086055,-2.221353,-0.337405,-1.920743
25%,-0.567064,-0.414203,-0.304009,-1.21615
50%,0.24582,-0.072459,0.200425,-0.160939
75%,0.810356,0.149699,1.480366,0.252292
max,1.148903,0.365968,2.571014,0.327773


# Sorting by values:

In [35]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2020-01-05,-0.567064,-2.221353,-0.304009,-0.160939
2020-01-04,1.148903,-0.414203,2.571014,-1.920743
2020-01-02,-1.086055,-0.072459,-0.337405,0.252292
2020-01-03,0.810356,0.149699,1.480366,-1.21615
2020-01-01,0.24582,0.365968,0.200425,0.327773


In [36]:
# Selecting a single column by label yields a Series named after that column.
df["A"]

2020-01-01    0.245820
2020-01-02   -1.086055
2020-01-03    0.810356
2020-01-04    1.148903
2020-01-05   -0.567064
Freq: D, Name: A, dtype: float64

# Using a single column’s values to select data.

In [37]:
# Boolean mask built from column A: keep only the rows where A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2020-01-01,0.24582,0.365968,0.200425,0.327773
2020-01-03,0.810356,0.149699,1.480366,-1.21615
2020-01-04,1.148903,-0.414203,2.571014,-1.920743


In [38]:
# Element-wise mask over the whole frame: the shape is kept and every value
# that is not positive is shown as missing (NaN).
df[df > 0]

Unnamed: 0,A,B,C,D
2020-01-01,0.24582,0.365968,0.200425,0.327773
2020-01-02,,,,0.252292
2020-01-03,0.810356,0.149699,1.480366,
2020-01-04,1.148903,,2.571014,
2020-01-05,,,,


# Operations


In [40]:
# Called without arguments, mean() returns one value per column (A-D).
df.mean()

A    0.110392
B   -0.438470
C    0.722078
D   -0.543553
dtype: float64