In [1]:
import pandas as pd

In [2]:
# Indexing and selecting data

In [3]:
# Slicing with labels

In [4]:
# Attribute access

In [5]:
# You may access an index on a Series or column on a DataFrame directly as an attribute:

In [6]:
sa = pd.Series([1, 2, 3], index=list('abc'))

In [7]:
# Setting with enlargement

In [8]:
# The .loc/[] operations can perform enlargement when setting a non-existent key for that axis.

In [9]:
se = pd.Series([1, 2, 3])
se[5]=6

In [10]:
se

0    1
1    2
2    3
5    6
dtype: int64

In [11]:
# Duplicate data

In [12]:
# A DataFrame can be enlarged on either axis via .loc.

In [13]:
import numpy as np
dfi=pd.DataFrame(np.arange(6).reshape(3,2),columns=['A','B'])
dfi

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5


In [14]:
dfi.loc[:, 'C'] = dfi.loc[:, 'A']
dfi


Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4


In [15]:
# This is like an append operation on the DataFrame.

In [16]:
dfi.loc[3]=5
dfi

Unnamed: 0,A,B,C
0,0,1,0
1,2,3,2
2,4,5,4
3,5,5,5


In [17]:
# Special use of the == operator with list objects

In [18]:
# Comparing a list of values to a column using ==/!= works similarly to in/not in.

In [19]:
df = pd.DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'),
                   'c': np.random.randint(5, size=12),
                   'd': np.random.randint(9, size=12)})
df.query('b==["a","b","c"]')

Unnamed: 0,a,b,c,d
0,a,a,1,0
1,a,a,0,5
2,b,a,2,1
3,b,a,2,4
4,c,b,4,2
5,c,b,0,8
6,d,b,1,8
7,d,b,3,0
8,e,c,4,6
9,e,c,1,8


In [20]:
df.query('a in b')

Unnamed: 0,a,b,c,d
0,a,a,1,0
1,a,a,0,5
2,b,a,2,1
3,b,a,2,4
4,c,b,4,2
5,c,b,0,8


In [21]:
df[df['a'].isin(df['b'])]

Unnamed: 0,a,b,c,d
0,a,a,1,0
1,a,a,0,5
2,b,a,2,1
3,b,a,2,4
4,c,b,4,2
5,c,b,0,8


In [22]:
df.query('a not in b')

Unnamed: 0,a,b,c,d
6,d,b,1,8
7,d,b,3,0
8,e,c,4,6
9,e,c,1,8
10,f,c,1,3
11,f,c,2,3


In [23]:
# You can combine this with other expressions for very succinct queries:

In [24]:
df.query('a in b and c<d')

Unnamed: 0,a,b,c,d
1,a,a,0,5
3,b,a,2,4
5,c,b,0,8


In [25]:
# Special use of the == operator with list objects

In [26]:
df.query('b == ["a", "b", "c"]')

Unnamed: 0,a,b,c,d
0,a,a,1,0
1,a,a,0,5
2,b,a,2,1
3,b,a,2,4
4,c,b,4,2
5,c,b,0,8
6,d,b,1,8
7,d,b,3,0
8,e,c,4,6
9,e,c,1,8


In [27]:
df.query('[1, 2] not in c')

Unnamed: 0,a,b,c,d
1,a,a,0,5
4,c,b,4,2
5,c,b,0,8
7,d,b,3,0
8,e,c,4,6


In [28]:
# itertuples

In [29]:
# Vectorized string methods

In [30]:
# Series is equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are accessed via the Series’s str attribute and generally have names matching the equivalent (scalar) built-in string methods. For example:

In [31]:
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string"
)
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [32]:
# By values
# The Series.sort_values() method is used to sort a Series by its values. The DataFrame.sort_values() method is used to sort a DataFrame by its column or row values. The optional by parameter to DataFrame.sort_values() may used to specify one or more columns to use to determine the sorted order.

In [33]:
df1 = pd.DataFrame(
    {"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}
)

In [34]:
df1.sort_values(by=['one'])

Unnamed: 0,one,two,three
1,1,3,4
2,1,2,3
3,1,4,2
0,2,1,5


In [35]:
df1[["one", "two", "three"]].sort_values(by=["one", "two"])

Unnamed: 0,one,two,three
2,1,2,3
1,1,3,4
3,1,4,2
0,2,1,5


In [36]:
# key will be given the Series of values and should return a Series or array of the same shape with the transformed values. For DataFrame objects, the key is applied per column, so the key should still expect a Series and return a Series, e.g.

In [37]:
# Sorting

In [38]:
# pandas supports three kinds of sorting: sorting by index labels, sorting by column values, and sorting by a combination of both.

In [39]:
# Note that the following also works, but is a bit less obvious / clean:

In [40]:
df.reindex(df.index.difference(["a", "d"]))

Unnamed: 0,a,b,c,d
0,a,a,1,0
1,a,a,0,5
2,b,a,2,1
3,b,a,2,4
4,c,b,4,2
5,c,b,0,8
6,d,b,1,8
7,d,b,3,0
8,e,c,4,6
9,e,c,1,8


In [41]:
# A method closely related to reindex is the drop() function. It removes a set of labels from an axis:

In [42]:
df.drop(["a", "d"], axis=0)

KeyError: "['a', 'd'] not found in axis"

In [None]:
# The query() Method

In [36]:
# DataFrame objects have a query() method that allows selection using an expression.

# # You can get the value of the frame where column b has values between the values of columns a and c. For example:

In [34]:
n=10
df=pd.DataFrame(np.random.randn(n,3),columns=list('abc'))
df

Unnamed: 0,a,b,c
0,-1.305369,1.040128,-0.993968
1,-1.834413,-0.361361,-1.18742
2,0.554001,-0.175275,0.445516
3,0.913711,1.159387,0.03844
4,-0.998582,-0.641036,-0.137943
5,0.535214,0.412659,-0.369042
6,-0.015504,1.35945,-0.41547
7,-1.754792,0.723574,0.54325
8,-0.442076,-0.887775,-0.51139
9,-1.465086,-0.525007,1.973305


In [None]:
# Boolean operators

In [None]:
# You can negate boolean expressions with the word not or the ~ operator.

In [38]:
df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))

df['bools'] = np.random.rand(len(df)) > 0.5
df.query('~bools')

Unnamed: 0,a,b,c,bools
0,0.551654,0.454448,0.013613,False
1,0.935603,0.30867,0.120213,False
3,0.825996,0.670635,0.741584,False
4,0.447051,0.780542,0.208835,False
6,0.939327,0.868281,0.072479,False
7,0.498668,0.285449,0.606136,False
8,0.094523,0.944123,0.667439,False
9,0.342792,0.061578,0.736926,False


In [None]:
# Mask

In [None]:
# mask() is the inverse boolean operation of where.

In [None]:
s.mask(s >= 0)

In [None]:
df.mask(df >= 0)

In [None]:
# The in and not in operators

In [None]:
# query() also supports special use of Python’s in and not in comparison operators, providing a succinct syntax for calling the isin method of a Series or DataFrame.

In [45]:
df=pd.DataFrame({'a':list('aabbccddeeff'),'b':np.random.randint(5,size=12)})
df

Unnamed: 0,a,b
0,a,0
1,a,4
2,b,2
3,b,2
4,c,2
5,c,4
6,d,1
7,d,3
8,e,2
9,e,3


In [46]:
df.query('a in b')

Unnamed: 0,a,b
