# How to Sample a Dataframe in Python Pandas

Author: Angelica Lo Duca

Article from [towardsdatascience](https://towardsdatascience.com/how-to-sample-a-dataframe-in-python-pandas-d18a3187139b).

> Note: In this notebook, I am studying the article mentioned above. Some changes may have been made to the code during its implementation.

# Libraries

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

# Load dataset

In [4]:
# Fetch the iris bundle; the measurements and their names are read from it below.
data = load_iris()

# Wrap the measurement matrix in a DataFrame, labelling each column with
# the corresponding feature name supplied by the bundle.
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Random sampling

## Exact number

In [5]:
# Draw exactly 100 rows at random (without replacement, the default).
# A fixed random_state makes the draw repeatable, so the notebook
# produces the same subset on every Restart & Run All.
subset = df.sample(n=100, random_state=42)
subset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
19,5.1,3.8,1.5,0.3
125,7.2,3.2,6.0,1.8
138,6.0,3.0,4.8,1.8
139,6.9,3.1,5.4,2.1
124,6.7,3.3,5.7,2.1
...,...,...,...,...
77,6.7,3.0,5.0,1.7
47,4.6,3.2,1.4,0.2
96,5.7,2.9,4.2,1.3
54,6.5,2.8,4.6,1.5


In [6]:
# Show (rows, columns) of the sample to confirm its size.
subset.shape

(100, 4)

## Percentage

In [8]:
# Draw a random 50% of the rows instead of a fixed count.
# A fixed random_state makes the draw repeatable, so the notebook
# produces the same subset on every Restart & Run All.
subset = df.sample(frac=0.5, random_state=42)
subset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
55,5.7,2.8,4.5,1.3
133,6.3,2.8,5.1,1.5
15,5.7,4.4,1.5,0.4
52,6.9,3.1,4.9,1.5
17,5.1,3.5,1.4,0.3
...,...,...,...,...
47,4.6,3.2,1.4,0.2
134,6.1,2.6,5.6,1.4
3,4.6,3.1,1.5,0.2
120,6.9,3.2,5.7,2.3


# Sampling with a condition

In [9]:
# Boolean mask: True for every row whose sepal width is strictly below 3.
sepal_width = df['sepal width (cm)']
condition = sepal_width.lt(3)
condition

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146     True
147    False
148    False
149    False
Name: sepal width (cm), Length: 150, dtype: bool

In [10]:
# Index labels of the rows that satisfy the condition.
# `condition` is already a boolean Series, so it can be used directly as
# the mask; comparing it against True is redundant.
true_index = condition[condition].index
true_index

Int64Index([  8,  41,  53,  54,  55,  57,  58,  59,  60,  62,  63,  64,  67,
             68,  69,  71,  72,  73,  74,  76,  78,  79,  80,  81,  82,  83,
             87,  89,  90,  92,  93,  94,  96,  97,  98,  99, 101, 103, 106,
            107, 108, 111, 113, 114, 118, 119, 121, 122, 123, 126, 128, 130,
            132, 133, 134, 142, 146],
           dtype='int64')

In [11]:
# Keep only the rows matching the condition, then draw 10 of them at random.
# A fixed random_state makes the draw repeatable, so the notebook
# produces the same subset on every Restart & Run All.
subset = df.loc[condition].sample(n=10, random_state=42)
subset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
60,5.0,2.0,3.5,1.0
87,6.3,2.3,4.4,1.3
98,5.1,2.5,3.0,1.1
69,5.6,2.5,3.9,1.1
90,5.5,2.6,4.4,1.2
82,5.8,2.7,3.9,1.2
118,7.7,2.6,6.9,2.3
133,6.3,2.8,5.1,1.5
68,6.2,2.2,4.5,1.5
8,4.4,2.9,1.4,0.2


# Sampling at a constant rate

In [12]:
# Sampling step: keep one row out of every `rate` rows.
rate = 10

# Positional slice with a step: takes rows 0, rate, 2*rate, ... in order.
subset = df.iloc[::rate]
subset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
10,5.4,3.7,1.5,0.2
20,5.4,3.4,1.7,0.2
30,4.8,3.1,1.6,0.2
40,5.0,3.5,1.3,0.3
50,7.0,3.2,4.7,1.4
60,5.0,2.0,3.5,1.0
70,5.9,3.2,4.8,1.8
80,5.5,2.4,3.8,1.1
90,5.5,2.6,4.4,1.2


In [13]:
# Show (rows, columns) of the sample; taking every 10th of 150 rows gives 15.
subset.shape

(15, 4)

# Getting the rest of the dataset

## First solution

In [14]:
# Remove every row whose index label appears in the sample; what is left
# is the part of the dataset that was not sampled. `drop` returns a new
# frame, so `df` itself is unchanged.
sampled_labels = subset.index
remaining = df.drop(index=sampled_labels)
remaining

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


## Second solution

In [15]:
# Flag the rows whose label belongs to the sample, then invert the flag
# with `~` so that only the rows outside the sample are selected.
remaining = df.loc[~df.index.isin(subset.index)]
remaining

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3
