# Data Frame Selection

In [5]:
import pandas as pd

# Load data into dataframe
df = pd.read_csv("./data/iris-with-header.tsv", delimiter='\t')
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Get coloumns by column name

In [6]:
df['Species'].head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: Species, dtype: object

In [7]:
df[['SepalLengthCm', 'Species']].head()

Unnamed: 0,SepalLengthCm,Species
0,5.1,Iris-setosa
1,4.9,Iris-setosa
2,4.7,Iris-setosa
3,4.6,Iris-setosa
4,5.0,Iris-setosa


## Get rows by index

In [8]:
df[0:1]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa


### Every `5` rows

In [9]:
df[::5]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
10,5.4,3.7,1.5,0.2,Iris-setosa
15,5.7,4.4,1.5,0.4,Iris-setosa
20,5.4,3.4,1.7,0.2,Iris-setosa
25,5.0,3.0,1.6,0.2,Iris-setosa
30,4.8,3.1,1.6,0.2,Iris-setosa
35,5.0,3.2,1.2,0.2,Iris-setosa
40,5.0,3.5,1.3,0.3,Iris-setosa
45,4.8,3.0,1.4,0.3,Iris-setosa


### Reverse order

In [10]:
df[::-1] 

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
149,5.9,3.0,5.1,1.8,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
145,6.7,3.0,5.2,2.3,Iris-virginica
144,6.7,3.3,5.7,2.5,Iris-virginica
143,6.8,3.2,5.9,2.3,Iris-virginica
142,5.8,2.7,5.1,1.9,Iris-virginica
141,6.9,3.1,5.1,2.3,Iris-virginica
140,6.7,3.1,5.6,2.4,Iris-virginica


## Get elements by `iloc`, `iat`, `ix`, `loc`
[Pandas Documentation](http://pandas.pydata.org/pandas-docs/stable/indexing.html)

`.iloc` Purely **integer-location based** indexing for selection by position.

In [11]:
print(df.iloc[0]) 
print("")
print(df.iloc[0,4])

SepalLengthCm            5.1
SepalWidthCm             3.5
PetalLengthCm            1.4
PetalWidthCm             0.2
Species          Iris-setosa
Name: 0, dtype: object

Iris-setosa


`.loc` is primarily **label based**, but may also be used with a **boolean array**. .loc will raise KeyError when the items are not found.

In [12]:
print(df.loc[0])
print('')
print(df.loc[0, 'Species'])

SepalLengthCm            5.1
SepalWidthCm             3.5
PetalLengthCm            1.4
PetalWidthCm             0.2
Species          Iris-setosa
Name: 0, dtype: object

Iris-setosa


`.ix` supports **mixed integer** and **label** based access.

In [13]:
print(df.ix[0])
print('')
print(df.ix[0,0])
print('')
print(df.ix[0,'Species'])

SepalLengthCm            5.1
SepalWidthCm             3.5
PetalLengthCm            1.4
PetalWidthCm             0.2
Species          Iris-setosa
Name: 0, dtype: object

5.1

Iris-setosa


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


## Get scalar values with `.iat`, `.at`
Similarly to `loc`, `at` provides **label based scalar** lookups, while, `iat` provides **integer based** lookups analogously to `iloc`

In [14]:
# TODO Series example using s.at

In [15]:
# Can be used with dataframe too
print(df.iat[0,0])
print('')
print(df.at[0,'Species'])

5.1

Iris-setosa


## Conditional selection with `< > = ~`, `.where()`

In [16]:
df['SepalLengthCm'] == 5.1

0       True
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17      True
18     False
19      True
20     False
21      True
22     False
23      True
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
120    False
121    False
122    False
123    False
124    False
125    False
126    False
127    False
128    False
129    False
130    False
131    False
132    False
133    False
134    False
135    False
136    False
137    False
138    False
139    False
140    False
141    False
142    False
143    False
144    False
145    False
146    False
147    False
148    False
149    False
Name: SepalLengthCm, Length: 150, dtype: bool

In [17]:
df.loc[df['SepalLengthCm'] == 5.1]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
17,5.1,3.5,1.4,0.3,Iris-setosa
19,5.1,3.8,1.5,0.3,Iris-setosa
21,5.1,3.7,1.5,0.4,Iris-setosa
23,5.1,3.3,1.7,0.5,Iris-setosa
39,5.1,3.4,1.5,0.2,Iris-setosa
44,5.1,3.8,1.9,0.4,Iris-setosa
46,5.1,3.8,1.6,0.2,Iris-setosa
98,5.1,2.5,3.0,1.1,Iris-versicolor


In [18]:
df[df==5.1]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [39]:
np.where(df['SepalLengthCm']==5.1)

(array([ 0, 17, 19, 21, 23, 39, 44, 46, 98]),)

In [20]:
df.where(df[:]['SepalLengthCm']==5.1)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,


In [41]:
df.loc[0, 'SepalLengthCm']

5.1

In [43]:
df.index.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149])

## Select random row

In [21]:
df.index.values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149])

In [22]:
import numpy as np
rand_row_ind = np.random.choice(df.index.values, 3)
rand_row_ind

array([ 98,  42, 101])

In [23]:
df.loc[rand_row_ind]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
98,5.1,2.5,3.0,1.1,Iris-versicolor
42,4.4,3.2,1.3,0.2,Iris-setosa
101,5.8,2.7,5.1,1.9,Iris-virginica


In [24]:
df.iloc[rand_row_ind]

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
98,5.1,2.5,3.0,1.1,Iris-versicolor
42,4.4,3.2,1.3,0.2,Iris-setosa
101,5.8,2.7,5.1,1.9,Iris-virginica


In [25]:
df.ix[rand_row_ind]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
98,5.1,2.5,3.0,1.1,Iris-versicolor
42,4.4,3.2,1.3,0.2,Iris-setosa
101,5.8,2.7,5.1,1.9,Iris-virginica


## Select random column

In [26]:
rand_col_name = np.random.choice(df.columns, 3)
rand_col_name

array(['Species', 'Species', 'SepalLengthCm'], dtype=object)

In [27]:
df[rand_col_name].head()

Unnamed: 0,Species,Species.1,SepalLengthCm
0,Iris-setosa,Iris-setosa,5.1
1,Iris-setosa,Iris-setosa,4.9
2,Iris-setosa,Iris-setosa,4.7
3,Iris-setosa,Iris-setosa,4.6
4,Iris-setosa,Iris-setosa,5.0


In [28]:
df.loc[:, rand_col_name].head()

Unnamed: 0,Species,Species.1,SepalLengthCm
0,Iris-setosa,Iris-setosa,5.1
1,Iris-setosa,Iris-setosa,4.9
2,Iris-setosa,Iris-setosa,4.7
3,Iris-setosa,Iris-setosa,4.6
4,Iris-setosa,Iris-setosa,5.0


In [29]:
df.ix[:, rand_col_name].head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Species,Species.1,SepalLengthCm
0,Iris-setosa,Iris-setosa,5.1
1,Iris-setosa,Iris-setosa,4.9
2,Iris-setosa,Iris-setosa,4.7
3,Iris-setosa,Iris-setosa,4.6
4,Iris-setosa,Iris-setosa,5.0


## Select random rows and columns

In [30]:
df.loc[rand_row_ind, rand_col_name]

Unnamed: 0,Species,Species.1,SepalLengthCm
98,Iris-versicolor,Iris-versicolor,5.1
42,Iris-setosa,Iris-setosa,4.4
101,Iris-virginica,Iris-virginica,5.8


In [31]:
df.ix[rand_row_ind, rand_col_name]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Species,Species.1,SepalLengthCm
98,Iris-versicolor,Iris-versicolor,5.1
42,Iris-setosa,Iris-setosa,4.4
101,Iris-virginica,Iris-virginica,5.8


In [32]:
df.loc[rand_row_ind, rand_col_name]

Unnamed: 0,Species,Species.1,SepalLengthCm
98,Iris-versicolor,Iris-versicolor,5.1
42,Iris-setosa,Iris-setosa,4.4
101,Iris-virginica,Iris-virginica,5.8


## Select random element

In [33]:
for i in range(rand_row_ind.size):
    print(df.loc[rand_row_ind[i], rand_col_name[i]])

Iris-versicolor
Iris-setosa
5.8


## Select random row with `df.sample()`

In [34]:
df.sample(n=2)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
119,6.0,2.2,5.0,1.5,Iris-virginica
142,5.8,2.7,5.1,1.9,Iris-virginica


### Reference:

In [35]:
help(df.sample)

Help on method sample in module pandas.core.generic:

sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None) method of pandas.core.frame.DataFrame instance
    Return a random sample of items from an axis of object.
    
    You can use `random_state` for reproducibility.
    
    Parameters
    ----------
    n : int, optional
        Number of items from axis to return. Cannot be used with `frac`.
        Default = 1 if `frac` = None.
    frac : float, optional
        Fraction of axis items to return. Cannot be used with `n`.
    replace : boolean, optional
        Sample with or without replacement. Default = False.
    weights : str or ndarray-like, optional
        Default 'None' results in equal probability weighting.
        If passed a Series, will align with target object on index. Index
        values in weights not found in sampled object will be ignored and
        index values in sampled object not in weights will be assigned
        weights 

In [36]:
# More conditional examples