In [1]:
import pandas as pd

In [2]:
# IPython line magic: list the files available in the Datasets directory
%ls Datasets

census.data  direct_marketing.csv  servo.data  tutorial.csv


In [3]:
# Read the direct-marketing CSV into a DataFrame and preview its first rows
data = pd.read_csv("Datasets/direct_marketing.csv")
data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,4
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,11
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,1
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,2
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,4


<h3> Column Indexing </h3>

<p> A dataframe is essentially one or more series which have been 'stitched' together into a new data type. Pandas exposes many equivalent methods for slicing out those underlying series. You can slice by location, the way you would normally index into a regular Python list. You can slice by label, the way you would normally index into a Python dictionary. And like NumPy arrays, you can also index by boolean masks: </p>

In [4]:
# Each expression below produces the same Series object: the 'recency' column.
data.recency              # attribute access
data['recency']           # label lookup, like a dictionary key
data.loc[:, 'recency']    # label-based indexer: all rows, column 'recency'
# NOTE: the mixed label/position indexer `.ix` was deprecated in pandas 0.20
# and removed in pandas 1.0, so `data.ix[:, 0]` raises AttributeError on
# current versions. Use `.loc` for labels and `.iloc` for positions instead.
data.iloc[:, 0]           # position-based indexer: all rows, first column

0        10
1         6
2         7
3         9
4         2
5         6
6         9
7         9
8         9
9        10
10        7
11        1
12        5
13        2
14        4
15        3
16        5
17        9
18       11
19        5
20        9
21       11
22        2
23        2
24        4
25        6
26       12
27        6
28        7
29        2
         ..
63970     4
63971     5
63972     3
63973     8
63974     8
63975    10
63976     1
63977     8
63978    10
63979    10
63980     3
63981     4
63982     5
63983     2
63984     2
63985     9
63986     9
63987     1
63988     6
63989    10
63990     6
63991     1
63992     1
63993     4
63994     7
63995    10
63996     5
63997     6
63998     1
63999     1
Name: recency, dtype: int64

In [5]:
# Produces a dataframe object: passing a *list* of columns (note the inner
# brackets) keeps the result two-dimensional instead of returning a Series.
data[['recency']]            # list of labels
data.loc[:, ['recency']]     # label-based indexer with a list of labels
data.iloc[:, [0]]            # position-based indexer with a list of positions

Unnamed: 0,recency
0,10
1,6
2,7
3,9
4,2
5,6
6,9
7,9
8,9
9,10


<h3> Row Indexing </h3>

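<p> You can use either the .loc[] (label-based) or .iloc[] (position-based) indexers to do selection by row, noting that the expected order is [row_indexer, column_indexer]. The older .ix[] indexer mixed both behaviours; it was deprecated in pandas 0.20 and removed in pandas 1.0, so it should no longer be used:</p>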

In [6]:
# Both expressions select the first two rows (positions 0 and 1) with every column
data[0:2]            # plain slice on the dataframe slices rows
data.iloc[0:2, :]    # explicit [row_indexer, column_indexer] form

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,4
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,11


<h3> Boolean Indexing </h3>

<p> Your dataframes and series can also be indexed with a boolean mask—a dataframe or series with the same dimensions as the one you are selecting from, but with every value set to either True or False. You can create a new boolean series either by manually specifying the values, or by using a conditional:</p>

In [7]:
# Element-wise comparison: yields a boolean Series that is True where recency is below 7
data.recency < 7

0        False
1         True
2        False
3        False
4         True
5         True
6        False
7        False
8        False
9        False
10       False
11        True
12        True
13        True
14        True
15        True
16        True
17       False
18       False
19        True
20       False
21       False
22        True
23        True
24        True
25        True
26       False
27        True
28       False
29        True
         ...  
63970     True
63971     True
63972     True
63973    False
63974    False
63975    False
63976     True
63977    False
63978    False
63979    False
63980     True
63981     True
63982     True
63983     True
63984     True
63985    False
63986    False
63987     True
63988     True
63989    False
63990     True
63991     True
63992     True
63993     True
63994    False
63995    False
63996     True
63997     True
63998     True
63999     True
Name: recency, dtype: bool

In [8]:
# Use the boolean Series as a row mask: only the rows where the condition
# (recency < 7) is True are kept.
data[data.recency < 7]

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0,11
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,4
5,6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0.0,1
11,1,3) $200 - $350,211.45,0,1,Urban,1,Phone,Womens E-Mail,0,0,0.0,1
12,5,5) $500 - $750,642.90,0,1,Surburban,1,Multichannel,Womens E-Mail,0,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,3
14,4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0.0,5
15,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,6
16,5,1) $0 - $100,29.99,1,0,Surburban,0,Phone,Mens E-Mail,0,0,0.0,2
19,5,"6) $750 - $1,000",828.42,1,0,Surburban,1,Multichannel,Mens E-Mail,0,0,0.0,2


In [9]:
# To combine multiple filters, build each boolean mask separately and join
# them with & (element-wise AND); a row is kept only when both masks are True.
low_recency = data.recency < 7
not_newbie = data.newbie == 0
data[low_recency & not_newbie]

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0.0,4
5,6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0.0,1
13,2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0.0,3
16,5,1) $0 - $100,29.99,1,0,Surburban,0,Phone,Mens E-Mail,0,0,0.0,2
22,2,2) $100 - $200,118.40,1,0,Surburban,0,Web,Mens E-Mail,1,0,0.0,2
24,4,1) $0 - $100,78.24,1,0,Surburban,0,Web,No E-Mail,0,0,0.0,6
27,6,2) $100 - $200,162.98,0,1,Surburban,0,Web,Mens E-Mail,0,0,0.0,3
29,2,3) $200 - $350,203.35,1,0,Rural,0,Web,No E-Mail,0,0,0.0,6
30,2,3) $200 - $350,237.53,0,1,Surburban,0,Phone,Womens E-Mail,0,0,0.0,1
32,6,2) $100 - $200,128.01,0,1,Urban,0,Web,Mens E-Mail,0,0,0.0,3


In [10]:
# Writing to a slice: assigning a scalar through a boolean row mask overwrites
# every column of each selected row. This mutates `data` in place, so cells
# that read `data` afterwards see the modified values.
low_recency_rows = data.recency < 7
data[low_recency_rows] = -100
data.head()

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,4
1,-100,-100,-100.0,-100,-100,-100,-100,-100,-100,-100,-100,-100.0,-100
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,1
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,2
4,-100,-100,-100.0,-100,-100,-100,-100,-100,-100,-100,-100,-100.0,-100


In [11]:
# Show the first six rows: the rows whose recency was below 7 now hold -100 in every column
data[0:6]

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,DM_category
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0,4
1,-100,-100,-100.0,-100,-100,-100,-100,-100,-100,-100,-100,-100.0,-100
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0,1
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0,2
4,-100,-100,-100.0,-100,-100,-100,-100,-100,-100,-100,-100,-100.0,-100
5,-100,-100,-100.0,-100,-100,-100,-100,-100,-100,-100,-100,-100.0,-100


In [12]:
# IPython help lookup: a trailing ? displays the documentation for data.loc
data.loc?

<h2> Lecture: Feature Representation </h2>