In [1]:
import numpy as np
import pandas as pd

There are three fundamental data structures in pandas:

- Series: 1D labeled array data

- DataFrame: 2D tabular data with rows and columns.

- Index: A sequence of row/column labels.

## Series ##

1-dimensional array-like object. It contains:

- A sequence of values of the same type.
- A sequence of data labels called the index.


In [2]:
s = pd.Series([1, 10, 100])
s

0      1
1     10
2    100
dtype: int64

In [3]:
s.index

RangeIndex(start=0, stop=3, step=1)

In [4]:
#return values as numpy array
s.values

array([  1,  10, 100])

In [5]:
s[1]

10

In [6]:
s[1:2]

1    10
dtype: int64

In [7]:
s = pd.Series([1, 10, 100], index = ["first", "second", "third"])
s

first       1
second     10
third     100
dtype: int64

In [8]:
s.index

Index(['first', 'second', 'third'], dtype='object')

In [9]:
# Positional access on a label-indexed Series: plain s[1] relies on a deprecated integer fallback, so use .iloc
s.iloc[1]

10

In [10]:
s['second']

10

In [11]:
s[['first','third']]

first      1
third    100
dtype: int64

In [12]:
s[['second']]

second    10
dtype: int64

In [13]:
s["second":"third"]

second     10
third     100
dtype: int64

In [14]:
s[1:2]

second    10
dtype: int64

In [15]:
s>5

first     False
second     True
third      True
dtype: bool

In [16]:
s[[True, False, True]]

first      1
third    100
dtype: int64

In [17]:
s[s>5]

second     10
third     100
dtype: int64

In [18]:
s

first       1
second     10
third     100
dtype: int64

In [19]:
s.apply(lambda x: x <11)

first      True
second     True
third     False
dtype: bool

In [20]:
def f(x):
    """Tell whether ``x`` is strictly below 11."""
    is_small = x < 11
    return is_small
s.apply(f)

first      True
second     True
third     False
dtype: bool

In [21]:
# Apply the function to each element of the Series, then keep only the elements for which it returned True
s[s.apply(lambda x: x <11)]



first      1
second    10
dtype: int64

## DataFrame ##

Tables are one of the most common ways data scientists organize data. This is in large part due to the simplicity and flexibility of tables. Tables allow us to represent each observation, or instance of data from an individual, as its own row. We can record each observation’s distinct characteristics, or features, in separate columns.

The pandas DataFrame class is used to represent 2D tabular data with rows and columns.

You can also think of a DataFrame as a collection of Series that all share the same Index.

There are many ways to create a DataFrame. Some of the most popular approaches:

- Using a list and column name(s).
- From a dictionary.
- From a Series.
- From a CSV file

### Creating a Table (DataFrame) from Scratch ###

In [22]:
streets = ['Bancroft', 'Durant', 'Channing', 'Haste']  # creates a list of street names

In [23]:
streets

['Bancroft', 'Durant', 'Channing', 'Haste']

In [24]:
pd.DataFrame(streets, columns=['Street name']) #Creates a dataframe and adds a column labeled 'Street name'

Unnamed: 0,Street name
0,Bancroft
1,Durant
2,Channing
3,Haste


In [25]:
streets_with_numbers = [['Bancroft',1], ['Durant',2], ['Channing',3], ['Haste',4]]  # creates a list of [street name, number] pairs (one inner list per row)

In [26]:
pd.DataFrame(streets_with_numbers, columns=['Street name','Blocks away from campus']) # Creates a DataFrame with two columns labeled 'Street name' and 'Blocks away from campus'

Unnamed: 0,Street name,Blocks away from campus
0,Bancroft,1
1,Durant,2
2,Channing,3
3,Haste,4


In [27]:
southside = pd.DataFrame(streets, columns=['Street name'])

In [28]:
southside

Unnamed: 0,Street name
0,Bancroft
1,Durant
2,Channing
3,Haste


In [29]:
np_array=np.arange(1,21,2).reshape(5,2)
np_array

array([[ 1,  3],
       [ 5,  7],
       [ 9, 11],
       [13, 15],
       [17, 19]])

In [30]:
pd.DataFrame(np_array, columns=["first","second"])

Unnamed: 0,first,second
0,1,3
1,5,7
2,9,11
3,13,15
4,17,19


In [31]:
# Creates a new DataFrame with two columns from a dictionary (keys become the column labels)
pd.DataFrame({'Street name':streets,'Blocks away from campus':np.arange(4)})

Unnamed: 0,Street name,Blocks away from campus
0,Bancroft,0
1,Durant,1
2,Channing,2
3,Haste,3


In [32]:
southside

Unnamed: 0,Street name
0,Bancroft
1,Durant
2,Channing
3,Haste


In [33]:
# Adds a new column
southside['Blocks away from campus']=np.arange(4)  

In [34]:
southside

Unnamed: 0,Street name,Blocks away from campus
0,Bancroft,0
1,Durant,1
2,Channing,2
3,Haste,3


In [35]:
s1 = pd.Series(['Bancroft', 'Durant', 'Channing', 'Haste'], index = ["first", "second", "third", "fourth"])
s1

first     Bancroft
second      Durant
third     Channing
fourth       Haste
dtype: object

In [36]:
# creates a new DataFrame  from a Series
pd.DataFrame(s1, columns=["test"])

Unnamed: 0,test
first,Bancroft
second,Durant
third,Channing
fourth,Haste


In [37]:
s2 = pd.Series(['A', 'B', 'C', 'D'], index = ["first1", "second", "third", "fourth"])
s2

first1    A
second    B
third     C
fourth    D
dtype: object

In [38]:
pd.DataFrame({"first-column": s1, "second-column": s2})

Unnamed: 0,first-column,second-column
first,Bancroft,
first1,,A
fourth,Haste,D
second,Durant,B
third,Channing,C


### Creating a Table (DataFrame) from a File  ##

In [39]:
# Each row represents one ice-cream cone
cones = pd.read_csv('cones.csv')
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [40]:
# first n rows
cones.head(4)

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25


In [41]:
# last n rows
cones.tail(3)

Unnamed: 0,Flavor,Color,Price
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [42]:
cones.index

RangeIndex(start=0, stop=6, step=1)

In [43]:
cones.columns

Index(['Flavor', 'Color', 'Price'], dtype='object')

In [44]:
cones.values

array([['strawberry', 'pink', 3.55],
       ['chocolate', 'light brown', 4.75],
       ['chocolate', 'dark brown', 5.25],
       ['strawberry', 'pink', 5.25],
       ['chocolate', 'dark brown', 5.25],
       ['bubblegum', 'pink', 4.75]], dtype=object)

In [45]:
cones.shape

(6, 3)

In [46]:
cones.size

18

In [47]:
cones.dtypes

Flavor     object
Color      object
Price     float64
dtype: object

### Index

Index doesn’t have to be an integer, nor does it have to be unique

In [48]:
pd.read_csv('cones.csv')

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [49]:
cones = pd.read_csv('cones.csv', index_col = "Flavor")
cones

Unnamed: 0_level_0,Color,Price
Flavor,Unnamed: 1_level_1,Unnamed: 2_level_1
strawberry,pink,3.55
chocolate,light brown,4.75
chocolate,dark brown,5.25
strawberry,pink,5.25
chocolate,dark brown,5.25
bubblegum,pink,4.75


In [50]:
cones.index

Index(['strawberry', 'chocolate', 'chocolate', 'strawberry', 'chocolate',
       'bubblegum'],
      dtype='object', name='Flavor')

In [51]:
# Move the 'Flavor' index back into a regular column; rebinding the name avoids mutating the frame in place
cones = cones.reset_index()

In [52]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [53]:
cones.index

RangeIndex(start=0, stop=6, step=1)

In [54]:
cones=cones.set_index("Color")
cones

Unnamed: 0_level_0,Flavor,Price
Color,Unnamed: 1_level_1,Unnamed: 2_level_1
pink,strawberry,3.55
light brown,chocolate,4.75
dark brown,chocolate,5.25
pink,strawberry,5.25
dark brown,chocolate,5.25
pink,bubblegum,4.75


In [55]:
cones.index

Index(['pink', 'light brown', 'dark brown', 'pink', 'dark brown', 'pink'], dtype='object', name='Color')

In [56]:
# Move the 'Color' index back into a regular column; rebinding the name avoids mutating the frame in place
cones = cones.reset_index()
cones

Unnamed: 0,Color,Flavor,Price
0,pink,strawberry,3.55
1,light brown,chocolate,4.75
2,dark brown,chocolate,5.25
3,pink,strawberry,5.25
4,dark brown,chocolate,5.25
5,pink,bubblegum,4.75


In [57]:
cones.index=np.arange(10,70,10)
cones

Unnamed: 0,Color,Flavor,Price
10,pink,strawberry,3.55
20,light brown,chocolate,4.75
30,dark brown,chocolate,5.25
40,pink,strawberry,5.25
50,dark brown,chocolate,5.25
60,pink,bubblegum,4.75


###  Extracting a subset of rows and columns in DataFrame

In [58]:
cones = pd.read_csv('cones.csv')
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


### -Label-based : .loc 

Specify the labels (row indices and column labels ) of rows and columns. The row labels are the first argument to the .loc function; the column labels are the second.

Arguments to .loc can be:

- A single value.
- A slice.
- A list.

In [59]:
cones.loc[1,'Flavor' ]

'chocolate'

Keep in mind that passing in just one argument as a single value will produce a Series.

In [60]:
cones.loc[1]

Flavor      chocolate
Color     light brown
Price            4.75
Name: 1, dtype: object

In [61]:
cones.loc[:,'Flavor' ]

0    strawberry
1     chocolate
2     chocolate
3    strawberry
4     chocolate
5     bubblegum
Name: Flavor, dtype: object

In [62]:
cones.loc[1:4,'Flavor':'Color' ]

Unnamed: 0,Flavor,Color
1,chocolate,light brown
2,chocolate,dark brown
3,strawberry,pink
4,chocolate,dark brown


There are a couple of things we should note. First, pandas allows us to slice with string labels (in our example, the column labels). Second, <b>slicing with .loc is inclusive</b>: notice how the resulting DataFrame includes every row and column between and including the slice labels we specified. Remember that <b>ordinary Python slicing is exclusive: the right end of a slice is not included</b>.

In [63]:
cones.loc[1:4]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25


In [64]:
cones.loc[1:4,: ]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25


In [65]:
cones.loc[:,'Flavor':'Color' ]

Unnamed: 0,Flavor,Color
0,strawberry,pink
1,chocolate,light brown
2,chocolate,dark brown
3,strawberry,pink
4,chocolate,dark brown
5,bubblegum,pink


In [66]:
cones.loc[[1,3,5],['Flavor','Price'] ]

Unnamed: 0,Flavor,Price
1,chocolate,4.75
3,strawberry,5.25
5,bubblegum,4.75


In [67]:
cones.loc[1:5,['Flavor','Price'] ]

Unnamed: 0,Flavor,Price
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegum,4.75


In [68]:
cones.loc[1,'Flavor' ]

'chocolate'

In [69]:
cones.loc[[1],'Flavor' ]

1    chocolate
Name: Flavor, dtype: object

In [70]:
cones.loc[1 ,['Flavor']]

Flavor    chocolate
Name: 1, dtype: object

In [71]:
cones.loc[[1],['Flavor'] ]

Unnamed: 0,Flavor
1,chocolate


In [72]:
cones.loc[:,'Flavor' ]

0    strawberry
1     chocolate
2     chocolate
3    strawberry
4     chocolate
5     bubblegum
Name: Flavor, dtype: object

In [73]:
cones.loc[:,['Flavor']]

Unnamed: 0,Flavor
0,strawberry
1,chocolate
2,chocolate
3,strawberry
4,chocolate
5,bubblegum


### -Position-based : .iloc 

Slicing with .iloc works similarly to .loc. However, .iloc uses the positions of rows and columns rather than the labels . The arguments to the .iloc function also behave similarly — single values, lists, slices, and any combination of these are permitted.

In [74]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [75]:
cones.loc[1,'Flavor' ]

'chocolate'

In [76]:
cones.iloc[1,0]

'chocolate'

In [77]:
#cones.loc[:,'Flavor' ]
cones.iloc[:, 0]

0    strawberry
1     chocolate
2     chocolate
3    strawberry
4     chocolate
5     bubblegum
Name: Flavor, dtype: object

Notice how the first argument to both .loc and .iloc is the same in these examples. This is because each row's integer label happens to equal its position: for example, the row labeled 0 is in the 0th position (equivalently, the first row) of the DataFrame.

In [78]:
#cones.loc[1:4,'Flavor':'Color' ]
cones.iloc[1:5,0:2 ]

Unnamed: 0,Flavor,Color
1,chocolate,light brown
2,chocolate,dark brown
3,strawberry,pink
4,chocolate,dark brown


<b> Slicing is no longer inclusive in .iloc — it’s exclusive</b> . In other words, the right end of a slice is not included when using .iloc. This is one of the subtleties of pandas syntax; you will get used to it with practice.

In [79]:
#cones.loc[[1,3,5],['Flavor','Price'] ]
cones.iloc[[1,3,5],[0,2] ]

Unnamed: 0,Flavor,Price
1,chocolate,4.75
3,strawberry,5.25
5,bubblegum,4.75


### -Context-based : indexing with [] 

The <b> [] </b> selection operator is confusing, yet the most commonly used. It only takes a single argument, which may be one of the following:

- A single-column label.
- A list of column labels.
- A slice of row numbers.



So,  <b> [] </b> is context-dependent. 

In [80]:
cones['Flavor']

0    strawberry
1     chocolate
2     chocolate
3    strawberry
4     chocolate
5     bubblegum
Name: Flavor, dtype: object

In [81]:
cones[['Flavor']]

Unnamed: 0,Flavor
0,strawberry
1,chocolate
2,chocolate
3,strawberry
4,chocolate
5,bubblegum


In [82]:
cones[['Flavor', 'Price']]

Unnamed: 0,Flavor,Price
0,strawberry,3.55
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegum,4.75


In [83]:
cones[1:4]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25


### Conditional Selection

Select a subset of rows in a DataFrame that satisfy some specified condition.

Another possible input of the .loc and [] methods:
- a boolean array, which is simply an array or Series where each element is either True or False. This boolean array must have a length equal to the number of rows in the DataFrame. 

It will return all rows that correspond to a value of True in the array, just as boolean indexing does for Series and NumPy arrays.

In [84]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [85]:
cones[[True, False,True,False, True,False]]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [86]:
cones.loc[[True, False,True,False, True,False]]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [87]:
cones.loc[[True, False,True,False, True,False],['Flavor']]

Unnamed: 0,Flavor
0,strawberry
2,chocolate
4,chocolate


In [88]:
cones[cones['Flavor'] == 'chocolate']

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [89]:
selection=cones['Flavor'] == 'chocolate'
selection

0    False
1     True
2     True
3    False
4     True
5    False
Name: Flavor, dtype: bool

In [90]:
cones[selection]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [91]:
cones.loc[selection]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [92]:
cones[cones['Flavor'] == 'chocolate']

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


Boolean conditions can be combined using various bitwise operators, allowing us to filter results by multiple conditions. <b> p and q are boolean arrays or Series</b>.

- ~p  :	Returns negation of p
- p | q	: p OR q
- p & q	: p AND q
- p ^ q	: p XOR q (exclusive or)

In [93]:
cones[~(cones['Flavor'] == 'chocolate')]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
3,strawberry,pink,5.25
5,bubblegum,pink,4.75


In [94]:
cones[(cones['Flavor'] == 'chocolate') | (cones['Price'] < 5.0) ]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [95]:
cones[(cones['Flavor'] == 'chocolate') & (cones['Price'] < 5.0) ]

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75


In [96]:
cones[(cones['Flavor'] == 'chocolate') ^ (cones['Price'] < 5.0) ]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [97]:
cones[[True,False,True,False,True,False]]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25


In [98]:
(cones['Price'] < 5.0) 

0     True
1     True
2    False
3    False
4    False
5     True
Name: Price, dtype: bool

In [99]:
(cones['Flavor'] == 'chocolate')


0    False
1     True
2     True
3    False
4     True
5    False
Name: Flavor, dtype: bool

In [100]:
(cones['Flavor'] == 'chocolate') ^ (cones['Price'] < 5.0)

0     True
1    False
2     True
3    False
4     True
5     True
dtype: bool

In [101]:
cones[cones['Color'].str.startswith("p")]

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
3,strawberry,pink,5.25
5,bubblegum,pink,4.75


### Adding, Removing, and Modifying Columns

In [102]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [103]:
#add new column. Changes the original dataframe
cones['New Column']= np.arange(1,12,2)

In [104]:
cones

Unnamed: 0,Flavor,Color,Price,New Column
0,strawberry,pink,3.55,1
1,chocolate,light brown,4.75,3
2,chocolate,dark brown,5.25,5
3,strawberry,pink,5.25,7
4,chocolate,dark brown,5.25,9
5,bubblegum,pink,4.75,11


In [105]:
# Different way to add a new column. Original dataframe does not change
cones.assign(new_clm=["A", "B","C", "D", "E", "F"])

Unnamed: 0,Flavor,Color,Price,New Column,new_clm
0,strawberry,pink,3.55,1,A
1,chocolate,light brown,4.75,3,B
2,chocolate,dark brown,5.25,5,C
3,strawberry,pink,5.25,7,D
4,chocolate,dark brown,5.25,9,E
5,bubblegum,pink,4.75,11,F


In [106]:
cones

Unnamed: 0,Flavor,Color,Price,New Column
0,strawberry,pink,3.55,1
1,chocolate,light brown,4.75,3
2,chocolate,dark brown,5.25,5
3,strawberry,pink,5.25,7
4,chocolate,dark brown,5.25,9
5,bubblegum,pink,4.75,11


In [107]:
#re-assign existing column
cones['New Column']= cones['New Column'] +1

In [108]:
cones

Unnamed: 0,Flavor,Color,Price,New Column
0,strawberry,pink,3.55,2
1,chocolate,light brown,4.75,4
2,chocolate,dark brown,5.25,6
3,strawberry,pink,5.25,8
4,chocolate,dark brown,5.25,10
5,bubblegum,pink,4.75,12


In [109]:
# Rename a column. Note that "Odd_Numbers" is only a label: after the +1 re-assignment above, the values in this column are even.
cones=cones.rename(columns={"New Column":"Odd_Numbers"})
cones

Unnamed: 0,Flavor,Color,Price,Odd_Numbers
0,strawberry,pink,3.55,2
1,chocolate,light brown,4.75,4
2,chocolate,dark brown,5.25,6
3,strawberry,pink,5.25,8
4,chocolate,dark brown,5.25,10
5,bubblegum,pink,4.75,12


<b>.drop()</b> method removes a column or row of a DataFrame. Use the axis parameter to specify whether a column or row should be dropped. Unless otherwise specified, pandas will assume that we are dropping a row by default.

In [110]:
cones.drop('Odd_Numbers', axis=1)
# or cones.drop('Odd_Numbers', axis="columns")

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [111]:
cones

Unnamed: 0,Flavor,Color,Price,Odd_Numbers
0,strawberry,pink,3.55,2
1,chocolate,light brown,4.75,4
2,chocolate,dark brown,5.25,6
3,strawberry,pink,5.25,8
4,chocolate,dark brown,5.25,10
5,bubblegum,pink,4.75,12


In [112]:
cones= cones.drop('Odd_Numbers', axis=1)
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [113]:
no_color = cones.drop('Color', axis="columns")

In [114]:
no_color

Unnamed: 0,Flavor,Price
0,strawberry,3.55
1,chocolate,4.75
2,chocolate,5.25
3,strawberry,5.25
4,chocolate,5.25
5,bubblegum,4.75


In [115]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [116]:
cones.drop(['Flavor','Color'], axis=1)

Unnamed: 0,Price
0,3.55
1,4.75
2,5.25
3,5.25
4,5.25
5,4.75


In [117]:
cones.drop([2,4], axis=0)
# or cones.drop([2,4], axis="rows")

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
3,strawberry,pink,5.25
5,bubblegum,pink,4.75


In [118]:
cones.drop(3, axis='rows')

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


### Utility function examples

In [119]:
# Just about any NumPy function can be applied to pandas DataFrames and Series.
np.mean(cones['Price'])

4.8

In [120]:
cones.shape

(6, 3)

In [121]:
cones.size

18

In [122]:
#computes many statistics: minimum value, maximum value, mean value, etc..
cones.describe()

Unnamed: 0,Price
count,6.0
mean,4.8
std,0.659545
min,3.55
25%,4.75
50%,5.0
75%,5.25
max,5.25


In [123]:
cones["new"]=np.arange(1,7)

In [124]:
cones.describe()

Unnamed: 0,Price,new
count,6.0,6.0
mean,4.8,3.5
std,0.659545,1.870829
min,3.55,1.0
25%,4.75,2.25
50%,5.0,3.5
75%,5.25,4.75
max,5.25,6.0


In [125]:
cones=cones.drop("new",axis=1)
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [126]:
cones['Flavor'].describe()

count             6
unique            3
top       chocolate
freq              3
Name: Flavor, dtype: object

In [127]:
cones.drop('Price', axis=1).describe()

Unnamed: 0,Flavor,Color
count,6,6
unique,3,3
top,chocolate,pink
freq,3,3


<b> .sample() </b>

Random processes are at the heart of many data science techniques. .sample() lets us quickly select random entries (a row if called from a DataFrame, or a value if called from a Series).

- By default, .sample() selects entries without replacement. Pass in the argument replace=True to sample with replacement.

In [128]:
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [129]:
# Draw one random row; the row shown differs from run to run (pass random_state=<int> to make the draw repeatable)
cones.sample()

Unnamed: 0,Flavor,Color,Price
1,chocolate,light brown,4.75


In [130]:
# Sample n=3 random rows
cones.sample(3)

Unnamed: 0,Flavor,Color,Price
4,chocolate,dark brown,5.25
3,strawberry,pink,5.25
2,chocolate,dark brown,5.25


In [131]:
# Sample 6 rows with replacement, so the same row can be drawn more than once
cones.sample(6,replace=True)

Unnamed: 0,Flavor,Color,Price
3,strawberry,pink,5.25
2,chocolate,dark brown,5.25
1,chocolate,light brown,4.75
1,chocolate,light brown,4.75
4,chocolate,dark brown,5.25
2,chocolate,dark brown,5.25


In [132]:
# Returns the number of occurrence of each unique value in a Series
cones["Flavor"].value_counts()

chocolate     3
strawberry    2
bubblegum     1
Name: Flavor, dtype: int64

In [133]:
# Returns an array of unique values in a Series
cones["Flavor"].unique()

array(['strawberry', 'chocolate', 'bubblegum'], dtype=object)

<b>.sort_values() </b>

Orders a DataFrame or Series by a specified column in ascending order (default) or descending order.

In [134]:
cones.sort_values(by='Price')

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
5,bubblegum,pink,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25


In [135]:
cones.sort_values(by='Price', ascending=False)

Unnamed: 0,Flavor,Color,Price
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
1,chocolate,light brown,4.75
5,bubblegum,pink,4.75
0,strawberry,pink,3.55


In [136]:
cones.sort_values(by='Flavor', ascending=False)

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
3,strawberry,pink,5.25
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


In [137]:
cones.sort_values(by=['Flavor', 'Price'], ascending=True)

Unnamed: 0,Flavor,Color,Price
5,bubblegum,pink,4.75
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
0,strawberry,pink,3.55
3,strawberry,pink,5.25


In [138]:
cones.sort_values(by=['Price','Flavor'], ascending=False)

Unnamed: 0,Flavor,Color,Price
3,strawberry,pink,5.25
2,chocolate,dark brown,5.25
4,chocolate,dark brown,5.25
1,chocolate,light brown,4.75
5,bubblegum,pink,4.75
0,strawberry,pink,3.55


In [139]:
cones['flavor_length']=cones['Flavor'].str.len()
cones.sort_values(by='flavor_length', ascending=False)

Unnamed: 0,Flavor,Color,Price,flavor_length
0,strawberry,pink,3.55,10
3,strawberry,pink,5.25,10
1,chocolate,light brown,4.75,9
2,chocolate,dark brown,5.25,9
4,chocolate,dark brown,5.25,9
5,bubblegum,pink,4.75,9


In [140]:
cones=cones.drop('flavor_length', axis=1)
cones

Unnamed: 0,Flavor,Color,Price
0,strawberry,pink,3.55
1,chocolate,light brown,4.75
2,chocolate,dark brown,5.25
3,strawberry,pink,5.25
4,chocolate,dark brown,5.25
5,bubblegum,pink,4.75


## Example  ##

This table can be found online: 
https://www.statcrunch.com/app/index.php?dataid=1843341

NBA players, 2015-2016 season

Each row represents one player. The columns are:


- PLAYER: Player’s name
- POSITION: Player’s position on team
- TEAM: Team name
- '15-'16 SALARY: Player’s salary in 2015-2016, in millions of dollars

The position codes are PG (Point Guard), SG (Shooting Guard), PF (Power Forward), SF (Small Forward), and C (Center).



In [141]:
nba = pd.read_csv('nba_salaries.csv')
nba

Unnamed: 0,PLAYER,POSITION,TEAM,'15-'16 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
2,Tiago Splitter,C,Atlanta Hawks,9.756250
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
...,...,...,...,...
412,Gary Neal,PG,Washington Wizards,2.139000
413,DeJuan Blair,C,Washington Wizards,2.000000
414,Kelly Oubre Jr.,SF,Washington Wizards,1.920240
415,Garrett Temple,SG,Washington Wizards,1.100602


In [142]:
nba.columns

Index(['PLAYER', 'POSITION', 'TEAM', ''15-'16 SALARY'], dtype='object')

In [143]:
nba=nba.rename(columns={"'15-'16 SALARY":"SALARY"})
nba

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
2,Tiago Splitter,C,Atlanta Hawks,9.756250
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
...,...,...,...,...
412,Gary Neal,PG,Washington Wizards,2.139000
413,DeJuan Blair,C,Washington Wizards,2.000000
414,Kelly Oubre Jr.,SF,Washington Wizards,1.920240
415,Garrett Temple,SG,Washington Wizards,1.100602


In [144]:
nba.columns

Index(['PLAYER', 'POSITION', 'TEAM', 'SALARY'], dtype='object')

The table contains 417 rows, one for each player.

In [145]:
nba.shape

(417, 4)

In [146]:
nba.head(10)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0
2,Tiago Splitter,C,Atlanta Hawks,9.75625
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479
5,Thabo Sefolosha,SF,Atlanta Hawks,4.0
6,Mike Scott,PF,Atlanta Hawks,3.333333
7,Kent Bazemore,SF,Atlanta Hawks,2.0
8,Dennis Schroder,PG,Atlanta Hawks,1.7634
9,Tim Hardaway Jr.,SG,Atlanta Hawks,1.30452


In [147]:
nba.tail(10)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
407,Kris Humphries,PF,Washington Wizards,4.44
408,Jared Dudley,SF,Washington Wizards,4.375
409,Alan Anderson,SG,Washington Wizards,4.0
410,Drew Gooden,PF,Washington Wizards,3.3
411,Ramon Sessions,PG,Washington Wizards,2.170465
412,Gary Neal,PG,Washington Wizards,2.139
413,DeJuan Blair,C,Washington Wizards,2.0
414,Kelly Oubre Jr.,SF,Washington Wizards,1.92024
415,Garrett Temple,SG,Washington Wizards,1.100602
416,Jarell Eddie,SG,Washington Wizards,0.561716


In [148]:
nba.dtypes

PLAYER       object
POSITION     object
TEAM         object
SALARY      float64
dtype: object

In [149]:
nba.iloc[[0]]  # return first row as a dataframe

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659


In [150]:
nba.iloc[np.arange(3, 6)] # return rows 3,4,5 as a table

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479
5,Thabo Sefolosha,SF,Atlanta Hawks,4.0


In [151]:
nba.iloc[3:6]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479
5,Thabo Sefolosha,SF,Atlanta Hawks,4.0


Sort rows in alphabetical order by player name:

In [152]:
nba.sort_values(by='PLAYER').head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
68,Aaron Brooks,PG,Chicago Bulls,2.25
291,Aaron Gordon,PF,Orlando Magic,4.17168
59,Aaron Harrison,SG,Charlotte Hornets,0.525093
235,Adreian Payne,PF,Minnesota Timberwolves,1.93884
1,Al Horford,C,Atlanta Hawks,12.0


If we want a table of the top 5 highest paid players, we can first sort the list by salary and then take the first five rows:

In [153]:
nba.sort_values(by='SALARY', ascending=False).head()

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
169,Kobe Bryant,SF,Los Angeles Lakers,25.0
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.9705
255,Carmelo Anthony,SF,New York Knicks,22.875
131,Dwight Howard,C,Houston Rockets,22.359364


In [154]:
nba[nba['PLAYER']=='Stephen Curry']  # Select PLAYER Stephen Curry

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
121,Stephen Curry,PG,Golden State Warriors,11.370786


In [155]:
warriors = nba[nba['TEAM']=='Golden State Warriors']  # select all players in 'Golden State Warriors'

In [156]:
warriors

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
117,Klay Thompson,SG,Golden State Warriors,15.501
118,Draymond Green,PF,Golden State Warriors,14.26087
119,Andrew Bogut,C,Golden State Warriors,13.8
120,Andre Iguodala,SF,Golden State Warriors,11.710456
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475
123,Shaun Livingston,PG,Golden State Warriors,5.543725
124,Harrison Barnes,SF,Golden State Warriors,3.873398
125,Marreese Speights,C,Golden State Warriors,3.815
126,Leandro Barbosa,SG,Golden State Warriors,2.5


In [157]:
nba[nba['SALARY'] > 10]  # select all players whose SALARY is above 10(million)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.000000
29,Joe Johnson,SF,Brooklyn Nets,24.894863
30,Thaddeus Young,PF,Brooklyn Nets,11.235955
42,Al Jefferson,C,Charlotte Hornets,13.500000
...,...,...,...,...
368,DeMar DeRozan,SG,Toronto Raptors,10.050000
383,Gordon Hayward,SF,Utah Jazz,15.409570
400,John Wall,PG,Washington Wizards,15.851950
401,Nene Hilario,C,Washington Wizards,13.000000


In [158]:
nba[nba['SALARY'] > 10].sort_values(by='SALARY') # ... and sort

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
368,DeMar DeRozan,SG,Toronto Raptors,10.050000
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855
204,Luol Deng,SF,Miami Heat,10.151612
144,Monta Ellis,SG,Indiana Pacers,10.300000
95,Wilson Chandler,SF,Denver Nuggets,10.449438
...,...,...,...,...
131,Dwight Howard,C,Houston Rockets,22.359364
255,Carmelo Anthony,SF,New York Knicks,22.875000
72,LeBron James,SF,Cleveland Cavaliers,22.970500
29,Joe Johnson,SF,Brooklyn Nets,24.894863


Find all the Point Guards (PG) whose salaries were over 15 million:

In [159]:
nba[(nba['SALARY'] > 15) & (nba['POSITION'] == 'PG')]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
60,Derrick Rose,PG,Chicago Bulls,20.093064
74,Kyrie Irving,PG,Cleveland Cavaliers,16.407501
156,Chris Paul,PG,Los Angeles Clippers,21.468695
269,Russell Westbrook,PG,Oklahoma City Thunder,16.744218
400,John Wall,PG,Washington Wizards,15.85195


In [160]:
nba[(nba['SALARY'] >= 10) & (nba['SALARY'] < 10.3)]  # all players with salaries between 10(included)-10.3(not included) millions

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
204,Luol Deng,SF,Miami Heat,10.151612
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855
356,Danny Green,SG,San Antonio Spurs,10.0
368,DeMar DeRozan,SG,Toronto Raptors,10.05


Instead of looking for an exact match, you can search for entries that contain a given string:

In [161]:
# Vectorized substring match: regex=False treats the pattern literally, na=False makes missing team names count as "no match"
nba[nba['TEAM'].str.contains('Warriors', regex=False, na=False)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
117,Klay Thompson,SG,Golden State Warriors,15.501
118,Draymond Green,PF,Golden State Warriors,14.26087
119,Andrew Bogut,C,Golden State Warriors,13.8
120,Andre Iguodala,SF,Golden State Warriors,11.710456
121,Stephen Curry,PG,Golden State Warriors,11.370786
122,Jason Thompson,PF,Golden State Warriors,7.008475
123,Shaun Livingston,PG,Golden State Warriors,5.543725
124,Harrison Barnes,SF,Golden State Warriors,3.873398
125,Marreese Speights,C,Golden State Warriors,3.815
126,Leandro Barbosa,SG,Golden State Warriors,2.5


In [162]:
# Every position code that contains 'G' (PG or SG); vectorized, and missing values count as "no match"
nba[nba['POSITION'].str.contains('G', regex=False, na=False)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
3,Jeff Teague,PG,Atlanta Hawks,8.000000
4,Kyle Korver,SG,Atlanta Hawks,5.746479
8,Dennis Schroder,PG,Atlanta Hawks,1.763400
9,Tim Hardaway Jr.,SG,Atlanta Hawks,1.304520
11,Jason Richardson,SG,Atlanta Hawks,0.947276
...,...,...,...,...
409,Alan Anderson,SG,Washington Wizards,4.000000
411,Ramon Sessions,PG,Washington Wizards,2.170465
412,Gary Neal,PG,Washington Wizards,2.139000
415,Garrett Temple,SG,Washington Wizards,1.100602


In [163]:
# ~ binds more tightly than &, so only the TEAM comparison is negated: players NOT on the Cavaliers whose salary is at least 20 (million)
nba[~(nba['TEAM'] == 'Cleveland Cavaliers') & (nba['SALARY'] >= 20)]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
29,Joe Johnson,SF,Brooklyn Nets,24.894863
60,Derrick Rose,PG,Chicago Bulls,20.093064
131,Dwight Howard,C,Houston Rockets,22.359364
156,Chris Paul,PG,Los Angeles Clippers,21.468695
169,Kobe Bryant,SF,Los Angeles Lakers,25.0
201,Chris Bosh,PF,Miami Heat,22.19273
202,Dwyane Wade,SG,Miami Heat,20.0
255,Carmelo Anthony,SF,New York Knicks,22.875
268,Kevin Durant,SF,Oklahoma City Thunder,20.158622
