In [1]:
# Load ../INDEX.md and hand it to IPython's Markdown display object.
# The object is the cell's last expression, so the notebook renders it as the cell output.
from IPython.display import Markdown
Markdown(filename="../INDEX.md")

# Table of Contents
<small>Click on the following links to access the relevant content</small>
<br><br>
1. [Data Loading](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/rphoa/python-project-primer/master/notebooks/1.%20Data%20Loading.ipynb)
2. [Data Manipulation](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/rphoa/python-project-primer/master/notebooks/2.%20Data%20Manipulation.ipynb)
3. [Modelling](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/rphoa/python-project-primer/master/notebooks/3.%20Modelling.ipynb)
4. [Saving Output](http://nbviewer.ipython.org/urls/raw.githubusercontent.com/rphoa/python-project-primer/master/notebooks/4.%20Saving%20Output.ipynb)


***
## 2.a Selecting

#### [pandas] Select single columns by column name or index

In [2]:
import pandas as pd

# Load the iris measurements; the first CSV row supplies the column names.
data = pd.read_csv("../data/iris_with_header.csv", header=0)

# Fetch one column two ways: by position (column 0) and by its header label.
sepal_length_by_index = data.iloc[:, 0]
sepal_length_by_column = data["sepal length (cm)"]

# Print the first five values of each selection as plain Python lists.
print("Single column by column name: \t", sepal_length_by_column.head(5).tolist())
print("Single column by index: \t", sepal_length_by_index.head(5).tolist())
print()

Single column by column name: 	 [5.1, 4.9, 4.7, 4.6, 5.0]
Single column by index: 	 [5.1, 4.9, 4.7, 4.6, 5.0]



#### [pandas] Select multiple columns by column name or index

In [3]:
# NOTE: despite the `sepal_length_*` names, each variable below holds TWO columns
# (sepal length and petal length).

# Two-column selection by header labels.
sepal_length_by_column = data[["sepal length (cm)", "petal length (cm)"]]

# Two-column selection by position (columns 0 and 2).
sepal_length_by_index = data.iloc[:, [0, 2]]

# Row view: the first five rows, each printed as a [first column, second column] pair.
print("Multi column by column name - row version: \t", sepal_length_by_column.head(5).values.tolist())
print("Multi column by index - row version: \t\t", sepal_length_by_index.head(5).values.tolist())
print()

# Column view (numpy style): take column k of the underlying 2-D array for the same five rows.
print("Multi column by column name (1) - column version: \t", sepal_length_by_column.head(5).values[:, 0].tolist())
print("Multi column by index name (1) - column version: \t", sepal_length_by_index.head(5).values[:, 0].tolist())
print("Multi column by column name (2) - column version: \t", sepal_length_by_column.head(5).values[:, 1].tolist())
print("Multi column by index name (2) - column version: \t", sepal_length_by_index.head(5).values[:, 1].tolist())

Multi column by column name - row version: 	 [[5.1, 1.4], [4.9, 1.4], [4.7, 1.3], [4.6, 1.5], [5.0, 1.4]]
Multi column by index - row version: 		 [[5.1, 1.4], [4.9, 1.4], [4.7, 1.3], [4.6, 1.5], [5.0, 1.4]]

Multi column by column name (1) - column version: 	 [5.1, 4.9, 4.7, 4.6, 5.0]
Multi column by index name (1) - column version: 	 [5.1, 4.9, 4.7, 4.6, 5.0]
Multi column by column name (2) - column version: 	 [1.4, 1.4, 1.3, 1.5, 1.4]
Multi column by index name (2) - column version: 	 [1.4, 1.4, 1.3, 1.5, 1.4]


***
## 2.b Slicing

In [4]:
#required imports
import pandas as pd

# Reload the iris data; read_csv assigns the default 0, 1, 2, ... row labels,
# so row labels and row positions coincide in the examples below.
data = pd.read_csv("../data/iris_with_header.csv", header=0)

# first 5 rows, all columns
print(data[:5].to_string())
print()

# every 5th row (positions 0, 5, 10, ...), then the first 10 of those
print(data[::5][:10].to_string())
print()

# rows at positions 3 through 9 (the stop position 10 is excluded)
print(data[3:10].to_string())
print()

# rows at positions 5-9 and columns at positions 2-3, selected by position;
# .iloc slices exclude the stop value
print(data.iloc[5:10, 2:4].to_string())
print()

# the same rows and columns selected by label; .loc slices include BOTH endpoints,
# so the row stop is 9 (not 10) to match the .iloc selection above
print(data.loc[5:9, "petal length (cm)":"petal width (cm)"].to_string())
print()

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)    type
0                5.1               3.5                1.4               0.2  setosa
1                4.9               3.0                1.4               0.2  setosa
2                4.7               3.2                1.3               0.2  setosa
3                4.6               3.1                1.5               0.2  setosa
4                5.0               3.6                1.4               0.2  setosa

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)    type
0                 5.1               3.5                1.4               0.2  setosa
5                 5.4               3.9                1.7               0.4  setosa
10                5.4               3.7                1.5               0.2  setosa
15                5.7               4.4                1.5               0.4  setosa
20                5.4               3.4                1.7            

***
## 2.c Filtering

In [5]:
#required imports
import pandas as pd

# Reload the iris data from disk.
data = pd.read_csv("../data/iris_with_header.csv", header=0)

# select only versicolor type (first 5 matching rows)
print(data[data["type"] == "versicolor"][:5].to_string())
print()

# select only versicolor type with sepal length more than 6.5cm;
# each condition is parenthesised because & binds tighter than == and >
print(data[(data["type"] == "versicolor") & (data["sepal length (cm)"] > 6.5)][:5].to_string())
print()

# select setosa and versicolor types with sepal length more than 5.5cm
print(data[(data["type"].isin(["setosa", "versicolor"])) & (data["sepal length (cm)"] > 5.5)][:5].to_string())
print()

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)        type
50                7.0               3.2                4.7               1.4  versicolor
51                6.4               3.2                4.5               1.5  versicolor
52                6.9               3.1                4.9               1.5  versicolor
53                5.5               2.3                4.0               1.3  versicolor
54                6.5               2.8                4.6               1.5  versicolor

    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)        type
50                7.0               3.2                4.7               1.4  versicolor
52                6.9               3.1                4.9               1.5  versicolor
58                6.6               2.9                4.6               1.3  versicolor
65                6.7               3.1                4.4               1.4  versicolor
75                6.

***
## 2.d Transformation

In [6]:
#required imports
import pandas as pd

# Reload the iris data from disk.
data = pd.read_csv("../data/iris_with_header.csv", header=0)

# Derived column: element-wise product of sepal length and sepal width.
data["sepal area"] = data["sepal length (cm)"].mul(data["sepal width (cm)"])

def set_color(x):
    """Map an iris type name to a color name.

    Parameters
    ----------
    x : value compared with ``==`` against the known type names.

    Returns
    -------
    "red" for "setosa", "blue" for "versicolor", "green" for "virginica",
    and None for anything else.
    """
    # Ordered (type, color) pairs, checked in the same order as the lookup is defined.
    palette = (
        ("setosa", "red"),
        ("versicolor", "blue"),
        ("virginica", "green"),
    )
    for species, color in palette:
        if x == species:
            return color
    # No known type matched.
    return None

# Add a "color" column by running set_color on every value of the "type" column.
data["color"] = data["type"].apply(set_color)

# Preview the first five rows, now including the color column.
print(data.head(5).to_string())
print()


# Take row 0 of each "type" group.
# NOTE(review): the saved output shows `type` as the index; presumably newer pandas
# releases keep the original row index for nth() - confirm against the installed version.
print(data.groupby("type").nth(0).to_string())
print()

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)    type  sepal area color
0                5.1               3.5                1.4               0.2  setosa       17.85   red
1                4.9               3.0                1.4               0.2  setosa       14.70   red
2                4.7               3.2                1.3               0.2  setosa       15.04   red
3                4.6               3.1                1.5               0.2  setosa       14.26   red
4                5.0               3.6                1.4               0.2  setosa       18.00   red

            color  petal length (cm)  petal width (cm)  sepal area  sepal length (cm)  sepal width (cm)
type                                                                                                   
setosa        red                1.4               0.2       17.85                5.1               3.5
versicolor   blue                4.7               1.4       22.40         