In [None]:
import numpy as np
import pandas as pd

# DataFrame Functionality: Basic operations and manipulation

In [None]:
# Load the Titanic passenger data from a CSV file (relative path) into a DataFrame
titanic = pd.read_csv('../datasets/titanic/titanic.csv')

## Inspecting the DataFrame

In [None]:
# Number of rows in the DataFrame
len(titanic)

In [None]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple
titanic.shape

In [None]:
# First 3 rows
titanic.head(3)

In [None]:
# Last 3 rows
titanic.tail(3)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
titanic.describe()

In [None]:
# Column names, dtypes, non-null counts and memory usage
titanic.info()

## Common operations

**In the following operations we apply the .head(3) method just for convenience, so that we see only a few rows of the output.**

In [None]:
# Selecting rows where embarked is not null


In [None]:
# Selecting rows where embarked is not null and the passenger is in first class


In [None]:
# Sorting values by fare (ascending=True puts the lowest fares first)

titanic.sort_values(by='fare', ascending=True).head(3)

In [None]:
# Get the oldest and youngest passengers


In [None]:
# Setting a column as an index
# inplace=True modifies `titanic` itself instead of returning a new DataFrame
titanic.set_index('name', inplace=True)
titanic.head(3)

In [None]:
# Label-based lookup: select the row(s) whose index label is this passenger's name
titanic.loc['Allison, Miss. Helen Loraine']

In [None]:
# Assign the value 13 to the boat of 'Allison, Miss. Helen Loraine'


#### Deleting rows and columns

In [None]:
# Move the current index (set to 'name' earlier) back into a regular column
# and restore the default integer index, modifying `titanic` in place
titanic.reset_index(inplace=True)
titanic.head(5)

In [None]:
# Delete the row whose index label is 1309 (axis=0 selects rows), in place
titanic.drop(1309, axis=0, inplace=True)
# Inspect the end of the frame after the deletion
titanic.tail(3)

In [None]:
# Delete the 'body' column (axis=1 selects columns), in place
titanic.drop('body', axis=1, inplace=True)
titanic.tail(3)

## Statistics with DataFrames

pandas provides a large number of methods for computing descriptive statistics and other related operations on Series and DataFrames. Most of these are aggregations (hence producing a lower-dimensional result) like sum(), mean(), and quantile(), but some of them, like cumsum() and cumprod(), produce an object of the same size. Generally speaking, these methods take an axis argument, just like ndarray.{sum, std, ...}, but the axis can be specified by name or integer:

* Series: no axis argument needed
* DataFrame: “index” (axis=0, default), “columns” (axis=1) (**This can be potentially confusing!**)

Here is a quick reference summary table of common functions.

<table border="1" class="docutils">
<colgroup>
<col width="20%">
<col width="80%">
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Function</th>
<th class="head">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">count</span></tt></td>
<td>Number of non-null observations</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">sum</span></tt></td>
<td>Sum of values</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">mean</span></tt></td>
<td>Mean of values</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">mad</span></tt></td>
<td>Mean absolute deviation (removed in pandas 2.0)</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">median</span></tt></td>
<td>Arithmetic median of values</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">min</span></tt></td>
<td>Minimum</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">max</span></tt></td>
<td>Maximum</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">mode</span></tt></td>
<td>Mode</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">abs</span></tt></td>
<td>Absolute Value</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">prod</span></tt></td>
<td>Product of values</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">std</span></tt></td>
<td>Bessel-corrected sample standard deviation</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">var</span></tt></td>
<td>Unbiased variance</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">sem</span></tt></td>
<td>Standard error of the mean</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">skew</span></tt></td>
<td>Sample skewness (3rd moment)</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">kurt</span></tt></td>
<td>Sample kurtosis (4th moment)</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">quantile</span></tt></td>
<td>Sample quantile (value at %)</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">cumsum</span></tt></td>
<td>Cumulative sum</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">cumprod</span></tt></td>
<td>Cumulative product</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">cummax</span></tt></td>
<td>Cumulative maximum</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">cummin</span></tt></td>
<td>Cumulative minimum</td>
</tr>
</tbody>
</table>

In [None]:
# Summary statistics of the numeric columns
titanic.describe()

In [None]:
# Select the four columns used for the statistics examples below
titanic_numerics = titanic[['age','sibsp', 'parch', 'fare']]

In [None]:
# Mean of each column (axis=0 aggregates down the rows)
titanic_numerics.mean(axis=0)

In [None]:
# Maximum of each column (axis=0 aggregates down the rows)
titanic_numerics.max(axis=0)

The idxmin() and idxmax() functions on Series and DataFrame compute the index labels with the minimum and maximum corresponding values:

In [None]:
# For each column, the index label of the row holding the maximum value
titanic_numerics.idxmax(axis=0)

In [None]:
# For each column, the index label of the row holding the minimum value
titanic_numerics.idxmin(axis=0)

### Counting values for categorical variables

In [None]:
# How often each distinct value occurs in 'embarked' (missing values are excluded by default)
titanic['embarked'].value_counts()

In [None]:
# Number of unique values (missing values are not counted by default)
titanic['embarked'].nunique()

### Iteration

The behavior of basic iteration over pandas objects depends on the type. When iterating over a Series, it is regarded as array-like, and basic iteration produces the values. Other data structures, like DataFrame and Panel, follow the dict-like convention of iterating over the “keys” of the objects.

In short, basic iteration (for i in object) produces:

* Series: values
* DataFrame: column labels

In [None]:
# Restore a default integer index so that 'name' is available as a column below.
# NOTE(review): if the index is already the default integer index at this point,
# this call also inserts the old labels as a new 'index' column - confirm this is intended.
titanic.reset_index(inplace=True)

In [None]:
# Print a one-line survival message for each of the first 10 passengers.
# Assumes the usual Titanic encoding of 'survived': 1 = survived, 0 = did not survive.
for name, survived in zip(titanic['name'][0:10], titanic['survived'][0:10]):
    text = " survived" if survived == 1 else " did not survive"
    # Parenthesised single-argument print works on both Python 2 and Python 3
    print(name + text)

### Go to Exercises