# Data Munging

## Data loading and preprocessing with pandas

### Fast and easy data loading

In [17]:
import pandas as pd
import numpy as np
# header=None makes read_csv treat the file's first line as data, so the
# column labels are supplied explicitly through `names`.
iris_columns = ['sepal_length', 'sepal_width',
                'petal_length', 'petal_width',
                'target']
iris_filename = 'datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', header=None, names=iris_columns)

In [18]:
# Preview the first five rows to check that the columns were parsed as expected.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
# Column labels of the frame, returned as a pandas Index.
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [20]:
# Extract the class-label column as a Series: every row, the 'target' column.
y = iris.loc[:, 'target']
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: target, Length: 150, dtype: object

In [21]:
# Indexing with a list of labels returns a DataFrame holding just those columns.
feature_columns = ['sepal_length', 'sepal_width']
X = iris[feature_columns]
X

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
...,...,...
145,6.7,3.0
146,6.3,2.5
147,6.5,3.0
148,6.2,3.4
149,5.9,3.0


### Dealing with problematic data

In [22]:
import pandas as pd
# Load the example file and display it to see where values are missing.
fake_filename = 'a_loading_example_1.csv'
fake_dataset = pd.read_csv(fake_filename, sep=',')
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,,32,3
5,20140915,,57.0,42,2


In [23]:
# Per-column means (axis=0); missing entries are skipped when averaging.
column_means = fake_dataset.mean(axis=0)
# fillna returns a new frame with each gap replaced by its column's mean.
# The result is only displayed, not assigned, so fake_dataset is unchanged.
fake_dataset.fillna(column_means)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,42.8,32,3
5,20140915,80.4,57.0,42,2


### Data preprocessing

In [24]:
# Distinct class labels present in the target column.
target_column = iris['target']
target_column.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

### Data selection

In [25]:
import pandas as pd
# Load the selection example with pandas' default integer row index.
selection_filename = 'a_selection_example_1.csv'
dataset = pd.read_csv(selection_filename)
dataset

Unnamed: 0,n,val1,val2,val3
0,100,10,10,C
1,101,10,20,C
2,102,10,30,B
3,103,10,40,B
4,104,10,50,A


In [26]:
# Reload the same file, this time using its first column as the row index.
# This rebinds `dataset`; the cells that follow rely on this indexed version.
dataset = pd.read_csv(
    'a_selection_example_1.csv',
    index_col=0,
)
dataset

Unnamed: 0_level_0,val1,val2,val3
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,10,10,C
101,10,20,C
102,10,30,B
103,10,40,B
104,10,50,A


In [27]:
# Chained lookup: select column 'val3' first, then the entry whose index
# label is 104 in the resulting Series.
# Fine for reading a value; for assignment use dataset.loc[row, col] instead.
dataset['val3'][104]

'A'

In [28]:
# Label-based lookup in a single step: row label 104, column label 'val3'.
dataset.loc[104, 'val3']

'A'

In [29]:
# Position-based lookup: row at position 4, column at position 2 (both 0-based).
# With the frame displayed above, that is the row labelled 104 and column 'val3'.
dataset.iloc[4, 2]

'A'

In [30]:
# Pick two columns in the requested order, then keep the first two rows by position.
selected_columns = dataset[['val3', 'val2']]
selected_columns.iloc[0:2]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20
