# Data Cleansing

In [68]:
import numpy as np
import pandas as pd

In [59]:
# One-time download of the raw Iris data file from the UCI repository into
# ./data/iris.csv. Kept commented out; uncomment to fetch the file again.
#!curl https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data -o "./data/iris.csv"

^C


In [60]:
# One-time preparation (kept commented out): read the header-less raw CSV,
# attach the five column names, and save the result as a tab-separated file
# with a header row and without the index column.
#df = pd.read_csv("./data/iris.csv" , header=None)
#df.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
#df.to_csv("./data/iris-with-header.tsv", index=False, sep="\t", encoding="utf-8")

In [80]:
# Load the prepared tab-separated Iris data; the first row holds the column names.
df = pd.read_csv("./data/iris-with-header.tsv", sep="\t")

## Randomly select some elements and replace with NaN
See [cheat-sheet:dataframe-selection](https://github.com/ryubidragonfire/python-cheatsheet/blob/master/dataframe-selection.ipynb)

In [81]:
# Seed NumPy's global RNG so the same cells are blanked out on every
# Restart & Run All; without a seed the missing-value counts shown further
# down change from run to run.
np.random.seed(42)

# Number of (row, column) cells to overwrite with NaN.
n_missing = 10

# Row labels and column names are drawn independently and with replacement,
# so a column (or, rarely, the very same cell) can be picked more than once.
rand_row_ind = np.random.choice(df.index.values, n_missing)
rand_col_name = np.random.choice(df.columns, n_missing)

In [82]:
# Blank out one cell per sampled (row label, column name) pair.
for row_label, col_name in zip(rand_row_ind, rand_col_name):
    df.loc[row_label, col_name] = np.nan

In [103]:
#df

## Remove duplicated rows

In [84]:
# Drop exact duplicate rows, then show the shape before and after.
df2 = df.drop_duplicates()
for frame in (df, df2):
    print(frame.shape)

(150, 5)
(148, 5)


## Check for missing data

In [85]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

SepalLengthCm    0
SepalWidthCm     1
PetalLengthCm    5
PetalWidthCm     3
Species          1
dtype: int64

## Either: Remove rows with missing data

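- `df.dropna()` — drop every row that contains at least one missing value
- `df.dropna(axis=1)` — drop every column that contains at least one missing value
- `df.dropna(how='all')` — drop only the rows in which all values are missing
- `df.dropna(thresh=2)` — keep only the rows that have at least 2 non-missing values
- `df.dropna(subset=['Species'])` — drop the rows in which `Species` is missing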

In [86]:
# Drop every row that still contains a missing value, then show the shape
# before and after.
df3 = df2.dropna()
for frame in (df2, df3):
    print(frame.shape)

(148, 5)
(138, 5)


## Or: Impute missing data with `Imputer`
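Estimate each missing value from the observed data, e.g. replace it with the column mean, median, or most frequent value.
Only applies to `numerical` columns.
See the [sklearn documentation](https://scikit-learn.org/stable/modules/impute.html).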

In [90]:
# `Imputer` has to be imported before it is used here; without this import the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
from sklearn.preprocessing import Imputer

help(Imputer)

Help on class Imputer in module sklearn.preprocessing.imputation:

class Imputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <imputation>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : integer or "NaN", optional (default="NaN")
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed. For missing values encoded as np.nan,
 |      use the string value "NaN".
 |  
 |  strategy : string, optional (default="mean")
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        the axis.
 |      - If "median", then replace missing values using the median along
 |        the axis.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along the axis.
 |  
 |  axis : integer, optional (default=0)
 |      The axis along which to i

In [93]:
# Show the column labels of the frame.
df.columns

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [104]:
# Count the missing values in each column before imputing.
df.isnull().sum(axis=0)

SepalLengthCm    0
SepalWidthCm     1
PetalLengthCm    5
PetalWidthCm     3
Species          1
dtype: int64

In [106]:
from sklearn.preprocessing import Imputer

# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; newer environments provide `sklearn.impute.SimpleImputer` instead.

# Only the four numeric measurement columns are imputed; `Species` holds
# string labels and is left untouched.
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# Fit on the numeric columns, then replace each NaN with the most frequent
# value of its column (axis=0).
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0).fit(df[numeric_cols])
array_imputed = imp.transform(df[numeric_cols].values)

# Work on a copy so that `df` keeps its NaNs.
df_imp = df.copy()
df_imp[numeric_cols] = array_imputed

# Missing values remaining in each column after imputation.
df_imp.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          1
dtype: int64

In [102]:
# Preview only the first rows instead of rendering the whole frame.
df_imp.head(10)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [None]:
# Imputing categorical data