## Handle corrupt/deformed data 
This notebook illustrates other type of a corrupt data represented as 0 instead of '?' (see ProcessingCategoricalTitanic.ipynb)

###  Pima Indians Diabetes Dataset contains corrupt data which marked as zero 0 since  a zero for body mass index or blood pressure is invalid.
    0. Number of times pregnant.
    1. Plasma glucose concentration a 2 hours in an oral glucose tolerance test.
    2. Diastolic blood pressure (mm Hg).
    3. Triceps skinfold thickness (mm).
    4. 2-Hour serum insulin (mu U/ml).
    5. Body mass index (weight in kg/(height in m)^2).
    6. Diabetes pedigree function.
    7. Age (years).
    8. Class variable (0 or 1).

In [1]:
import pandas as pd
import numpy as np
cols= ['pregnacies','glucose','diastolic','triceps','insulin','bmi','pdf','age','diabetes']
df= pd.read_csv('/home/tri/Downloads/pima-indians-diabetes.data',names=cols)

#df= pd.read_csv('/home/tri/Downloads/pima-indians-diabetes.data',header=None)
df.head(3)
# deformed data can be seen as triceps having 0

Unnamed: 0,pregnacies,glucose,diastolic,triceps,insulin,bmi,pdf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [2]:
#### Check number of  0 value (corrupt value)
#(df[[1,2,3,4,5]] == 0).sum()

In [3]:
# mark missing value with nan
# df.insulin.replace(0,np.nan, inplace=True)
# df.triceps.replace(0,np.nan, inplace=True)
# df.bmi.replace(0,np.nan, inplace=True)
df[['glucose','diastolic','triceps','insulin','bmi']]=df[['glucose','diastolic','triceps','insulin','bmi']].replace(0,np.nan)

df.head(3)

Unnamed: 0,pregnacies,glucose,diastolic,triceps,insulin,bmi,pdf,age,diabetes
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1


In [4]:
df.shape

(768, 9)

# Using missing data

In [5]:
from sklearn.preprocessing import Imputer

In [7]:
# Preview dataset with missing values
masses_data =pd.read_csv('/home/tri/Downloads/mammographic_masses.data')
masses_data.head(3)

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0


### Read data and convert '?' character into nan

In [8]:
masses_data = pd.read_csv('/home/tri/Downloads/mammographic_masses.data', na_values=['?'], names=['BI_RADS','age','shape','margin','density','severity'])
masses_data.head()

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [9]:
nans_count = masses_data.shape[0] - masses_data.dropna().shape[0]
print("%d rows have missing values in the train data" % nans_count)
masses_data.isnull().sum()

131 rows have missing values in the train data


BI_RADS      2
age          5
shape       31
margin      48
density     76
severity     0
dtype: int64

### Create a new data with misssing values

In [4]:
import pandas as pd
import numpy as np
dat = {'first_name': ['Bob','Ellen', np.nan, 'Anna', 'John'], 
        'last_name': ['George', 'William', np.nan,'Bush',  'Conan'], 
        'age': [18, 17, np.nan, 17, 18], 
        'sex': ['f', 'm',np.nan,  'm', 'f'], 
        'midterm': [ 89,np.nan, np.nan, 87, 90],
        'Final': [91, np.nan,94, np.nan, 94]}
df = pd.DataFrame(dat, columns = ['first_name', 'last_name', 'age', 'sex', 'midterm', 'Final'])
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


In [2]:
df_null= df[df.isnull()]
df_null.head(3)

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,,,,,,
1,,,,,,
2,,,,,,


### Dropping missing only for specific column

In [6]:
df.dropna(subset=['last_name'], inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


# Dropping missing data approach

In [3]:
df_process = df.dropna()
df_process

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
4,John,Conan,18.0,f,90.0,94.0


### Dropping only if missing entire row

In [21]:
df_process = df.dropna(how='all')
df_process
# for column, indicate axis=1
# df_process = df.dropna(axis=1, how='all')

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


### Dropping by a threshold

In [24]:
df.dropna(thresh=4)

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


# A better approach is to fill missing values 

##  Fill missing values with the mean value of its column

In [8]:
df['Final'].fillna(df['Final'].mean(),inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,93.0
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,93.0
4,John,Conan,18.0,f,90.0,94.0


### Fill missing value using groupby approach

In [10]:
## Fill missing values with mean values in each group (eg: sex). Note 3rd row missing sex value
df['midterm'].fillna(df.groupby("sex")["midterm"].transform("mean"), inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,87.0,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


## FilI missing catogory value with the most frequence value

In [12]:
df['first_name'].fillna(df['first_name'].value_counts().index[0],inplace=True)
df['last_name'].fillna(df['last_name'].value_counts().index[0],inplace=True)
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,Anna,George,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


In [18]:
#  Fill missing categorical values by the most frequent values of groups
#https://stackoverflow.com/questions/46532986/filling-missing-values-of-categorical-values-based-on-other-categorical-values-i
import pandas as pd
import numpy as np
data = {'type': ['softdrink', 'juice', 'softdrink', 'softdrink',    'juice','juice','softdrink'],
    'product': ['coca', np.nan, 'pepsi', 'pepsi', 'orange','grape',np.nan],
    'price': [25, 94, 57, 62, 70,50,60]}
df = pd.DataFrame(data)
df.head(4)

Unnamed: 0,price,product,type
0,25,coca,softdrink
1,94,,juice
2,57,pepsi,softdrink
3,62,pepsi,softdrink


In [20]:
df['product']=df.groupby('type').product.transform(lambda x: x.fillna(x.mode()[0]))
df

Unnamed: 0,price,product,type
0,25,coca,softdrink
1,94,grape,juice
2,57,pepsi,softdrink
3,62,pepsi,softdrink
4,70,orange,juice
5,50,grape,juice
6,60,pepsi,softdrink


In [14]:
## Fill every column with its own most frequent value 
df = df.apply(lambda x:x.fillna(x.value_counts().index[0]))
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,87.0,94.0
2,Anna,George,18.0,m,87.0,94.0
3,Anna,Bush,17.0,m,87.0,94.0
4,John,Conan,18.0,f,90.0,94.0


## Filling missing values with scikit-learn

In [19]:
import pandas as pd
dat = {'first_name': ['Bob','Ellen', np.nan, 'Anna', 'John'], 
        'last_name': ['George', 'William', np.nan,'Bush',  'Conan'], 
        'age': [18, 17, np.nan, 17, 18], 
        'sex': ['f', 'm',np.nan,  'm', 'f'], 
        'midterm': [ 89,np.nan, np.nan, 87, 90],
        'Final': [91, np.nan,94, np.nan, 94]}
df = pd.DataFrame(dat, columns = ['first_name', 'last_name', 'age', 'sex', 'midterm', 'Final'])
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


In [15]:
# scikit learn impute only process with numeric value, the following solution can solve this problem with category from
# https://stackoverflow.com/questions/25239958/impute-categorical-missing-values-in-scikit-learn
import pandas as pd
import numpy as np

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value in column.
        Columns of other types are imputed with mean of column.
        """
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

#alternative, use data = [['a', 1, 2], []'b', 1, 1], ['b', 2, 2],[np.nan, np.nan, np.nan]]

df_pre = DataFrameImputer().fit_transform(df)
print('before...')
print(df)
print('after...')
print(df_pre)

before...
  first_name last_name   age sex  midterm  Final
0        Bob    George  18.0   f     89.0   91.0
1      Ellen   William  17.0   m     87.0   94.0
2       Anna    George  18.0   m     87.0   94.0
3       Anna      Bush  17.0   m     87.0   94.0
4       John     Conan  18.0   f     90.0   94.0
after...
  first_name last_name   age sex  midterm  Final
0        Bob    George  18.0   f     89.0   91.0
1      Ellen   William  17.0   m     87.0   94.0
2       Anna    George  18.0   m     87.0   94.0
3       Anna      Bush  17.0   m     87.0   94.0
4       John     Conan  18.0   f     90.0   94.0


### Imputing a single column with imputer from scikit learn

In [20]:
# using above df to fill missing value for final
import numpy as np
from sklearn.preprocessing import Imputer

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# Using most frequent value: imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
df['Final']=imp.fit_transform(df[['Final']])
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,93.0
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,93.0
4,John,Conan,18.0,f,90.0,94.0


## Example of Imputing with pipeline

In [4]:
# Example of Imputing with pipeline
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import SVC

steps =[('imputation',Imputer(missing_values='NaN',strategy='most_frequent',axis=0)),('SVM',SVC())]

### Fill missing value for all (numeric) columns 

In [4]:
# Preview dataset with missing values
import pandas as pd
df = pd.read_csv('/home/tri/Downloads/mammographic_masses.data', na_values=['?'], names=['BI_RADS','age','shape','margin','density','severity'])
df.head(2)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1


In [31]:
# strategy includes 'mean', 'median', 'most_frequent' value along axis (0: column, 1: rows)
fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=1)
imputed_df = pd.DataFrame(fill_NaN.fit_transform(df))
imputed_df.columns = df.columns
imputed_df.index = df.index
imputed_df.head(2)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1.0
1,4.0,43.0,1.0,1.0,10.0,1.0


In [19]:
# check missing value
df.isnull().sum()

BI_RADS      2
age          5
shape       31
margin      48
density     76
severity     0
dtype: int64

In [24]:
# total missing value over original data
(df[df.isnull().any(axis=1)].shape,df.shape)

((131, 6), (961, 6))

In [26]:
# Extract missing rows with missing values
missing_df= df[df.isnull().any(axis=1)]
missing_df.head(5)

Unnamed: 0,BI_RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0


In [3]:
import pandas as pd
import numpy as np
from fancyimpute import KNN

In [4]:
dat = {'first_name': ['Bob','Ellen', np.nan, 'Anna', 'John'], 
        'last_name': ['George', 'William', np.nan,'Bush',  'Conan'], 
        'age': [18, 17, np.nan, 17, 18], 
        'sex': ['f', 'm',np.nan,  'm', 'f'], 
        'midterm': [ 89,np.nan, np.nan, 87, 90],
        'Final': [91, np.nan,94, np.nan, 94]}
df = pd.DataFrame(dat, columns = ['first_name', 'last_name', 'age', 'sex', 'midterm', 'Final'])
df

Unnamed: 0,first_name,last_name,age,sex,midterm,Final
0,Bob,George,18.0,f,89.0,91.0
1,Ellen,William,17.0,m,,
2,,,,,,94.0
3,Anna,Bush,17.0,m,87.0,
4,John,Conan,18.0,f,90.0,94.0


In [5]:
df_pro=df.select_dtypes(include=[np.float]).as_matrix()
type(df_pro)

numpy.ndarray

In [11]:
# Filter numeric data from missing value data frame
df_numeric = df.select_dtypes(include=[np.float])


df_filled = pd.DataFrame(KNN(3).complete(df_numeric.as_matrix()))

Imputing row 1/5 with 0 missing, elapsed time: 0.001


In [12]:
df_filled.columns= df_numeric.columns
df_filled.index= df_numeric.index

In [13]:
df_filled.head(3)

Unnamed: 0,age,midterm,Final
0,18.0,89.0,91.0
1,17.0,87.000005,92.5
2,18.0,89.999999,94.0


SimpleFill: Replaces missing entries with the mean or median of each column.

•KNN: Nearest neighbor imputations which weights samples using the mean squared difference on features for which two rows both have observed data.

•SoftImpute: Matrix completion by iterative soft thresholding of SVD decompositions. Inspired by the softImpute package for R, which is based on Spectral Regularization Algorithms for Learning Large Incomplete Matrices by Mazumder et. al.

•IterativeSVD: Matrix completion by iterative low-rank SVD decomposition. Should be similar to SVDimpute from Missing value estimation methods for DNA microarrays by Troyanskaya et. al.

•MICE: Reimplementation of Multiple Imputation by Chained Equations.

•MatrixFactorization: Direct factorization of the incomplete matrix into low-rank U and V, with an L1 sparsity penalty on the elements of U and an L2 penalty on the elements of V. Solved by gradient descent.

•NuclearNormMinimization: Simple implementation of Exact Matrix Completion via Convex Optimization by Emmanuel Candes and Benjamin Recht using cvxpy. Too slow for large matrices.

•BiScaler: Iterative estimation of row/column means and standard deviations to get doubly normalized matrix. Not guaranteed to converge but works well in practice. Taken from Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares.

In [None]:
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute

# X is the complete data matrix
# X_incomplete has the same values as X except a subset have been replace with NaN

# Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=3).complete(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)

# print mean squared error for the three imputation methods above
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)

softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
print("SoftImpute MSE: %f" % softImpute_mse)

knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
print("knnImpute MSE: %f" % knn_mse)

In [12]:
# Modified method for series
import numpy
import pandas 
from sklearn.base import TransformerMixin

class SeriesImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  

        """
    def fit(self, X, y=None):
        if   X.dtype == numpy.dtype('O'): self.fill = X.value_counts().index[0]
        else                            : self.fill = X.mean()
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
# Make a series
s1 = pandas.Series(['k', 'i', 't', 't', 'e', numpy.NaN])

a  = SeriesImputer()   # Initialize the imputer
a.fit(s1)              # Fit the imputer
s2 = a.transform(s1)  
s2

0    k
1    i
2    t
3    t
4    e
5    t
dtype: object

## Fill missing values with time series

In [75]:
## Using ranfom forest model to predict missing values

In [11]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
scaler = StandardScaler()

In [12]:
all_features_scaled = scaler.fit_transform(all_features)
all_features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ..., 
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

In [13]:
np.random.seed(1234)
training_inputs, testing_inputs, training_classes,testing_classes= train_test_split(all_features_scaled, all_classes,train_size=.75, random_state=1)

# Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier
clf= DecisionTreeClassifier(random_state=1)
clf.fit(training_inputs, training_classes)
clf.score(testing_inputs,testing_classes)

0.73557692307692313

In [15]:
cv_scores =cross_val_score(clf, all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.73735569455522443

In [16]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, random_state=1)
cv_scores = cross_val_score(clf,all_features_scaled, all_classes, cv=10)
cv_scores.mean()

0.75404964806963037

In [17]:
from sklearn.svm import SVC
C=10
svc = SVC(kernel='linear',C=C)
cv_scores = cross_val_score(svc, all_features_scaled,all_classes,cv=10)
cv_scores.mean()

0.79770370681331593

# KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(clf, all_features_scaled,all_classes, cv=10)
cv_scores.mean()

0.78547954885745075

In [47]:
for n in range(1,50):
    clf = KNeighborsClassifier(n_neighbors =n)
    cv_scores =cross_val_score(clf,all_features_scaled, all_classes, cv=10)
    print(n,cv_scores.mean())

1 0.723912374236
2 0.688983809804
3 0.75410806991
4 0.730081300813
5 0.773546450611
6 0.762616318934
7 0.794059513315
8 0.774708240628
9 0.788020024348
10 0.785479548857
11 0.79153338091
12 0.779425716805
13 0.781908470117
14 0.791503995074
15 0.787874844325
16 0.779441109385
17 0.781807368848
18 0.775681121699
19 0.780514741894
20 0.782866658271
21 0.785392790675
22 0.78173425409
23 0.780558820648
24 0.780587506822
25 0.787817122147
26 0.786626995788
27 0.785436519598
28 0.790227110533
29 0.786597959783
30 0.787831465234
31 0.791417236892
32 0.787831465234
33 0.786597609952
34 0.786611953039
35 0.786626296125
36 0.785435819935
37 0.786684368135
38 0.78665533213
39 0.787889187412
40 0.785479199026
41 0.785464506108
42 0.781850048277
43 0.78306921064
44 0.783054867554
45 0.783054867554
46 0.785464855939
47 0.786684368135
48 0.789065320516
49 0.790299525629


# Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB
scaler =MinMaxScaler()
all_features_minmax = scaler.fit_transform(all_features)
clf = MultinomialNB()
cv_scores = cross_val_score(clf,all_features_minmax,all_classes, cv=10)
cv_scores.mean()

0.78440556651693882

# Approach 2: Impute missing data

In [20]:
masses_data = pd.read_csv('mammographic_masses.data', na_values=['?'], names=['BI_RADS','age','shape','margin','density','severity'])
masses_data.describe()



Unnamed: 0,BI_RADS,age,shape,margin,density,severity
count,959.0,956.0,930.0,913.0,885.0,961.0
mean,4.348279,55.487448,2.721505,2.796276,2.910734,0.463059
std,1.783031,14.480131,1.242792,1.566546,0.380444,0.498893
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,,,,,,0.0
50%,,,,,,0.0
75%,,,,,,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


In [34]:
masses_data.isnull().sum()

BI_RADS      2
age          5
shape       31
margin      48
density     76
severity     0
dtype: int64

In [32]:
a[np.where(a<7)]

array([5, 6])

# Neural network

In [1]:
from keras.layers import Dense
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
def create_model():
    model =Sequential()
    model.add(Dense(6,input_dim=4, kernel_initializer='normal',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal',activation='sigmoid'))
    model.compile(loss='binary_crossentropy',optimizer='rmsprop', metrics=['accuracy'])
    return model

In [58]:
from keras.wrappers.scikit_learn import KerasClassifier
estimator = KerasClassifier(build_fn=create_model,nb_epoch=100,verbose=0)
cv_scores = cross_val_score(estimator,all_features_scaled, all_classes, cv=10)
cv_scores.mean()

0.79879518086651713