Skip to content

Commit

Permalink
More unit tests
Browse files Browse the repository at this point in the history
For #5
  • Loading branch information
rhiever committed Mar 6, 2016
1 parent 92797a8 commit bdd4858
Showing 1 changed file with 43 additions and 0 deletions.
43 changes: 43 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from datacleaner import autoclean, autoclean_cv
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

np.random.seed(300)

Expand Down Expand Up @@ -31,6 +32,7 @@ def test_autoclean_cv_already_clean_data():
assert cleaned_testing_data.equals(testing_data)

def test_autoclean_with_nans_all_numerical():
"""Test autoclean() with a data set that has all numerical values and some NaNs"""
data = pd.DataFrame({'A': np.random.rand(1000),
'B': np.random.rand(1000),
'C': np.random.randint(0, 3, 1000)})
Expand All @@ -47,6 +49,7 @@ def test_autoclean_with_nans_all_numerical():
assert cleaned_data.equals(hand_cleaned_data)

def test_autoclean_cv_with_nans_all_numerical():
"""Test autoclean_cv() with a data set that has all numerical values and some NaNs"""
data = pd.DataFrame({'A': np.random.rand(1000),
'B': np.random.rand(1000),
'C': np.random.randint(0, 3, 1000)})
Expand All @@ -73,3 +76,43 @@ def test_autoclean_cv_with_nans_all_numerical():

assert cleaned_training_data.equals(hand_cleaned_training_data)
assert cleaned_testing_data.equals(hand_cleaned_testing_data)

def test_autoclean_no_nans_with_strings():
    """Check autoclean() on NaN-free data with one string-valued categorical column.

    The expected frame is a copy of the input whose 'C' column has been
    run through a freshly fitted LabelEncoder; autoclean() must return a
    frame equal to it.
    """
    # Draw order (A, then B, then C) is kept so the seeded values match.
    data = pd.DataFrame({
        'A': np.random.rand(1000),
        'B': np.random.rand(1000),
        'C': np.random.randint(0, 3, 1000),
    })

    # Replace the integer codes in 'C' with their string labels.
    label_for_code = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].map(label_for_code)

    # Build the reference result before autoclean() touches the input.
    expected = data.copy()
    expected['C'] = LabelEncoder().fit_transform(expected['C'].values)

    assert autoclean(data).equals(expected)

def test_autoclean_cv_no_nans_with_strings():
    """Test autoclean_cv() with a data set that has some string-encoded categorical values and no NaNs

    The expected frames are built by fitting a LabelEncoder on the training
    split's 'C' column and applying that same encoder to the testing split.
    Both frames returned by autoclean_cv() must equal those expectations.
    """
    data = pd.DataFrame({'A': np.random.rand(1000),
                         'B': np.random.rand(1000),
                         'C': np.random.randint(0, 3, 1000)})

    string_map = {0: 'oranges', 1: 'apples', 2: 'bananas'}
    data['C'] = data['C'].apply(lambda x: string_map[x])

    training_data = data[:500].copy()
    testing_data = data[500:].copy()

    # Take the reference copies before calling autoclean_cv() so the
    # expectations are derived from the raw string data even if the
    # function were to modify the frames it is given.
    hand_cleaned_training_data = training_data.copy()
    hand_cleaned_testing_data = testing_data.copy()

    cleaned_training_data, cleaned_testing_data = autoclean_cv(training_data, testing_data)

    # Fit on the training split only, then reuse the encoder for testing.
    encoder = LabelEncoder()
    hand_cleaned_training_data['C'] = encoder.fit_transform(hand_cleaned_training_data['C'].values)
    hand_cleaned_testing_data['C'] = encoder.transform(hand_cleaned_testing_data['C'].values)

    # Compare against the hand-encoded expectations, not the raw inputs.
    assert cleaned_training_data.equals(hand_cleaned_training_data)
    assert cleaned_testing_data.equals(hand_cleaned_testing_data)

0 comments on commit bdd4858

Please sign in to comment.