In [None]:
pip install py-AutoClean



In [None]:
import pandas as pd
from AutoClean import AutoClean
import numpy as np
from scipy import stats

class DataCleaner:
    """Report common data-quality issues in a DataFrame, then clean it with AutoClean.

    Assigning to ``df`` (which ``__init__`` also does) prints the detected
    issues and then replaces the stored frame with AutoClean's output, so
    reading ``df`` afterwards returns the cleaned data.

    Attributes:
        cardinality_threshold: A text column with more distinct values than
            this is reported as high-cardinality.
        zscore_threshold: A numeric value whose absolute z-score exceeds this
            is treated as an outlier.
    """

    # Class-level defaults so the detection helpers still work on an instance
    # whose __init__ was bypassed.
    cardinality_threshold = 10
    zscore_threshold = 3

    def __init__(self, df, cardinality_threshold=10, zscore_threshold=3):
        """Store the thresholds, then report on and clean ``df``.

        Args:
            df: The raw pandas DataFrame.
            cardinality_threshold: Distinct-value limit for text columns.
            zscore_threshold: Absolute z-score limit for numeric values.
        """
        # Thresholds must be set first: assigning ``df`` below runs detection.
        self.cardinality_threshold = cardinality_threshold
        self.zscore_threshold = zscore_threshold
        self._df = None
        self.df = df  # Triggers the setter: issue report followed by cleaning.

    @property
    def df(self):
        """The current DataFrame (the cleaned frame once the setter has run)."""
        return self._df

    @df.setter
    def df(self, new_df):
        """Store ``new_df``, print the issues found in it, then clean it."""
        self._df = new_df
        self._detect_and_print_issues()
        self._clean_data()

    def _detect_and_print_issues(self):
        """Print every detected issue, or a confirmation that none were found."""
        issues = self._detect_issues()
        if issues:
            print("Issues detected:")
            for issue in issues:
                print(" -", issue)
        else:
            print("No issues detected. The data is clean.")

    def _text_columns(self):
        """Return the labels of columns stored as object or pandas string dtype."""
        is_object = pd.api.types.is_object_dtype
        is_string = pd.api.types.is_string_dtype
        return [
            col
            for col, dtype in self.df.dtypes.items()
            if is_object(dtype) or is_string(dtype)
        ]

    def _has_outliers(self):
        """Return True if any numeric value's |z-score| exceeds ``zscore_threshold``."""
        # Columns with no values at all cannot contain outliers; dropping them
        # avoids "mean of empty slice" warnings.
        numeric = self.df.select_dtypes(include=[np.number]).dropna(axis=1, how="all")
        if numeric.empty:
            return False
        values = numeric.to_numpy(dtype=float, na_value=np.nan)
        # Constant columns have zero variance; their z-scores become NaN, and
        # NaN compares False against the threshold, so silence the warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            # nan_policy="omit": with the default ("propagate") a single missing
            # value turns the whole column's z-scores into NaN and hides outliers.
            z_scores = np.abs(stats.zscore(values, axis=0, nan_policy="omit"))
            return bool(np.any(z_scores > self.zscore_threshold))

    def _detect_issues(self):
        """Return a list of human-readable descriptions of detected issues.

        Checks, in order: missing values, duplicate rows, text columns whose
        values all parse as numbers, numeric outliers, and high-cardinality
        text columns.
        """
        frame = self.df
        issues = []

        # Check for missing values
        if frame.isnull().to_numpy().any():
            issues.append("Missing values detected.")

        # Check for duplicate rows
        if frame.duplicated().any():
            issues.append("Duplicate rows detected.")

        # Check for incorrect data types: a text column that converts cleanly
        # to numbers was most likely loaded with the wrong dtype.
        text_cols = self._text_columns()
        for col in text_cols:
            try:
                pd.to_numeric(frame[col])
            except (ValueError, TypeError):
                # ValueError: non-numeric text; TypeError: non-scalar objects.
                continue
            issues.append(f"Incorrect data type in column {col}.")

        # Check for outliers
        if self._has_outliers():
            issues.append("Outliers detected.")

        # Check for high cardinality in categorical variables
        for col in text_cols:
            if frame[col].nunique() > self.cardinality_threshold:
                issues.append(f"High cardinality in column {col}.")

        return issues

    def _clean_data(self):
        """Run AutoClean on the stored frame and keep its ``output`` as the new frame."""
        autocleaner = AutoClean(self._df)
        self._df = autocleaner.output


**With this class, any detected issues are printed automatically as soon as a `DataCleaner` object is created or its `df` attribute is reassigned. The data is then cleaned with AutoClean in either case; if no issues are found, a message stating that the data is clean is printed before cleaning runs.**

In [None]:
# Constructing a DataCleaner on the raw frame prints the detected issues and
# then runs the cleaning step; the cleaned result is kept on the instance.
df = pd.read_csv('/content/raw.csv')
cleaner = DataCleaner(df)


Issues detected:
 - Missing values detected.
 - Duplicate rows detected.
 - Outliers detected.
 - High cardinality in column Education.
 - High cardinality in column Occupation.
 - High cardinality in column Native-country.
AutoClean process completed in 36.343142 seconds
Logfile saved to: /content/autoclean.log


In [None]:
# The cleaned data is available through cleaner.df.
cleaner.df

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,...,Relationship_ not-in-family,Relationship_ other-relative,Relationship_ own-child,Relationship_ unmarried,Relationship_ wife,Sex_ female,Sex_ male,Income_<=50k,Income_>50k,Occupation_lab
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,0
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,...,0,0,0,0,0,0,1,1,0,3
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,5
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,...,0,0,0,0,0,0,1,1,0,5
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,...,0,0,0,0,1,1,0,1,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,private,257302,assoc-acdm,12,married-civ-spouse,tech-support,wife,white,female,...,0,0,0,0,1,1,0,1,0,12
32533,40,private,154374,hs-grad,9,married-civ-spouse,machine-op-inspct,husband,white,male,...,0,0,0,0,0,0,1,0,1,6
32534,58,private,151910,hs-grad,9,widowed,adm-clerical,unmarried,white,female,...,0,0,0,1,0,1,0,1,0,0
32535,22,private,201490,hs-grad,9,never-married,adm-clerical,own-child,white,male,...,0,0,1,0,0,0,1,1,0,0


In [None]:
# Summary statistics of the raw frame as loaded from the CSV.
df.describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [None]:
# Summary statistics of the cleaned frame held by the cleaner.
cleaner.df.describe()

Unnamed: 0,Age,Fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Race_ amer-indian-eskimo,Race_ asian-pac-islander,Race_ black,Race_ other,...,Relationship_ not-in-family,Relationship_ other-relative,Relationship_ own-child,Relationship_ unmarried,Relationship_ wife,Sex_ female,Sex_ male,Income_<=50k,Income_>50k,Occupation_lab
count,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,...,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0
mean,38.559855,186824.961736,10.106832,0.0,0.0,41.064911,0.009558,0.031902,0.095952,0.008329,...,0.254848,0.03015,0.155638,0.105879,0.048191,0.330762,0.669238,0.759074,0.240926,6.026001
std,13.554847,95118.115529,2.502781,0.0,0.0,6.212522,0.0973,0.175742,0.29453,0.090884,...,0.435783,0.171003,0.362518,0.307688,0.214174,0.470495,0.470495,0.427652,0.427652,3.918121
min,17.0,12285.0,4.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0
75%,48.0,236993.0,12.0,0.0,0.0,45.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,9.0
max,78.0,415742.0,16.0,0.0,0.0,52.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,13.0


In [None]:
pip install lux

In [None]:
import lux
from google.colab import output
# Colab-only: switch on the custom widget manager before displaying frames.
# NOTE(review): presumably required for the Lux widget to render in Colab — confirm.
output.enable_custom_widget_manager()

In [None]:
# Display the raw frame; with lux imported the output offers a Pandas/Lux toggle.
df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [16]:
# Display the cleaned frame held by the cleaner.
cleaner.df

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,...,Relationship_ not-in-family,Relationship_ other-relative,Relationship_ own-child,Relationship_ unmarried,Relationship_ wife,Sex_ female,Sex_ male,Income_<=50k,Income_>50k,Occupation_lab
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,0
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,...,0,0,0,0,0,0,1,1,0,3
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,5
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,...,0,0,0,0,0,0,1,1,0,5
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,...,0,0,0,0,1,1,0,1,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,27,private,257302,assoc-acdm,12,married-civ-spouse,tech-support,wife,white,female,...,0,0,0,0,1,1,0,1,0,12
32533,40,private,154374,hs-grad,9,married-civ-spouse,machine-op-inspct,husband,white,male,...,0,0,0,0,0,0,1,0,1,6
32534,58,private,151910,hs-grad,9,widowed,adm-clerical,unmarried,white,female,...,0,0,0,1,0,1,0,1,0,0
32535,22,private,201490,hs-grad,9,never-married,adm-clerical,own-child,white,male,...,0,0,1,0,0,0,1,1,0,0


In [17]:
# Preview only the first five rows of the cleaned frame.
cleaner.df.head()

Unnamed: 0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,...,Relationship_ not-in-family,Relationship_ other-relative,Relationship_ own-child,Relationship_ unmarried,Relationship_ wife,Sex_ female,Sex_ male,Income_<=50k,Income_>50k,Occupation_lab
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,0
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,...,0,0,0,0,0,0,1,1,0,3
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,...,1,0,0,0,0,0,1,1,0,5
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,...,0,0,0,0,0,0,1,1,0,5
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,...,0,0,0,0,1,1,0,1,0,9
