In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from preprocessing.utils import clean_data,outlier_treatment,save_object
from sklearn.impute import SimpleImputer
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from src.exception import CustomException
from src.logger import logging
import sys

In [15]:
data = pd.read_csv(r"C:\Users\Pranav\Desktop\Data Science\MLOps\Data\water_potability.csv", encoding = 'unicode_escape')

In [16]:
data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.8904555,20791.31898,7.300211873,368.516441,564.3086542,10.37978308,86.99097046,2.963135381,0
1,3.716080075,129.4229205,18630.05786,6.635245884,,592.8853591,15.18001312,56.32907628,4.500656275,0
2,8.099124189,224.2362594,19909.54173,9.275883603,,418.6062131,16.86863693,66.42009251,3.05593375,0
3,8.316765884,214.3733941,22018.41744,8.059332377,356.886136,363.2665162,18.4365245,100.3416744,4.628770537,0
4,9.092223456,181.1015092,17978.98634,6.546599974,310.135738,398.4108134,11.55827944,31.99799273,4.075075425,0


In [17]:
data.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   object 
 1   Hardness         3276 non-null   object 
 2   Solids           3276 non-null   object 
 3   Chloramines      3276 non-null   object 
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   object 
 6   Organic_carbon   3276 non-null   object 
 7   Trihalomethanes  3114 non-null   object 
 8   Turbidity        3276 non-null   object 
 9   Potability       3276 non-null   int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 256.1+ KB


In [19]:
def get_non_numeric(series):
    non_num_elems = []
    for elem in series:
        try:
            float(elem)
        except:
            non_num_elems.append(elem)
    return non_num_elems
            

In [20]:
for i in data.columns:
    non_num = get_non_numeric(data[i])
    print('*******************************')
    print('In column {} the non numeric elements are'.format(i) )
    print(non_num)
    print('*******************************')

*******************************
In column ph the non numeric elements are
['7.-.160467231']
*******************************
*******************************
In column Hardness the non numeric elements are
['214.496610%457156', "20''9.609618"]
*******************************
*******************************
In column Solids the non numeric elements are
['18767.656--18134', '172.>66.59342', '2(9368.6741)8', ' ? 29,477.76 ', ' ? 24,967.32 ', ' ? 26,631.21 ', ' ? 24,539.71 ', ' ? 20,864.34 ', ' ? 16,879.52 ', ' ? 16,488.02 ', ' ? 11,351.86 ']
*******************************
*******************************
In column Chloramines the non numeric elements are
['2.39798!499', '2.10269!09!91']
*******************************
*******************************
In column Sulfate the non numeric elements are
[]
*******************************
*******************************
In column Conductivity the non numeric elements are
['52,,,3.6712975', '323.794---624']
*******************************
***********

In [21]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   object 
 1   Hardness         3276 non-null   object 
 2   Solids           3276 non-null   object 
 3   Chloramines      3276 non-null   object 
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   object 
 6   Organic_carbon   3276 non-null   object 
 7   Trihalomethanes  3114 non-null   object 
 8   Turbidity        3276 non-null   object 
 9   Potability       3276 non-null   int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 256.1+ KB


In [22]:

def clean_out(df:pd.DataFrame):

        df['Solids']  = df['Solids'].apply(clean_data)
        df['Organic_carbon'] = df['Organic_carbon'].apply(clean_data)

        for i in df.columns:
            df[i] = pd.to_numeric(df[i],errors= 'coerce')

        

        return df

data = clean_out(df= data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2784 non-null   float64
 1   Hardness         3274 non-null   float64
 2   Solids           3275 non-null   float64
 3   Chloramines      3274 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3274 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3113 non-null   float64
 8   Turbidity        3274 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [23]:
def clean_data(text):

    text = re.sub(r"[' ',()-->?¥]","",text)
    return text


data['Solids']  = data['Solids'].apply(clean_data)
data['Organic_carbon'] = data['Organic_carbon'].apply(clean_data)

for i in data.columns:
    data[i] = pd.to_numeric(data[i],errors= 'coerce')



TypeError: expected string or bytes-like object, got 'float'

In [24]:
features = data.drop('Potability',axis = 1)
target   = data['Potability'] 

In [25]:
def outlier_treatment(series):
    q1 = np.percentile(features[series], 25)
    q3 = np.percentile(features[series], 75)
    iqr = q3 - q1
    
    for idx in range(len(features[series])):
        elem = features.at[idx, series]
        if elem > q3 + iqr * 1.5:
            features.at[idx, series] = q3 + iqr * 1.5
        elif elem < q1 - iqr * 1.5:
            features.at[idx, series] = q1 - iqr * 1.5
    
    return features[series]


for i in features:
    data[i] = outlier_treatment(i)

In [26]:
columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity']

pipeline = Pipeline(steps=[
                ('Impute',SimpleImputer(strategy='median')),
                ('Scaling',MinMaxScaler())

            ])

prepocessor = ColumnTransformer([('Impute',SimpleImputer(strategy='median'),features.columns),
                                 ('Scaling',MinMaxScaler(),features.columns)
                                 ])


In [27]:
type(prepocessor)

sklearn.compose._column_transformer.ColumnTransformer

In [None]:

features_array = prepocessor.fit_transform(features)

In [None]:
target.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3276 entries, 0 to 3275
Series name: Potability
Non-Null Count  Dtype
--------------  -----
3276 non-null   int64
dtypes: int64(1)
memory usage: 25.7 KB


In [None]:
data.columns

Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')

In [None]:
np.array(11)

array(11)

In [33]:
def get_preprocessor():

        

        try:
            columns = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability']

            pipeline = Pipeline(steps=[
                
                ('Impute',SimpleImputer(strategy='median')),
                ('Scaling',MinMaxScaler())

            ])

            prepocessor = ColumnTransformer([
                'pipeline', pipeline ,columns
                                 ])

            logging.info('Created Preprocessor object')

            return prepocessor

            
        except Exception as e:
            raise CustomException(e,sys)


In [34]:
pre_obj = get_preprocessor()
type(pre_obj) 

sklearn.compose._column_transformer.ColumnTransformer