In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR
# where IQR = Q3 - Q1 (Q1 and Q3 are the 25th and 75th percentiles)

In [17]:
# OOP (class-based) approach to handling outliers - transformation approach (capping method)
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Cap (clip) each column at IQR-based fences learned during ``fit``.

    For every column the fences are::

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR        # IQR = Q3 - Q1

    Values outside the fences are replaced by the nearest fence; values
    inside are left unchanged. Note that a column whose IQR is 0 has
    ``lower == upper``, so every value in it is capped to that single value.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    lower_bound_, upper_bound_ : ndarray of shape (n_features,)
        Fences in column order. They are applied by position, so ``fit`` and
        ``transform`` may receive any mix of DataFrames and arrays.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    lower_bound, upper_bound : pandas.Series
        The same fences indexed by the fitted column labels. Kept so code
        that read the original attribute names keeps working.
    """

    def __init__(self, factor = 1.5):
        self.factor = factor

    def fit(self, x, y = None):
        """Learn the per-column lower and upper fences from ``x``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data used to compute the quartiles.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self : OutlierClipper
        """
        x_df = pd.DataFrame(x)

        # One quantile call yields both quartiles (rows 0.25 and 0.75).
        quartiles = x_df.quantile([0.25, 0.75])
        q1 = quartiles.loc[0.25]
        q3 = quartiles.loc[0.75]
        iqr = q3 - q1

        # Legacy, label-indexed attributes (original public names).
        self.lower_bound = q1 - (self.factor * iqr)
        self.upper_bound = q3 + (self.factor * iqr)

        # Trailing-underscore attributes mark the estimator as fitted for
        # scikit-learn utilities; plain arrays make clipping positional.
        self.lower_bound_ = self.lower_bound.to_numpy()
        self.upper_bound_ = self.upper_bound.to_numpy()
        self.n_features_in_ = x_df.shape[1]
        return self

    def transform(self, x, y=None):
        """Clip ``x`` column-wise to the fences learned in ``fit``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data to cap; must have the same number of columns as the data
            passed to ``fit``.
        y : ignored

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            The capped values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If the number of columns differs from the fitted data.
        """
        # Local import: keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("lower_bound_", "upper_bound_"))

        x_df = pd.DataFrame(x)
        if x_df.shape[1] != self.n_features_in_:
            raise ValueError(
                f"x has {x_df.shape[1]} columns, but OutlierClipper was "
                f"fitted with {self.n_features_in_} columns"
            )

        # Array bounds are matched to columns by position, so differing
        # column labels between fit and transform cannot mis-align them.
        x_clipped = x_df.clip(lower=self.lower_bound_, upper=self.upper_bound_, axis=1)
        return x_clipped.to_numpy()

# Attrition dataset: read the CSV and separate the target from the predictors
file_path = "Attrition.csv"
data = pd.read_csv(file_path)

y = data['Attrition']
x = data.drop(columns='Attrition')

# 80/20 hold-out split, stratified on the target, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Columns are routed to a preprocessing branch by dtype
numerical_col = x.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
categorical_col = x.select_dtypes(include=['object']).columns

# Numeric branch: median imputation -> IQR capping -> standardisation
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_clipper', OutlierClipper()),
    ('feature_scaling', StandardScaler()),
])

# Categorical branch: most-frequent imputation -> one-hot encoding
# (handle_unknown='ignore': categories not seen in fit do not raise)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_col),
    ('cat', categorical_transformer, categorical_col),
])

# Full model: preprocessing followed by a random forest.
# NOTE: this pipeline has no resampling step. `ImbPipeline` is bound to
# sklearn.pipeline.Pipeline by this notebook's import cell, so the SMOTE
# variants below would need imblearn.pipeline.Pipeline before they can run.
pipeline1 = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Alternatives kept for reference (not executed):
# pipeline2 = ImbPipeline(steps=[('preprocessor', preprocessor), ('smote', SMOTE(random_state=42)),
#                                ('classifier', DecisionTreeClassifier(random_state=42))])
# pipeline3 = ImbPipeline(steps=[('preprocessor', preprocessor), ('smote', SMOTE(random_state=42)),
#                                ('classifier', LogisticRegression())])

# Fit on the training partition only
pipeline1.fit(x_train, y_train)

# Predict on the held-out partition
y_pred_test = pipeline1.predict(x_test)

# Report per-class metrics and overall accuracy
print(classification_report(y_test, y_pred_test))
print()
print("accuracy score :", accuracy_score(y_test, y_pred_test))


              precision    recall  f1-score   support

          No       0.85      0.98      0.91       247
         Yes       0.50      0.13      0.20        47

    accuracy                           0.84       294
   macro avg       0.68      0.55      0.56       294
weighted avg       0.80      0.84      0.80       294


accuracy score : 0.8401360544217688


In [None]:
SimpleImputer()
# Imputation strategies: mean, median, most_frequent (mode) and constant
# mean and median - numerical columns
# most_frequent (mode) - categorical columns
# constant - fill with a fixed value, e.g. fill_value=999999

In [19]:
numerical_transformer

In [21]:
categorical_transformer

In [23]:
preprocessor

In [25]:
pipeline1

In [27]:
# OOP (class-based) approach to handling outliers - transformation approach (capping method)
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Cap (clip) each column at IQR-based fences learned during ``fit``.

    For every column the fences are::

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR        # IQR = Q3 - Q1

    Values outside the fences are replaced by the nearest fence; values
    inside are left unchanged. Note that a column whose IQR is 0 has
    ``lower == upper``, so every value in it is capped to that single value.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    lower_bound_, upper_bound_ : ndarray of shape (n_features,)
        Fences in column order. They are applied by position, so ``fit`` and
        ``transform`` may receive any mix of DataFrames and arrays.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    lower_bound, upper_bound : pandas.Series
        The same fences indexed by the fitted column labels. Kept so code
        that read the original attribute names keeps working.
    """

    def __init__(self, factor = 1.5):
        self.factor = factor

    def fit(self, x, y = None):
        """Learn the per-column lower and upper fences from ``x``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data used to compute the quartiles.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self : OutlierClipper
        """
        x_df = pd.DataFrame(x)

        # One quantile call yields both quartiles (rows 0.25 and 0.75).
        quartiles = x_df.quantile([0.25, 0.75])
        q1 = quartiles.loc[0.25]
        q3 = quartiles.loc[0.75]
        iqr = q3 - q1

        # Legacy, label-indexed attributes (original public names).
        self.lower_bound = q1 - (self.factor * iqr)
        self.upper_bound = q3 + (self.factor * iqr)

        # Trailing-underscore attributes mark the estimator as fitted for
        # scikit-learn utilities; plain arrays make clipping positional.
        self.lower_bound_ = self.lower_bound.to_numpy()
        self.upper_bound_ = self.upper_bound.to_numpy()
        self.n_features_in_ = x_df.shape[1]
        return self

    def transform(self, x, y=None):
        """Clip ``x`` column-wise to the fences learned in ``fit``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data to cap; must have the same number of columns as the data
            passed to ``fit``.
        y : ignored

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            The capped values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If the number of columns differs from the fitted data.
        """
        # Local import: keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("lower_bound_", "upper_bound_"))

        x_df = pd.DataFrame(x)
        if x_df.shape[1] != self.n_features_in_:
            raise ValueError(
                f"x has {x_df.shape[1]} columns, but OutlierClipper was "
                f"fitted with {self.n_features_in_} columns"
            )

        # Array bounds are matched to columns by position, so differing
        # column labels between fit and transform cannot mis-align them.
        x_clipped = x_df.clip(lower=self.lower_bound_, upper=self.upper_bound_, axis=1)
        return x_clipped.to_numpy()

# Diabetes dataset: read the CSV and separate the target from the predictors
file_path = "diabetes.csv"
data = pd.read_csv(file_path)

y = data['Outcome']
x = data.drop(columns='Outcome')

# 80/20 hold-out split, stratified on the target, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Columns are routed to a preprocessing branch by dtype
numerical_col = x.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
categorical_col = x.select_dtypes(include=['object']).columns

# Numeric branch: median imputation -> IQR capping -> standardisation
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_clipper', OutlierClipper()),
    ('feature_scaling', StandardScaler()),
])

# Categorical branch: most-frequent imputation -> one-hot encoding
# (handle_unknown='ignore': categories not seen in fit do not raise)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_col),
    ('cat', categorical_transformer, categorical_col),
])

# Full model: preprocessing followed by a random forest.
# NOTE: this pipeline has no resampling step. `ImbPipeline` is bound to
# sklearn.pipeline.Pipeline by this notebook's import cell, so the SMOTE
# variants below would need imblearn.pipeline.Pipeline before they can run.
pipeline1 = ImbPipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Alternatives kept for reference (not executed):
# pipeline2 = ImbPipeline(steps=[('preprocessor', preprocessor), ('smote', SMOTE(random_state=42)),
#                                ('classifier', DecisionTreeClassifier(random_state=42))])
# pipeline3 = ImbPipeline(steps=[('preprocessor', preprocessor), ('smote', SMOTE(random_state=42)),
#                                ('classifier', LogisticRegression())])

# Fit on the training partition only
pipeline1.fit(x_train, y_train)

# Predict on the held-out partition
y_pred_test = pipeline1.predict(x_test)

# Report per-class metrics and overall accuracy
print(classification_report(y_test, y_pred_test))
print()
print("accuracy score :", accuracy_score(y_test, y_pred_test))


              precision    recall  f1-score   support

           0       0.78      0.85      0.81       100
           1       0.67      0.56      0.61        54

    accuracy                           0.75       154
   macro avg       0.72      0.70      0.71       154
weighted avg       0.74      0.75      0.74       154


accuracy score : 0.7467532467532467


# Housing Problem

In [36]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

In [34]:
# OOP (class-based) approach to handling outliers - transformation approach (capping method)
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Cap (clip) each column at IQR-based fences learned during ``fit``.

    For every column the fences are::

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR        # IQR = Q3 - Q1

    Values outside the fences are replaced by the nearest fence; values
    inside are left unchanged. Note that a column whose IQR is 0 has
    ``lower == upper``, so every value in it is capped to that single value.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    lower_bound_, upper_bound_ : ndarray of shape (n_features,)
        Fences in column order. They are applied by position, so ``fit`` and
        ``transform`` may receive any mix of DataFrames and arrays.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    lower_bound, upper_bound : pandas.Series
        The same fences indexed by the fitted column labels. Kept so code
        that read the original attribute names keeps working.
    """

    def __init__(self, factor = 1.5):
        self.factor = factor

    def fit(self, x, y = None):
        """Learn the per-column lower and upper fences from ``x``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data used to compute the quartiles.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self : OutlierClipper
        """
        x_df = pd.DataFrame(x)

        # One quantile call yields both quartiles (rows 0.25 and 0.75).
        quartiles = x_df.quantile([0.25, 0.75])
        q1 = quartiles.loc[0.25]
        q3 = quartiles.loc[0.75]
        iqr = q3 - q1

        # Legacy, label-indexed attributes (original public names).
        self.lower_bound = q1 - (self.factor * iqr)
        self.upper_bound = q3 + (self.factor * iqr)

        # Trailing-underscore attributes mark the estimator as fitted for
        # scikit-learn utilities; plain arrays make clipping positional.
        self.lower_bound_ = self.lower_bound.to_numpy()
        self.upper_bound_ = self.upper_bound.to_numpy()
        self.n_features_in_ = x_df.shape[1]
        return self

    def transform(self, x, y=None):
        """Clip ``x`` column-wise to the fences learned in ``fit``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data to cap; must have the same number of columns as the data
            passed to ``fit``.
        y : ignored

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            The capped values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If the number of columns differs from the fitted data.
        """
        # Local import: keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("lower_bound_", "upper_bound_"))

        x_df = pd.DataFrame(x)
        if x_df.shape[1] != self.n_features_in_:
            raise ValueError(
                f"x has {x_df.shape[1]} columns, but OutlierClipper was "
                f"fitted with {self.n_features_in_} columns"
            )

        # Array bounds are matched to columns by position, so differing
        # column labels between fit and transform cannot mis-align them.
        x_clipped = x_df.clip(lower=self.lower_bound_, upper=self.upper_bound_, axis=1)
        return x_clipped.to_numpy()

# Housing dataset: read the CSV and separate the target (MEDV) from the predictors
file_path = "HousingData.csv"
data = pd.read_csv(file_path)

x = data.drop('MEDV', axis=1)
y = data['MEDV']

# 80/20 hold-out split with a fixed seed (continuous target, so no stratification)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Columns are routed to a preprocessing branch by dtype
numerical_col = x.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
categorical_col = x.select_dtypes(include=['object']).columns

# Numeric branch: median imputation -> IQR capping -> standardisation
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                        ('outlier_clipper', OutlierClipper()),
                                        ('feature_scaling', StandardScaler())])

# Categorical branch: most-frequent imputation -> one-hot encoding
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_col),
                                               ('cat', categorical_transformer, categorical_col)])

# Regression pipeline: preprocessing followed by ordinary least squares.
# No resampling is involved; `ImbPipeline` is this notebook's alias for
# sklearn.pipeline.Pipeline (see the import cell).
pipeline1 = ImbPipeline(steps=[('preprocessor', preprocessor),
                               ('regressor_problem', LinearRegression())])

# Model building
pipeline1.fit(x_train, y_train)

# Make predictions on the held-out partition
y_pred_test = pipeline1.predict(x_test)

# Evaluate the model: r2_score returns the coefficient of determination (R^2),
# not an accuracy, so the printed label names the metric actually computed.
print("r2 score :", r2_score(y_test, y_pred_test))


accuracy score : 0.681571925698981


In [38]:
# OOP (class-based) approach to handling outliers - transformation approach (capping method)
class OutlierClipper(BaseEstimator, TransformerMixin):
    """Cap (clip) each column at IQR-based fences learned during ``fit``.

    For every column the fences are::

        lower = Q1 - factor * IQR
        upper = Q3 + factor * IQR        # IQR = Q3 - Q1

    Values outside the fences are replaced by the nearest fence; values
    inside are left unchanged. Note that a column whose IQR is 0 has
    ``lower == upper``, so every value in it is capped to that single value.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the IQR when computing the fences.

    Attributes
    ----------
    lower_bound_, upper_bound_ : ndarray of shape (n_features,)
        Fences in column order. They are applied by position, so ``fit`` and
        ``transform`` may receive any mix of DataFrames and arrays.
    n_features_in_ : int
        Number of columns seen during ``fit``.
    lower_bound, upper_bound : pandas.Series
        The same fences indexed by the fitted column labels. Kept so code
        that read the original attribute names keeps working.
    """

    def __init__(self, factor = 1.5):
        self.factor = factor

    def fit(self, x, y = None):
        """Learn the per-column lower and upper fences from ``x``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data used to compute the quartiles.
        y : ignored
            Present only for pipeline compatibility.

        Returns
        -------
        self : OutlierClipper
        """
        x_df = pd.DataFrame(x)

        # One quantile call yields both quartiles (rows 0.25 and 0.75).
        quartiles = x_df.quantile([0.25, 0.75])
        q1 = quartiles.loc[0.25]
        q3 = quartiles.loc[0.75]
        iqr = q3 - q1

        # Legacy, label-indexed attributes (original public names).
        self.lower_bound = q1 - (self.factor * iqr)
        self.upper_bound = q3 + (self.factor * iqr)

        # Trailing-underscore attributes mark the estimator as fitted for
        # scikit-learn utilities; plain arrays make clipping positional.
        self.lower_bound_ = self.lower_bound.to_numpy()
        self.upper_bound_ = self.upper_bound.to_numpy()
        self.n_features_in_ = x_df.shape[1]
        return self

    def transform(self, x, y=None):
        """Clip ``x`` column-wise to the fences learned in ``fit``.

        Parameters
        ----------
        x : array-like or DataFrame of shape (n_samples, n_features)
            Data to cap; must have the same number of columns as the data
            passed to ``fit``.
        y : ignored

        Returns
        -------
        ndarray of shape (n_samples, n_features)
            The capped values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        ValueError
            If the number of columns differs from the fitted data.
        """
        # Local import: keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ("lower_bound_", "upper_bound_"))

        x_df = pd.DataFrame(x)
        if x_df.shape[1] != self.n_features_in_:
            raise ValueError(
                f"x has {x_df.shape[1]} columns, but OutlierClipper was "
                f"fitted with {self.n_features_in_} columns"
            )

        # Array bounds are matched to columns by position, so differing
        # column labels between fit and transform cannot mis-align them.
        x_clipped = x_df.clip(lower=self.lower_bound_, upper=self.upper_bound_, axis=1)
        return x_clipped.to_numpy()

# Housing dataset: read the CSV and separate the target (MEDV) from the predictors
file_path = "HousingData.csv"
data = pd.read_csv(file_path)

x = data.drop('MEDV', axis=1)
y = data['MEDV']

# 80/20 hold-out split with a fixed seed (continuous target, so no stratification)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Columns are routed to a preprocessing branch by dtype
numerical_col = x.select_dtypes(include=['int64', 'int32', 'float32', 'float64']).columns
categorical_col = x.select_dtypes(include=['object']).columns

# Numeric branch: median imputation -> IQR capping -> standardisation
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                                        ('outlier_clipper', OutlierClipper()),
                                        ('feature_scaling', StandardScaler())])

# Categorical branch: most-frequent imputation -> one-hot encoding
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                                          ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_col),
                                               ('cat', categorical_transformer, categorical_col)])

# Regression pipeline: preprocessing followed by a random forest regressor.
# random_state is fixed (42, as elsewhere in this notebook) so the forest,
# and therefore the reported score, is reproducible across runs.
# No resampling is involved; `ImbPipeline` is this notebook's alias for
# sklearn.pipeline.Pipeline (see the import cell).
pipeline1 = ImbPipeline(steps=[('preprocessor', preprocessor),
                               ('regressor_problem', RandomForestRegressor(random_state=42))])

# Model building
pipeline1.fit(x_train, y_train)

# Make predictions on the held-out partition
y_pred_test = pipeline1.predict(x_test)

# Evaluate the model: r2_score returns the coefficient of determination (R^2),
# not an accuracy, so the printed label names the metric actually computed.
print("r2 score :", r2_score(y_test, y_pred_test))


accuracy score : 0.8763447538226801
