In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Load the House Prices dataset 
# Replace with your actual dataset loading mechanism
house_prices_data = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')


In [3]:
# Define features and target
X = house_prices_data.drop('SalePrice', axis=1)  # Features
y = house_prices_data['SalePrice']  # Target


In [4]:
#Descriptive Statistics
house_prices_data.info()
house_prices_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Numerical preprocessing steps (impute missing values and scale)
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', StandardScaler())                 # Scale numerical features
])

# Categorical preprocessing steps (impute missing values and one-hot encode)
categorical_features = X.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))     # One-hot encode categorical features
])


In [6]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [7]:
#Outlier Detection

from scipy import stats
import numpy as np

def detect_outliers_zscore(X, threshold=3):
    z_scores = stats.zscore(X)
    abs_z_scores = np.abs(z_scores)
    outliers = (abs_z_scores > threshold).any(axis=1)
    return X[~outliers]



In [8]:
print(numeric_features)
X[numeric_features].info()
X[numeric_features].head()

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,548,0,61,0,0,0,0,0,2,2008
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,460,298,0,0,0,0,0,0,5,2007
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,608,0,42,0,0,0,0,0,9,2008
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,642,0,35,272,0,0,0,0,2,2006
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,836,192,84,0,0,0,0,0,12,2008


In [9]:
Z = detect_outliers_zscore(X[numeric_features])  # Detect outliers, stored under new variable Z
print(X[numeric_features].describe())
Z.describe()

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1045.0,1045.0,869.0,1045.0,1045.0,1045.0,1045.0,1045.0,1038.0,1045.0,...,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0
mean,735.191388,54.009569,68.378596,9271.703349,6.114833,5.511005,1975.629665,1986.323445,94.333333,419.678469,...,469.836364,88.625837,42.570335,14.683254,0.02201,5.989474,0.0,14.357895,6.273684,2007.810526
std,423.335231,40.174427,23.418415,3806.795993,1.344144,0.977292,28.957322,20.699239,158.841845,410.041466,...,200.599091,107.497878,53.59839,42.959263,0.711491,28.930437,0.0,99.998995,2.674508,1.344239
min,1.0,20.0,21.0,1300.0,2.0,3.0,1885.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,377.0,20.0,57.0,7226.0,5.0,5.0,1957.0,1969.0,0.0,0.0,...,336.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,738.0,50.0,68.0,9120.0,6.0,5.0,1978.0,1996.0,0.0,390.0,...,474.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,1107.0,60.0,80.0,11143.0,7.0,6.0,2003.0,2004.0,155.5,697.0,...,576.0,168.0,64.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,1457.0,180.0,313.0,31770.0,10.0,8.0,2009.0,2010.0,1129.0,1696.0,...,1069.0,468.0,244.0,205.0,23.0,182.0,0.0,1300.0,12.0,2010.0


In [37]:
#Multicollinearitydetection #removal
def Multicollinearity1(X, threshold = 0.3):
    correlation_matrix_ = np.corrcoef(X, rowvar=False)
    correlated_pairs = find_correlated_features(correlation_matrix_, threshold)
    selected_features = list(range(X.shape[1]))
        
    for i, j, _ in correlated_pairs:
    # Remove feature j (keeping feature i) from the selected features
        selected_features.remove(j)
    print(correlated_pairs)
    return X[:, selected_features]
        
    
    
    

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

# Full pipeline including outlier detection, multi-collinearity removal, and cleaning data
#How to input a class visa vis a function
#Have not created an individual pipeline for outlier detection and multicollinearity removal

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
    ('outlier_detector', FunctionTransformer(func=detect_outliers_zscore)),
    ('multicollinearity_remover', FunctionTransformer(func=lambda X: Multicollinearity1(X, numeric_features)))
  ])


In [46]:
# Assuming X and y are your features and target variables
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

AxisError: axis 0 is out of bounds for array of dimension 0

In [40]:
import joblib

# Save preprocessing pipeline to a file
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')


['preprocessor_pipeline.pkl']

In [None]:
# Later, you can load the pipeline from the file
loaded_pipeline = joblib.load('preprocessor_pipeline.pkl')

# Use the loaded pipeline to preprocess new datasets
# new_data = pd.read_csv('new_data.csv')  # Load your new dataset
# new_data_preprocessed = loaded_pipeline.transform(new_data)