hi

# Read the Data

In [165]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.pipeline import make_pipeline
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder

In [2]:
housing = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/predicting_housing_prices/main/Data/housing-classification-iter3.csv')

## Initial exploration

What columns exist on this data? What are their data types?

In [6]:
housing.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
3,9550,60.0,756,3,1,0,3,0,0,0,RL,Norm,GasA,Pave,Y,BrkTil
4,14260,84.0,1145,4,1,0,3,192,0,0,RL,Norm,GasA,Pave,Y,PConc


Do we have missing values on this dataset?

In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory usage: 182.6+ KB


Do we have duplicated information?

In [7]:
housing.duplicated().sum()

11

Is there any column that helps us identify if a house is expensive or not?

In [None]:
housing.loc[lambda df_ :df_['Expensive'] ==1].head(50)

## Data Splitting (Train - Test)

Setting X and y

- **X**: columns that help us make a prediction.
- **y**: the column that we want to predict.

In [9]:
X = housing.copy()
y = X.pop('Expensive')

## Feature Selection: 
1. convert non_numerics columns to numeric
2. choose the helpful featuers

Scikit-Learn models cannot deal with categorical features

In [63]:
class NumericConvertor:
    data =0
    def __init__(self):
        super().__init__()
    def fit_transform(self, X, y=None):
        num_data_df= X.copy()
        object_columns = num_data_df.select_dtypes(include=object).columns
        if len(object_columns) == 0:
            return  num_data_df
        
        data_lenth = len(num_data_df[object_columns[0]])

        for object_column in object_columns:
            values = num_data_df[object_column].unique()
            if len(values) > 0.5 * data_lenth:
                num_data_df.drop(columns= object_column, inplace=True)
                continue
            num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
            # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
        return num_data_df
        return 
    def transform(self, X):
        num_data_df= X.copy()
        object_columns = num_data_df.select_dtypes(include=object).columns
        if len(object_columns) == 0:
            return  num_data_df
        
        data_lenth = len(num_data_df[object_columns[0]])

        for object_column in object_columns:
            values = num_data_df[object_column].unique()
            if len(values) > 0.5 * data_lenth:
                num_data_df.drop(columns= object_column, inplace=True)
                continue
            num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
            # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
        return num_data_df
    def fit(self, X, y=None):
        
        self.fit_transform(X)
        return self

In [54]:
#this fuction converts string value columns to numeric
#its a first version of it, i will improve and adjust it for this case

def convertor_to_numeric(data_df):
  num_data_df= data_df.copy()
  object_columns = num_data_df.select_dtypes(include=object).columns
  if len(object_columns) == 0:
    return  num_data_df
  
  data_lenth = len(num_data_df[object_columns[0]])

  for object_column in object_columns:
    values = num_data_df[object_column].unique()
    if len(values) > 0.5 * data_lenth:
      num_data_df.drop(columns= object_column, inplace=True)
      continue
    num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
    # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
  return num_data_df

In [55]:
X_num = convertor_to_numeric(X)
X_num

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,0,0,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0,1,0,0,0,1
2,11250,68.0,920,3,1,0,2,0,0,0,0,0,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0,0,0,0,0,2
4,14260,84.0,1145,4,1,0,3,192,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,0,0,0,0,0
1456,13175,85.0,1542,3,2,0,2,349,0,0,0,0,0,0,1
1457,9042,66.0,1152,4,2,0,1,0,0,0,0,0,0,0,5
1458,9717,68.0,1078,2,0,0,1,366,0,0,0,0,0,0,1


choose the helpful featuers

### Data splitting

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_num, 
                                                            y, 
                                                            test_size=0.2, 
                                                            random_state=123)

### Imputing missing values

(Fit on train, transform train & test)

In [17]:
imp_mean = IterativeImputer(random_state=0)

imp_mean.fit(X_train)

# transform train 
X_imputed_train = imp_mean.transform(X_train)
X_imputed_test = imp_mean.transform(X_test)

## Create our first models

Based on the previous exploration, you have found some column that have some relation to the price of a house. Now it's your turn to create a python function to classify if a house is going to be expensive (1) or not (0).

there are the simples models, i will try to make them better

In [30]:
def dummy_model_v0(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v0 =  X_train.copy()
      pred_pessimistic_train_v0['pred'] = 0
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v0['pred']
                                 )
      pred_pessimistic_test_v0 =  X_test.copy()
      pred_pessimistic_test_v0['pred'] = 0
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v0['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [31]:
def dummy_model_v1(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['BedroomAbvGr'] > 2,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['BedroomAbvGr'] > 2,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [39]:
def dummy_model_v2(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['GarageCars'] > 2.5,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['GarageCars'] > 2.5,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [33]:
def dummy_model_v3(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['LotArea'] > 10000.0,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['LotArea'] > 10000.0,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [40]:
dummy_model_v2(X_train, X_test, y_train, y_test)

(0.9, 0.9)

## Modelling: Decision Tree

### creating Model & fitting 

In [56]:
my_tree = DecisionTreeClassifier(min_samples_leaf=6)


my_tree.fit(X = X_imputed_train, 
            y = y_train)

### Apply your model

In [57]:
y_pred_tree_train = my_tree.predict(X_imputed_train)


array([1, 0, 1, ..., 1, 0, 0], dtype=int64)

## Check the performance of your model

In [58]:
accuracy_score(y_true = y_train,
               y_pred = y_pred_tree_train
              )

0.9452054794520548

In [129]:
y_pred_tree_test = my_tree.predict(X_imputed_test)

accuracy_score(y_true = y_test,
               y_pred = y_pred_tree_test
              )

0.8972602739726028

# Creating a Pipeline

In [164]:
## version0
# reading
data = pd.read_csv('https://raw.githubusercontent.com/JoanClaverol/housing_data/main/housing-classification-iter3.csv')

# X and y creation
X = data
y = X.pop("Expensive")

# feature selection: only numericals
X_num = convertor_to_numeric(X)

# data splitting
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_num, y, test_size=0.2, random_state=123)

# initialize transformers &amp; model
imputer0 = IterativeImputer(random_state=0)
dtree0 = DecisionTreeClassifier()
 
# Create a pipeline


pipe0 = make_pipeline(imputer0,
                     dtree0)

In [165]:
## version1
# reading
data = pd.read_csv('https://raw.githubusercontent.com/JoanClaverol/housing_data/main/housing-classification-iter3.csv')

# X and y creation
X = data
y = X.pop("Expensive")

# feature selection: only numericals
feature_selector = NumericConvertor()

# data splitting
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=123)

# initialize transformers &amp; model
imputer1 = IterativeImputer(random_state=0)
dtree1 = DecisionTreeClassifier()
 
# Create a pipeline
pipe1 = make_pipeline(feature_selector,
                     imputer1,
                     dtree1)



In [188]:
pipe0.fit(X_train0, y_train0)
pipe0.score(X_test0, y_test0)
# pipe1.predict(X_test)

0.8938356164383562

In [187]:
pipe1.fit(X_train1, y_train1)
pipe1.score(X_test1, y_test1)
# pipe1.predict(X_test)

0.8904109589041096

## Cross Validation with the whole pipeline:


In [192]:
param_grid = {
    "iterativeimputer__initial_strategy":["mean", "median","most_frequent"],
    # "iterativeimputer__max_iter":range(10,20,5),
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

In [193]:
from sklearn.model_selection import GridSearchCV

search0 = GridSearchCV(pipe0,
                      param_grid,
                      cv=10,
                      verbose=1)
search1 = GridSearchCV(pipe1,
                      param_grid,
                      cv=10,
                      verbose=1)

In [None]:
search0.fit(X_train0, y_train0)
search1.fit(X_train1, y_train1)

In [None]:
search0.best_params_

In [None]:
search1.best_params_

In [None]:
search0.best_score_

In [None]:
search1.best_score_

## Categorical encoding - "MANUAL" approach (Without using Pipelines)

### Replacing NaNs
**Replacing NaNs in categorical features**

We were imputing the mean to NaN’s on our preprocessing pipeline for numerical features. There's a problem with categorical values: they don’t have a “mean”. Here, we will replace NaNs with a string that marks them: “N_A”. It is not an elegant solution, but it will allow us to move forward.

**Replacing NaNs in numerical features**

This is what we already did in previous notebooks: replacing numerical NaNs with the mean of their column.

In [78]:
housing = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/predicting_housing_prices/main/Data/housing-classification-iter3.csv')

# X and y creation
X = housing.copy()
# X.loc[lambda df_ : df_['Heating'] == 'Floor','Heating'] = 'GasA'
y = X.pop("Expensive")

# Feature Engineering

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [16]:
X_train_cat = X_train.select_dtypes(exclude="number")

# defining the imputer to use "N_A" as replacement value
cat_imputer = SimpleImputer(strategy="constant", 
                            fill_value="N_A")

# fitting the imputer
cat_imputer.fit(X_train_cat)

# transforming the data & keeping it as a DataFrame
X_train_cat_imputed = pd.DataFrame(cat_imputer.transform(X_train_cat), 
                             columns=X_train_cat.columns)


# Selecting numerical columns
X_train_num = X_train.select_dtypes(include="number")

# Imputing the mean
num_imputer = IterativeImputer(random_state=0, initial_strategy="mean")

# Fitting
num_imputer.fit(X_train_num)

# Transforming, keeping a DataFrame
X_train_num_imputed = pd.DataFrame(num_imputer.transform(X_train_num), 
                             columns=X_train_num.columns)

# Concatenating all columns
X_train_imputed = pd.concat([X_train_cat_imputed, X_train_num_imputed], axis=1)

X_train_imputed.head()

Unnamed: 0,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch
0,RL,Norm,GasA,Pave,Y,PConc,9900.0,90.0,1347.0,4.0,1.0,0.0,3.0,340.0,0.0
1,RL,Norm,GasA,Pave,Y,CBlock,14585.0,76.628342,1144.0,3.0,2.0,0.0,2.0,216.0,0.0
2,RL,PosN,GasA,Pave,Y,CBlock,12227.0,76.27762,1330.0,4.0,1.0,0.0,2.0,550.0,0.0
3,RL,Norm,GasA,Pave,N,CBlock,10778.0,72.0,1768.0,4.0,0.0,0.0,0.0,0.0,0.0
4,RL,Norm,GasA,Pave,Y,Wood,14115.0,85.0,796.0,1.0,0.0,0.0,2.0,40.0,0.0


### One Hot encoding
One Hot encoding means creating a new binary column for each category in every categorical column. Fortunately, a Scikit-Learn transformer takes care of everything.

In [13]:
# import
from sklearn.preprocessing import OneHotEncoder

# initialize
my_onehot = OneHotEncoder(drop="first")

# fit
my_onehot.fit(X_train_cat_imputed)

# transform
X_train_cat_imputed_onehot = my_onehot.transform(X_train_cat_imputed)

The result is a "sparse matrix": an object that Scikit-Learn creates when a matrix contains mostly zeros:

In [14]:
X_train_cat_imputed_onehot

<1168x24 sparse matrix of type '<class 'numpy.float64'>'
	with 6762 stored elements in Compressed Sparse Row format>

### Converting the sparse matrix into a DataFrame
To see what exactly is inside of this sparse matrix we can convert it to a pandas DataFrame:
Now, for exploration and learning purposes, we will rename the columns in this dataframe so that we know the origin of each binary column (the category and original column they come from).

Retrieving the column names for the "one-hot" columns
The fitted transformer contains this information, and the method get_feature_names_out allows us to recover the names of the columns.

In [15]:
colnames = my_onehot.get_feature_names_out(X_train_cat_imputed.columns)
X_train_cat_imputed_onehot_final =  pd.DataFrame.sparse.from_spmatrix(X_train_cat_imputed_onehot, columns= colnames)
X_train_cat_imputed_onehot_final

Unnamed: 0,MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,...,Heating_Grav,Heating_OthW,Heating_Wall,Street_Pave,CentralAir_Y,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1164,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1165,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1166,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


### Concatenating "one-hot" columns with numerical columns:

In [17]:
X_train_imputed_final = pd.concat([X_train_cat_imputed_onehot_final, X_train_num_imputed], axis=1)

##  Categorical encoding - "Automated" approach (Using Pipelines)

In the manual approach, to encode the categorical columns numericall, we have:

1. Selected the categorical columns.
2. Fitted a `OneHotEncoder` to them.
3. Transformed the categorical columns with the encoder.
4. Converted the sparse matrix into a dataframe.
5. Recovered the names of the columns.
6. Concatenated the one-hot columns with the numerical columns.

All these steps can be synthetised by using Scikit-Learn Pipelines and specifically something called `ColumnTransformer`, which allows us to apply different transformations to two or more groups of columns: in our case, categorical and numerical columns.

This process is also called creating "branches" in the pipeline. One branch for the categorical columns and another for the numerical columns. Each branch will contain as many transformers as we want. Then, the branches will meet again, and the transformed columns will be automatically concatenated. Let's see the process in action:

In [79]:
#Creating the "numeric pipe" and the "categoric pipe":

# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# create numerical pipeline, only with the IterativeImputer(random_state=0, initial_strategy="mean")
numeric_pipe = make_pipeline(
    IterativeImputer(random_state=0, initial_strategy="mean"))
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(drop="first")
)

#Using ColumnTransformer a pipeline with 2 branches (the preprocessor):

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_pipe", categoric_pipe, X_cat_columns),
    ]
)

## Creating the full_pipeline (preprocessor + Decision Tree)

In [80]:
full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())

In [81]:
full_pipeline.fit(X_train, y_train)

In [82]:
full_pipeline.predict(X_train)

array([1, 0, 1, ..., 1, 0, 0], dtype=int64)

In [83]:
accuracy_score(y_true = y_train,
               y_pred = full_pipeline.predict(X_train)
              )

1.0

In [85]:
accuracy_score(y_true = y_test,
               y_pred = full_pipeline.predict(X_test)
              )

0.8904109589041096

## use the new Pipeline with branches to train a DecisionTree with GridSearch cross validation.
We are basically asking to combine what you have learned in this notebook (categorical encoding & branches) with what you learned in the previous one (using GridSearchCV for a whole Pipeline).

In [86]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    "columntransformer__num_pipe__iterativeimputer__initial_strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12)
}

search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


Traceback (most recent call last):
  File "e:\Program Files\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "e:\Program Files\Python310\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "e:\Program Files\Python310\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "e:\Program Files\Python310\lib\site-packages\sklearn\compose\_column_transformer.py", line 763, in transform
    Xs = self._fit_transform(
  File "e:\Program Files\Python310\lib\site-packages\sklearn\compose\_column_transformer.py", line 621, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "e:\Program Files\Python310\lib\site-packages\joblib\parallel.py", line 1046, in __call__
    while self.dispatch_one_batch(iterator):
  File "e:\Program Files\Python310\lib\site-packages\joblib\parall

In [63]:
search.best_score_

0.9186786985070248

In [64]:
search.best_params_

{'columntransformer__num_pipe__iterativeimputer__initial_strategy': 'median',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 8}

In [None]:


#another version

# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns
min_frequency= int(len(X)*0.002)
max_categories = int(len(X)*0.995)
# create numerical pipeline, only with the SimpleImputer(strategy="mean")
numeric_pipe0 = make_pipeline(
    SimpleImputer(strategy="mean"))
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
categoric_pipe0 = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(min_frequency = min_frequency,handle_unknown='infrequent_if_exist', max_categories=max_categories, drop="first")
)

#Using ColumnTransformer a pipeline with 2 branches (the preprocessor):

preprocessor0 = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe0, X_num_columns),
        ("cat_pipe", categoric_pipe0, X_cat_columns),
    ]
)

full_pipeline0 = make_pipeline(preprocessor0, 
                              DecisionTreeClassifier())
full_pipeline0.fit(X_train, y_train)


param_grid0 = {    
    "columntransformer__num_pipe__simpleimputer__strategy":["mean", "median"],
    "columntransformer__cat_pipe__onehotencoder__handle_unknown":["infrequent_if_exist","ignore"],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12)
}

search0 = GridSearchCV(full_pipeline0,
                      param_grid0,
                      cv=5,
                      verbose=1)

search0.fit(X_train, y_train)

In [118]:
search0.best_score_

0.9186786985070248

In [124]:
search0.best_params_

{'columntransformer__cat_pipe__onehotencoder__handle_unknown': 'infrequent_if_exist',
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 9}

we have new Data, and i try to do all steps for new data. its similar, with a little changes in features

In [172]:
housing4 = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/predicting_housing_prices/main/Data/housing-classification-iter4.csv')

# X and y creation
X = housing4.copy()
# X.loc[lambda df_ : df_['Heating'] == 'Floor','Heating'] = 'GasA'
y = X.pop("Expensive")

# Feature Engineering

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [127]:
housing4

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,...,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,FireplaceQu
0,8450,65.0,856,3,0,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,No,GLQ,Gd,
1,9600,80.0,1262,3,1,0,2,298,0,0,...,Y,CBlock,TA,TA,Gd,TA,Gd,ALQ,TA,TA
2,11250,68.0,920,3,1,0,2,0,0,0,...,Y,PConc,Gd,TA,Gd,TA,Mn,GLQ,Gd,TA
3,9550,60.0,756,3,1,0,3,0,0,0,...,Y,BrkTil,TA,TA,TA,Gd,No,ALQ,Gd,Gd
4,14260,84.0,1145,4,1,0,3,192,0,0,...,Y,PConc,Gd,TA,Gd,TA,Av,GLQ,Gd,TA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,...,Y,PConc,TA,TA,Gd,TA,No,Unf,TA,TA
1456,13175,85.0,1542,3,2,0,2,349,0,0,...,Y,CBlock,TA,TA,Gd,TA,No,ALQ,TA,TA
1457,9042,66.0,1152,4,2,0,1,0,0,1,...,Y,Stone,Ex,Gd,TA,Gd,No,GLQ,Gd,Gd
1458,9717,68.0,1078,2,0,0,1,366,0,0,...,Y,CBlock,TA,TA,TA,TA,Mn,GLQ,Gd,


new dataset has some additional categorical features. These categories have a particularity: their classes can be meaningfully sorted —there is an order to them.
i try to apply ordinary encoder on them

In [182]:
import re
# select categorical and numerical column names
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns
X_cat_oneHot_columns = [X_cat_column for X_cat_column in X_cat_columns if not re.search("Cond$|Qu|^Bsmt", X_cat_column)]
X_cat_ordinal_columns = [X_cat_column for X_cat_column in X_cat_columns if re.search("Cond$|Qu|^Bsmt", X_cat_column)]

normal_scores = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
extra_scores = ['Ex', 'Gd', 'TA', 'Fa', 'Po' , 'NA']
BsmtExposure_scores = ['Gd', 'Av', 'Mn', 'No', 'NA']
BsmtFinType1_scores = ['GLQ', 'ALQ', 'BLQ', 'Rec', 'LwQ', 'Unf', 'NA']


min_frequency= int(len(X)*0.002)
max_categories = int(len(X)*0.995)

# create numerical pipeline, only with the IterativeImputer(random_state=0, initial_strategy="mean")
numeric_pipe = make_pipeline(
    IterativeImputer(random_state=0, initial_strategy="mean"))
 
 
 # create categorical pipeline, with the SimpleImputer(fill_value="N_A") and the OneHotEncoder
cat_onehot_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(min_frequency = min_frequency, max_categories=max_categories, handle_unknown='infrequent_if_exist', drop="first"),
)

cat_ordinal_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OrdinalEncoder(categories=[extra_scores, extra_scores, extra_scores, extra_scores, BsmtExposure_scores, BsmtFinType1_scores, extra_scores, extra_scores])
)
#Using ColumnTransformer a pipeline with 2 branches (the preprocessor):

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num_columns),
        ("cat_onehot_pipe", cat_onehot_pipe, X_cat_oneHot_columns),
        ("cat_ordinal_pipe", cat_ordinal_pipe, X_cat_ordinal_columns)
    ]
)

full_pipeline = make_pipeline(preprocessor, 
                              DecisionTreeClassifier())
full_pipeline.fit(X_train, y_train)


param_grid = {    
    "columntransformer__num_pipe__iterativeimputer__initial_strategy":["mean", "median"],
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 12)
}

search = GridSearchCV(full_pipeline0,
                      param_grid0,
                      cv=5,
                      verbose=1)

search.fit(X_train, y_train)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits




In [178]:
accuracy_score(y_true = y_train,
               y_pred = full_pipeline.predict(X_train)
              )

1.0

In [181]:
accuracy_score(y_true = y_test,
               y_pred = full_pipeline.predict(X_test)
              )

0.8972602739726028

In [183]:
search.best_score_

0.9186786985070248

In [184]:
search.best_params_

{'columntransformer__cat_pipe__onehotencoder__handle_unknown': 'infrequent_if_exist',
 'columntransformer__num_pipe__simpleimputer__strategy': 'mean',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__min_samples_leaf': 8}

# K-Nearest Neighbors

In [185]:
housing5 = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/predicting_housing_prices/main/Data/housing-classification-iter5.csv')

# X and y creation
X = housing4.copy()
# X.loc[lambda df_ : df_['Heating'] == 'Floor','Heating'] = 'GasA'
y = X.pop("Expensive")

# Feature Engineering

# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [187]:
housing5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 51 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual