hi

# Read the Data

In [147]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.pipeline import make_pipeline
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

In [2]:
housing = pd.read_csv('https://raw.githubusercontent.com/sherwan-m/predicting_housing_prices/main/Data/housing-classification-iter3.csv')

## Initial exploration

What columns exist on this data? What are their data types?

In [6]:
housing.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,Expensive,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
1,9600,80.0,1262,3,1,0,2,298,0,0,RL,Feedr,GasA,Pave,Y,CBlock
2,11250,68.0,920,3,1,0,2,0,0,0,RL,Norm,GasA,Pave,Y,PConc
3,9550,60.0,756,3,1,0,3,0,0,0,RL,Norm,GasA,Pave,Y,BrkTil
4,14260,84.0,1145,4,1,0,3,192,0,0,RL,Norm,GasA,Pave,Y,PConc


Do we have missing values on this dataset?

In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   LotArea       1460 non-null   int64  
 1   LotFrontage   1201 non-null   float64
 2   TotalBsmtSF   1460 non-null   int64  
 3   BedroomAbvGr  1460 non-null   int64  
 4   Fireplaces    1460 non-null   int64  
 5   PoolArea      1460 non-null   int64  
 6   GarageCars    1460 non-null   int64  
 7   WoodDeckSF    1460 non-null   int64  
 8   ScreenPorch   1460 non-null   int64  
 9   Expensive     1460 non-null   int64  
 10  MSZoning      1460 non-null   object 
 11  Condition1    1460 non-null   object 
 12  Heating       1460 non-null   object 
 13  Street        1460 non-null   object 
 14  CentralAir    1460 non-null   object 
 15  Foundation    1460 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory usage: 182.6+ KB


Do we have duplicated information?

In [7]:
housing.duplicated().sum()

11

Is there any column that helps us identify if a house is expensive or not?

In [None]:
housing.loc[lambda df_ :df_['Expensive'] ==1].head(50)

## Data Splitting (Train - Test)

Setting X and y

- **X**: columns that help us make a prediction.
- **y**: the column that we want to predict.

In [9]:
X = housing.copy()
y = X.pop('Expensive')

## Feature Selection: 
1. convert non_numerics columns to numeric
2. choose the helpful featuers

Scikit-Learn models cannot deal with categorical features

In [63]:
class NumericConvertor:
    data =0
    def __init__(self):
        super().__init__()
    def fit_transform(self, X, y=None):
        num_data_df= X.copy()
        object_columns = num_data_df.select_dtypes(include=object).columns
        if len(object_columns) == 0:
            return  num_data_df
        
        data_lenth = len(num_data_df[object_columns[0]])

        for object_column in object_columns:
            values = num_data_df[object_column].unique()
            if len(values) > 0.5 * data_lenth:
                num_data_df.drop(columns= object_column, inplace=True)
                continue
            num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
            # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
        return num_data_df
        return 
    def transform(self, X):
        num_data_df= X.copy()
        object_columns = num_data_df.select_dtypes(include=object).columns
        if len(object_columns) == 0:
            return  num_data_df
        
        data_lenth = len(num_data_df[object_columns[0]])

        for object_column in object_columns:
            values = num_data_df[object_column].unique()
            if len(values) > 0.5 * data_lenth:
                num_data_df.drop(columns= object_column, inplace=True)
                continue
            num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
            # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
        return num_data_df
    def fit(self, X, y=None):
        
        self.fit_transform(X)
        return self

In [54]:
#this fuction converts string value columns to numeric
#its a first version of it, i will improve and adjust it for this case

def convertor_to_numeric(data_df):
  num_data_df= data_df.copy()
  object_columns = num_data_df.select_dtypes(include=object).columns
  if len(object_columns) == 0:
    return  num_data_df
  
  data_lenth = len(num_data_df[object_columns[0]])

  for object_column in object_columns:
    values = num_data_df[object_column].unique()
    if len(values) > 0.5 * data_lenth:
      num_data_df.drop(columns= object_column, inplace=True)
      continue
    num_data_df[object_column] = pd.factorize(num_data_df[object_column])[0]
    # num_data_df[object_column] = num_data_df[object_column].apply(lambda x: pd.factorize(x)[0])
  return num_data_df

In [55]:
X_num = convertor_to_numeric(X)
X_num

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation
0,8450,65.0,856,3,0,0,2,0,0,0,0,0,0,0,0
1,9600,80.0,1262,3,1,0,2,298,0,0,1,0,0,0,1
2,11250,68.0,920,3,1,0,2,0,0,0,0,0,0,0,0
3,9550,60.0,756,3,1,0,3,0,0,0,0,0,0,0,2
4,14260,84.0,1145,4,1,0,3,192,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,62.0,953,3,1,0,2,0,0,0,0,0,0,0,0
1456,13175,85.0,1542,3,2,0,2,349,0,0,0,0,0,0,1
1457,9042,66.0,1152,4,2,0,1,0,0,0,0,0,0,0,5
1458,9717,68.0,1078,2,0,0,1,366,0,0,0,0,0,0,1


choose the helpful featuers

### Data splitting

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_num, 
                                                            y, 
                                                            test_size=0.2, 
                                                            random_state=123)

### Imputing missing values

(Fit on train, transform train & test)

In [17]:
imp_mean = IterativeImputer(random_state=0)

imp_mean.fit(X_train)

# transform train 
X_imputed_train = imp_mean.transform(X_train)
X_imputed_test = imp_mean.transform(X_test)

## Create our first models

Based on the previous exploration, you have found some column that have some relation to the price of a house. Now it's your turn to create a python function to classify if a house is going to be expensive (1) or not (0).

there are the simples models, i will try to make them better

In [30]:
def dummy_model_v0(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v0 =  X_train.copy()
      pred_pessimistic_train_v0['pred'] = 0
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v0['pred']
                                 )
      pred_pessimistic_test_v0 =  X_test.copy()
      pred_pessimistic_test_v0['pred'] = 0
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v0['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [31]:
def dummy_model_v1(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['BedroomAbvGr'] > 2,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['BedroomAbvGr'] > 2,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [39]:
def dummy_model_v2(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['GarageCars'] > 2.5,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['GarageCars'] > 2.5,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [33]:
def dummy_model_v3(X_train, X_test, y_train, y_test):
      pred_pessimistic_train_v1 =  X_train.copy()
      pred_pessimistic_train_v1['pred'] = 0
      pred_pessimistic_train_v1.loc[lambda df_ :df_['LotArea'] > 10000.0,'pred'] = 1
      train_accuracy = accuracy_score(y_true = y_train,
                                 y_pred = pred_pessimistic_train_v1['pred']
                                 )
      pred_pessimistic_test_v1 =  X_test.copy()
      pred_pessimistic_test_v1['pred'] = 0
      pred_pessimistic_test_v1.loc[lambda df_ :df_['LotArea'] > 10000.0,'pred'] = 1
      test_accuracy = accuracy_score(y_true = y_test,
                               y_pred = pred_pessimistic_test_v1['pred']
                               )
      
      return round(train_accuracy,2),round(test_accuracy,2)

In [40]:
dummy_model_v2(X_train, X_test, y_train, y_test)

(0.9, 0.9)

## Modelling: Decision Tree

### creating Model & fitting 

In [56]:
my_tree = DecisionTreeClassifier(min_samples_leaf=6)


my_tree.fit(X = X_imputed_train, 
            y = y_train)

### Apply your model

In [57]:
y_pred_tree_train = my_tree.predict(X_imputed_train)


array([1, 0, 1, ..., 1, 0, 0], dtype=int64)

## Check the performance of your model

In [58]:
accuracy_score(y_true = y_train,
               y_pred = y_pred_tree_train
              )

0.9452054794520548

In [129]:
y_pred_tree_test = my_tree.predict(X_imputed_test)

accuracy_score(y_true = y_test,
               y_pred = y_pred_tree_test
              )

0.8972602739726028

# Creating a Pipeline

In [164]:
## version0
# reading
data = pd.read_csv('https://raw.githubusercontent.com/JoanClaverol/housing_data/main/housing-classification-iter3.csv')

# X and y creation
X = data
y = X.pop("Expensive")

# feature selection: only numericals
X_num = convertor_to_numeric(X)

# data splitting
X_train0, X_test0, y_train0, y_test0 = train_test_split(X_num, y, test_size=0.2, random_state=123)

# initialize transformers &amp; model
imputer0 = IterativeImputer(random_state=0)
dtree0 = DecisionTreeClassifier()
 
# Create a pipeline


pipe0 = make_pipeline(imputer0,
                     dtree0)

In [165]:
## version1
# reading
data = pd.read_csv('https://raw.githubusercontent.com/JoanClaverol/housing_data/main/housing-classification-iter3.csv')

# X and y creation
X = data
y = X.pop("Expensive")

# feature selection: only numericals
feature_selector = NumericConvertor()

# data splitting
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=123)

# initialize transformers &amp; model
imputer1 = IterativeImputer(random_state=0)
dtree1 = DecisionTreeClassifier()
 
# Create a pipeline
pipe1 = make_pipeline(feature_selector,
                     imputer1,
                     dtree1)



In [188]:
pipe0.fit(X_train0, y_train0)
pipe0.score(X_test0, y_test0)
# pipe1.predict(X_test)

0.8938356164383562

In [187]:
pipe1.fit(X_train1, y_train1)
pipe1.score(X_test1, y_test1)
# pipe1.predict(X_test)

0.8904109589041096

## Cross Validation with the whole pipeline:


In [192]:
param_grid = {
    "iterativeimputer__initial_strategy":["mean", "median","most_frequent"],
    # "iterativeimputer__max_iter":range(10,20,5),
    "decisiontreeclassifier__max_depth": range(2, 14),
    "decisiontreeclassifier__min_samples_leaf": range(3, 10),
    "decisiontreeclassifier__criterion":["gini", "entropy"]
}

In [193]:
from sklearn.model_selection import GridSearchCV

search0 = GridSearchCV(pipe0,
                      param_grid,
                      cv=10,
                      verbose=1)
search1 = GridSearchCV(pipe1,
                      param_grid,
                      cv=10,
                      verbose=1)

In [194]:
search0.fit(X_train0, y_train0)
search1.fit(X_train1, y_train1)

Fitting 10 folds for each of 504 candidates, totalling 5040 fits


KeyboardInterrupt: 

In [None]:
search0.best_params_

In [None]:
search1.best_params_

In [None]:
search0.best_score_

In [None]:
search1.best_score_