<a href="https://colab.research.google.com/github/starkjones/Neural-Networks/blob/main/Neural_Network_Kaggle_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Neural Network Kaggle Exercise**
Jonathan Jones

22.06.09

In [202]:
# Libraries: 

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping


In [203]:
# Mount google drive:

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [204]:
# Load training data:

data = '/content/drive/MyDrive/SI/Kaggle/train.csv'

df = pd.read_csv(data)

df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [205]:
# Load test data:

test_data = '/content/drive/MyDrive/SI/Kaggle/test.csv'

df_test = pd.read_csv(test_data)

df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [206]:
# convert column names to lower case:

df.columns = df.columns.str.lower()

In [207]:
# Duplicated rows:

df.duplicated().sum()

0

In [208]:
# Datatypes and dictionary conformity: 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   alley          91 non-null     object 
 7   lotshape       1460 non-null   object 
 8   landcontour    1460 non-null   object 
 9   utilities      1460 non-null   object 
 10  lotconfig      1460 non-null   object 
 11  landslope      1460 non-null   object 
 12  neighborhood   1460 non-null   object 
 13  condition1     1460 non-null   object 
 14  condition2     1460 non-null   object 
 15  bldgtype       1460 non-null   object 
 16  housestyle     1460 non-null   object 
 17  overallqual    1460 non-null   int64  
 18  overallc

In [209]:
# Removing columns with less than 50% of their data: 

df.dropna(axis = 1, thresh = 45, inplace = True)

# cat = make_column_selector(dtype_include= 'object')

# cat(df)

In [210]:
# Removal of ineffectual columns:

dropped_columns = ['alley', 'miscfeature', 'masvnrtype', 'masvnrarea', 
                   'fireplacequ', 'bsmtfintype1', 'bsmtfintype2', 'garagequal', 
                   'garagecond','fence', 'bsmtqual', 'bsmtcond']

df.drop(columns = dropped_columns, inplace = True)

In [211]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 68 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             1460 non-null   int64  
 1   mssubclass     1460 non-null   int64  
 2   mszoning       1460 non-null   object 
 3   lotfrontage    1201 non-null   float64
 4   lotarea        1460 non-null   int64  
 5   street         1460 non-null   object 
 6   lotshape       1460 non-null   object 
 7   landcontour    1460 non-null   object 
 8   utilities      1460 non-null   object 
 9   lotconfig      1460 non-null   object 
 10  landslope      1460 non-null   object 
 11  neighborhood   1460 non-null   object 
 12  condition1     1460 non-null   object 
 13  condition2     1460 non-null   object 
 14  bldgtype       1460 non-null   object 
 15  housestyle     1460 non-null   object 
 16  overallqual    1460 non-null   int64  
 17  overallcond    1460 non-null   int64  
 18  yearbuil

In [240]:
# Correlation map:

correlation = df.corr().round(2)
c_list = correlation.unstack().sort_values(ascending = False)

remove = c_list == 1

test_list = ~remove
test_val = []

for i in test_list:
  if i < 1 and i > .3:
    i.append(test_val)
    print(test_val)

## **Preprocessing**

In [214]:
# Set target:

X = df.drop(columns = 'saleprice')
y = df['saleprice']

# Train Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [215]:
from sklearn.pipeline import make_pipeline
from pandas.core.arrays import categorical
from sklearn.compose import make_column_selector, make_column_transformer

# Column selection / separation by data type:

cat = make_column_selector(dtype_include= 'object')

cat(X_train)

categorical_columns = ['mszoning',
 'street',
 'lotshape',
 'landcontour',
 'utilities',
 'lotconfig',
 'landslope',
 'neighborhood',
 'condition1',
 'condition2',
 'bldgtype',
 'housestyle',
 'roofstyle',
 'roofmatl',
 'exterior1st',
 'exterior2nd',
 'exterqual',
 'extercond',
 'foundation',
 'bsmtexposure',
 'heating',
 'heatingqc',
 'centralair',
 'electrical',
 'kitchenqual',
 'functional',
 'garagetype',
 'garagefinish',
 'paveddrive',
 'saletype',
 'salecondition']

numeric_columns = X_train.columns.drop(categorical_columns)

In [216]:
# Scaler:
scaler = StandardScaler()

# One Hot Encoder:
OHE = OneHotEncoder(sparse = False, handle_unknown= 'ignore')

In [217]:
# Instantiate imputers:

mean_imputer = SimpleImputer(strategy= 'mean')
frequent_imputer = SimpleImputer(strategy= 'most_frequent')

In [218]:
# Pipelines:

categorical_pipeline = make_pipeline(frequent_imputer, OHE)
numeric_pipleine = make_pipeline(mean_imputer, scaler)

In [219]:
# Preprocessing tuples:
categorical_tuple = (categorical_pipeline, categorical_columns)
numeric_tuple = (numeric_pipleine, numeric_columns)

# Column transformer:
preprocessor = make_column_transformer(numeric_tuple, categorical_tuple, remainder= 'passthrough')

# Data processing:

preprocessor.fit(X_train, y_train)

X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

## **Modeling**

In [234]:
from sklearn.ensemble import GradientBoostingClassifier

# Instantiate Gradient Boosting Model:

gbc = GradientBoostingClassifier()

# Fit model on training data:

gbc.fit(X_train_processed, y_train)

KeyboardInterrupt: ignored