<a href="https://colab.research.google.com/github/shyleenk/ml_labs/blob/main/ML_Data_Cleaning_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

111218 Shyleen Mwadeghu

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from google.colab import drive


# **Preparing the Dataset for Training**

In [None]:
# Mount Google Drive so the dataset can be read directly from the drive.
# NOTE: relies on google.colab, so this cell is expected to run inside Colab.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the dataset on the mounted drive
dataset_path='/content/drive/MyDrive/Machine Learning/Data/house-prices-advanced-regression-techniques/modified_data.csv'
# Load the CSV file into a DataFrame
data=pd.read_csv(dataset_path)

In [None]:
# Sanity check: preview the first two rows to confirm the dataset was imported successfully
data.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500


In [None]:
# Dataset shape as (number of rows, number of columns)
data.shape

(1460, 77)

In [None]:
# Dataset length, i.e. the number of rows
len(data)

1460

In [None]:
# The Id column isn't required in the computation, hence we drop it.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once Id is already gone no longer
# raises a KeyError.
data=data.drop(columns='Id',errors='ignore')

In [None]:
# Keep a list of the original column names, taken before any encoding happens
crude_features=data.columns.tolist()
crude_features

In [None]:
# Identify the columns (features) that contain missing values.
# isna() marks every missing (NA) cell with True.
# isna().any() collapses that per column: True when the column has at least one NA cell.
has_missing=data.isna().any()
features_missing_values=data.columns[has_missing].tolist()
features_missing_values

['LotFrontage',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

# **Filling in Missing Features**

In [None]:
# Two lists that will hold the features with missing values, grouped by data type:
#   bfill     -> features whose values are strings (object dtype)
#   mean_fill -> features whose values are floats
bfill=[]
mean_fill=[]

In [None]:
# Split features_missing_values into the two lists according to their data type.
# Both lists are rebuilt from scratch here so that re-running this cell does not
# append the same feature names a second time.
bfill,mean_fill=[],[]
for feature in features_missing_values:
  column=data[feature]
  # Display each feature's missing-value percentage and data type for debugging purposes
  print(feature,' ',column.isna().sum()/len(data)*100,'% ',column.dtype)
  # Any float dtype (not only float64) goes to mean_fill; everything else to bfill
  if pd.api.types.is_float_dtype(column):
    mean_fill.append(feature)
  else:
    bfill.append(feature)

LotFrontage   17.73972602739726 %  float64
MasVnrType   0.547945205479452 %  object
MasVnrArea   0.547945205479452 %  float64
BsmtQual   2.5342465753424657 %  object
BsmtCond   2.5342465753424657 %  object
BsmtExposure   2.6027397260273974 %  object
BsmtFinType1   2.5342465753424657 %  object
BsmtFinType2   2.6027397260273974 %  object
Electrical   0.0684931506849315 %  object
FireplaceQu   47.26027397260274 %  object
GarageType   5.5479452054794525 %  object
GarageYrBlt   5.5479452054794525 %  float64
GarageFinish   5.5479452054794525 %  object
GarageQual   5.5479452054794525 %  object
GarageCond   5.5479452054794525 %  object


In [None]:
# Float features that will be filled with their column mean
mean_fill

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [None]:
# String (object) features that will be filled by backward filling
bfill

['MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

In [None]:
# Populate the missing values of the float features with the column mean.
# mean() calculates the mean of the values of a specific feature.
# fillna() replaces the missing (NA) cells with the value passed to it.
# The filled column is assigned back to the frame: fillna(..., inplace=True) called on
# data[feature] operates on an intermediate object, which pandas warns about and which
# leaves `data` unchanged once copy-on-write is active.
for feature in mean_fill:
  data[feature]=data[feature].fillna(data[feature].mean())

In [None]:
# Populate the missing values of the object (string) features by backward filling:
# every missing cell takes the next valid value below it in the same column.
# bfill() replaces the deprecated fillna(method='bfill'), and the result is assigned
# back because an in-place fill on data[feature] does not reliably update `data`.
# Note that a backward fill cannot fill missing cells at the very end of a column.
for feature in bfill:
  data[feature]=data[feature].bfill()

In [None]:
# Recheck whether any column still contains missing values (an empty Index means none are left)
data.columns[data.isna().any()]

Index([], dtype='object')

In [None]:
# Count the cells that are still missing in the hand-picked feature FireplaceQu
data['FireplaceQu'].isna().sum()

0

In [None]:
# Populate any remaining missing values of FireplaceQu.
# This time a forward fill is used: every missing cell takes the last valid value above it.
# ffill() replaces the deprecated fillna(method='ffill'), and the result is assigned
# back because an in-place fill on data['FireplaceQu'] does not reliably update `data`.
data['FireplaceQu']=data['FireplaceQu'].ffill()

# ***Encoding the Dataset***

In [None]:
# Pick the categorical features from the dataset with select_dtypes().
# Categorical (string) columns are stored with the object dtype in pandas.
# The resulting column index is turned into a plain list for later computations.
object_columns=data.select_dtypes(include=['object']).columns
categorical_features=object_columns.tolist()
categorical_features

['MSZoning', 'LandContour', 'Neighborhood']

In [None]:
# Pick the numerical features: every original feature that is not categorical.
# (These are the non-object columns, so both integer and float columns end up here.)
# The list is built in the original column order instead of via a set difference,
# because set ordering can differ between runs and would shuffle the columns.
categorical_lookup=set(categorical_features)
numerical=[feature for feature in crude_features if feature not in categorical_lookup]
numerical


In [None]:
# Take another look at the categorical features before splitting them up
categorical_features

['MSZoning', 'LandContour', 'Neighborhood']

In [None]:
# Categorical features fall into two groups: nominal and ordinal.
# Nominal data: categorical data with no natural order among the values.
# Ordinal data: categorical data with a natural order among the values.
# The nominal features are hand-picked based on the data description provided.
# The ordinal features are all remaining categorical features; they are collected in
# the original column order (a set difference would give an arbitrary order).
nominal=['MSZoning','LandContour','Neighborhood']
ordinal=[feature for feature in categorical_features if feature not in nominal]

In [None]:
# Take a look at the ordinal columns of the dataset
data[ordinal]


In [None]:
# Take a look at the nominal columns of the dataset
data[nominal]


In [None]:
# Name of the target feature, kept as a one-element list of column names
target=['SalePrice']

In [None]:
# To make computation possible, the categorical data is converted to numerical data.
# get_dummies() converts a categorical variable into dummy/indicator variables
# (one 0/1 column per category) and works on columns with the object dtype.
# dtype=int is requested explicitly: recent pandas versions return booleans by
# default, while the rest of the notebook expects 0/1 integers.
# Convert the nominal features to numerical values.
df_nominal=pd.get_dummies(data[nominal],dtype=int)
df_nominal.head(3)

In [None]:
# To make computation possible, the ordinal features are converted to numerical data
# with cat.codes: each column is cast to the category dtype and every category is
# replaced by its integer code (this is label encoding, not dummy variables).
# The codes follow the sorted order of the category labels.
# NOTE(review): that order is not necessarily the natural quality order - confirm
# whether an explicit category ordering is needed.
# The encoded columns go into a new frame instead of overwriting `data`, so that
# re-running earlier cells still sees the original string columns.
df_ordinal=pd.DataFrame(
  {feature:data[feature].astype('category').cat.codes for feature in ordinal},
  index=data.index,
)

In [None]:
# The ordinal data after conversion to integer codes
df_ordinal

In [None]:
# The numerical columns of the dataset
data[numerical]

In [None]:
# Finally, put the encoded nominal data, the encoded ordinal data and the numerical
# data side by side to obtain one fully encoded dataset
encoded_parts=[df_nominal,df_ordinal,data[numerical]]
new_data=pd.concat(encoded_parts,axis='columns')

In [None]:
# Check the shape of the combined dataset: the number of rows should be unchanged
new_data.shape

(1460, 107)

In [None]:
# The fully encoded dataset
new_data

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Neighborhood_Blmngtn,...,Electrical,OpenPorchSF,LotShape,GarageArea,FireplaceQu,MasVnrArea,MoSold,Condition1,YrSold,TotalBsmtSF
0,0,0,0,1,0,0,0,0,1,0,...,4,61,3,548,4,196.0,2,2,2008,856
1,0,0,0,1,0,0,0,0,1,0,...,4,0,3,460,4,0.0,5,1,2007,1262
2,0,0,0,1,0,0,0,0,1,0,...,4,42,0,608,4,162.0,9,2,2008,920
3,0,0,0,1,0,0,0,0,1,0,...,4,35,0,642,2,0.0,2,2,2006,756
4,0,0,0,1,0,0,0,0,1,0,...,4,84,0,836,4,350.0,12,2,2008,1145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,0,0,1,0,0,0,0,1,0,...,4,40,3,460,4,0.0,8,2,2007,953
1456,0,0,0,1,0,0,0,0,1,0,...,4,0,3,500,4,119.0,2,2,2010,1542
1457,0,0,0,1,0,0,0,0,1,0,...,4,60,3,252,2,0.0,5,2,2010,1152
1458,0,0,0,1,0,0,0,0,1,0,...,0,0,3,240,2,0.0,4,2,2010,1078


# **Standardizing the Dataset**

In [None]:
# Drop the target feature so that only the input features get standardized.
# Reassigning the result with errors='ignore' keeps this cell idempotent:
# re-running it after SalePrice is already gone no longer raises a KeyError.
new_data=new_data.drop(columns=target,errors='ignore')

In [None]:
# Check the dataset after removing the target feature
new_data

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,Neighborhood_Blmngtn,...,BsmtFinSF2,GarageArea,YearBuilt,MasVnrArea,FullBath,YearRemodAdd,MoSold,TotRmsAbvGrd,YrSold,TotalBsmtSF
0,0,0,0,1,0,0,0,0,1,0,...,0,548,2003,196.0,2,2003,2,8,2008,856
1,0,0,0,1,0,0,0,0,1,0,...,0,460,1976,0.0,2,1976,5,6,2007,1262
2,0,0,0,1,0,0,0,0,1,0,...,0,608,2001,162.0,2,2002,9,6,2008,920
3,0,0,0,1,0,0,0,0,1,0,...,0,642,1915,0.0,1,1970,2,7,2006,756
4,0,0,0,1,0,0,0,0,1,0,...,0,836,2000,350.0,2,2000,12,9,2008,1145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0,0,0,1,0,0,0,0,1,0,...,0,460,1999,0.0,2,2000,8,7,2007,953
1456,0,0,0,1,0,0,0,0,1,0,...,163,500,1978,119.0,2,1988,2,7,2010,1542
1457,0,0,0,1,0,0,0,0,1,0,...,0,252,1941,0.0,2,2006,5,9,2010,1152
1458,0,0,0,1,0,0,0,0,1,0,...,1029,240,1950,0.0,1,1996,4,5,2010,1078


In [None]:
# to_numpy() returns a NumPy ndarray holding the values of the DataFrame.
# dtype=float is requested explicitly so the result is always a numeric float array:
# a frame that mixes boolean and numeric columns would otherwise become an object array.
numpy_array = new_data.to_numpy(dtype=float)
numpy_array

array([[   0.,    0.,    0., ...,    8., 2008.,  856.],
       [   0.,    0.,    0., ...,    6., 2007., 1262.],
       [   0.,    0.,    0., ...,    6., 2008.,  920.],
       ...,
       [   0.,    0.,    0., ...,    9., 2010., 1152.],
       [   0.,    0.,    0., ...,    5., 2010., 1078.],
       [   0.,    0.,    0., ...,    6., 2008., 1256.]])

In [None]:
# Check what the first row of the new numpy array looks like
# (index 0 is the first row; index 1 would show the second one)
numpy_array[0]

In [None]:
# The dataset has to be standardized at this point.
# StandardScaler() gives every feature a standardized distribution with zero mean and
# a standard deviation of one (unit variance): it subtracts the feature's mean and
# then divides the result by the feature's standard deviation.
# fit_transform() learns these scaling parameters from the data and applies the
# scaling in a single step.
# The standardized data represents the variable X in the Y= mX + b equation.
feature_scaler=StandardScaler()
standardized_data=feature_scaler.fit_transform(numpy_array)

In [None]:
# Look at a sample (the first row) of the standardized dataset
standardized_data[0]

In [None]:
# The target feature represents the Y variable in the Y=mX+b equation
target_feature=data[target]
target_feature

In [None]:
# Convert the target feature to a numpy array using to_numpy().
# The array is taken from `data` directly so this cell can be re-run safely;
# calling to_numpy() on the already converted array would raise an AttributeError.
target_feature=data[target].to_numpy()

In [None]:
# Shape of the target array (number of rows, number of columns)
target_feature.shape

(1460, 1)

In [None]:
# A peek at what the target feature looks like as a numpy array
target_feature

In [None]:
# Standardize the target feature using StandardScaler().
# fit_transform() learns the mean and standard deviation of the target and applies
# the scaling in one step.
# The fitted scaler is kept in target_scaler so that predictions made on the
# standardized scale can later be converted back with target_scaler.inverse_transform().
target_scaler=StandardScaler()
target_feature=target_scaler.fit_transform(target_feature)

In [None]:
# A peek at what the target feature looks like after standardization
target_feature

In [None]:
# Shape of the standardized feature matrix (number of rows, number of columns)
standardized_data.shape

(1460, 106)

# ***Extracting Features Using PCA***

In [None]:
# PCA: Principal Component Analysis
# n_components specifies the number of components to extract.
# random_state is fixed so the result is reproducible: depending on the scikit-learn
# version, the default solver choice may fall on the randomized SVD solver.
pca=PCA(n_components=4,random_state=0)

In [None]:
# Fit PCA on the standardized features and transform them into principal components using fit_transform()
p_components=pca.fit_transform(standardized_data)

In [None]:
# A peek at the extracted components (one row per sample, one column per component)
p_components

array([[ 2.4433097 , -1.27896883, -1.03947519, -1.41896811],
       [ 0.31645543,  2.63975128, -1.23374282, -0.62225771],
       [ 2.85722029, -0.81104242, -0.88052826, -1.16898726],
       ...,
       [ 1.26182064, -0.85886434,  2.77459822, -1.49861468],
       [-2.85096273,  3.29384871, -1.45669888,  0.44760798],
       [-0.77738539,  2.79013395, -1.36336903, -0.8580012 ]])

In [None]:
# explained_variance_ratio_ is an attribute of the fitted PCA that holds the ratio of
# variance explained by each component (eigenvalue / total eigenvalues)
pca.explained_variance_ratio_

array([0.10560233, 0.04472091, 0.04181921, 0.03216466])

# Feature Selection

In [None]:
# Wrap the principal components in a DataFrame and fit a Lasso (L1) regressor on them
# to obtain the L1 coefficients of the PCA columns.
# The target is flattened to one dimension before it is passed to fit().
pca_data=pd.DataFrame(p_components)
regressor=LassoCV()
regressor.fit(pca_data,target_feature.ravel())

LassoCV()

In [None]:
# Label the regressor coefficients with the column they belong to
coef=pd.Series(regressor.coef_,index=pca_data.columns)
# A column counts as selected when its coefficient is not zero
num_of_selected_features=int(coef.ne(0).sum())
num_of_selected_features


4

# New Dataset Ready for Training


# Train the Model

In [None]:
# LassoCV is the Lasso regression model with built-in cross-validation
# CV stands for cross-validation and the default number of folds is 5
reg=LassoCV()

In [None]:
# Redo the PCA with 2 components; this replaces the earlier 4-component pca,
# p_components and pca_data.
# NOTE(review): the feature-selection step above kept all 4 components - confirm
# that training on only 2 is intended.
# random_state is fixed so the result is reproducible: depending on the scikit-learn
# version, the default solver choice may fall on the randomized SVD solver.
pca = PCA(n_components = 2, random_state = 0)
p_components = pca.fit_transform(standardized_data)
pca_data = pd.DataFrame(p_components)

In [None]:
# Train the Lasso model on the two principal components.
# np.ravel() flattens the target feature to one dimension before it is passed to fit().
reg.fit(pca_data, np.ravel(target_feature))

LassoCV()

In [None]:
# Coefficients of the trained Lasso model, one per principal component
reg.coef_

array([0.25387739, 0.01887053])