In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import warnings

warnings.filterwarnings('ignore')

1 Read data

In [2]:
train_csv = "train.csv"
test_csv = "test.csv"
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

2 Seperating numerical and categorical values

In [3]:
numerical_feature_description = train_df.select_dtypes(exclude = ['object']).describe().round(decimals=2).transpose()
categorical_feature_description = train_df.select_dtypes(include = ['object']).describe().transpose()

Getting the columns and their null values

In [5]:
train_df.isnull().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64

Deleting Id, PoolQC, MiscFeature, Alley, Fence as they have more than 80% missing values

In [6]:
deleting_columns = ["Id","PoolQC","MiscFeature","Alley","Fence"]
train_df_processed = train_df.drop(deleting_columns, axis=1)
numerical_feature = list(set(train_df.select_dtypes(exclude = ['object']).columns) - set(deleting_columns))
categorical_feature = list(set(train_df.select_dtypes(include = ['object']).columns) - set(deleting_columns))

**4. Handling missing values**

4.1 for numerical missing values

In [7]:
train_df_numeric = train_df_processed[numerical_feature]
numeric_columns = train_df_numeric.columns
train_df_numeric.shape

(1460, 37)

In [8]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
train_array_numeric_processed = imputer.fit_transform(train_df_numeric)
train_df_numeric_processed = pd.DataFrame(train_array_numeric_processed, columns = numeric_columns)

In [9]:
train_df_processed[numerical_feature] = train_df_numeric_processed[numerical_feature]

In [10]:
train_df_processed.columns[train_df_processed.isnull().any()]

Index(['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond'],
      dtype='object')

4.2. For Categorical missing values

In [11]:
catfeatures_containing_null = ['MasVnrType','BsmtFinType1','BsmtFinType2','GarageQual','GarageType','BsmtExposure','Electrical','BsmtQual','GarageCond','FireplaceQu','GarageFinish','BsmtCond']

In [12]:
train_df_processed_copy = train_df_processed.copy()

In [13]:
data_X = train_df_processed_copy.drop(catfeatures_containing_null, axis=1)


In [14]:
numerical_data_X = data_X.select_dtypes(exclude = ['object']).columns
categorical_data_X = data_X.select_dtypes(include = ['object']).columns

In [15]:
#numericals to scaling
num_df = data_X[numerical_data_X]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(num_df)
num_df_scaled = scaler.transform(num_df)

num_df_scaled = pd.DataFrame(num_df_scaled, columns = num_df.columns)
num_df_scaled.shape

(1460, 37)

In [16]:
#categorical encoding
#all remove stage

cat_df = data_X[categorical_data_X]

from sklearn.preprocessing import OneHotEncoder
onc = OneHotEncoder()
onc.fit(cat_df)

cat_df_encoded = onc.transform(cat_df)
cat_df_encoded = pd.DataFrame(cat_df_encoded.toarray())
cat_df_encoded.shape

(1460, 182)

In [17]:
total_df = pd.concat([num_df_scaled, cat_df_encoded], axis=1)

In [18]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
knc = KNeighborsClassifier()

In [20]:
catfeatures_containing_null = ['Electrical','MasVnrType','BsmtFinType1','BsmtFinType2','GarageQual','GarageType','BsmtExposure','BsmtQual','GarageCond','FireplaceQu','GarageFinish','BsmtCond']
total_df_copy = total_df.copy()
for col in catfeatures_containing_null:

  data_Y = train_df_processed[col]
  data_X = total_df_copy

  knc = KNeighborsClassifier()
  train_idx = [index for index, row in pd.DataFrame(data_Y).iterrows() if row.isnull().any()]

  X_val = data_X.iloc[train_idx]
  y_val = data_Y.iloc[train_idx]

  val_df = data_X.index.isin(train_idx)

  X_train = data_X[~val_df]
  y_train = data_Y[~val_df]

  knc.fit(X_train, y_train)
  y_pred = knc.predict(X_val)

  new = np.array(y_train.tolist() + y_pred.tolist())
  new_arr = new.reshape(-1, 1)

  #numericals to scaling
  from sklearn.preprocessing import OneHotEncoder
  onc = OneHotEncoder()
  onc.fit(new_arr)
  new_encoded = onc.transform(new_arr)

  new_encoded_df = pd.DataFrame(new_encoded.toarray())

  new_encoded_df.columns = [col+str(i) for i in range(0,new_encoded_df.shape[1])] 

  total_df_copy = pd.concat([total_df_copy, new_encoded_df], axis=1)

In [21]:
total_df_copy.shape

(1460, 276)

Transforming target


In [22]:
from numpy.ma.core import absolute
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso


X,y = total_df_copy.drop(['SalePrice'], axis=1), train_df_processed['SalePrice']
pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('model', HuberRegressor())])
model = TransformedTargetRegressor(regressor=pipeline, transformer = MinMaxScaler())
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_validate(model, X,y, scoring = ('r2', 'neg_mean_squared_error'), cv = cv, n_jobs=1)

r2_mean = np.mean(scores['test_r2'])
neg_mean_squared_error_mean = np.mean(absolute(scores['test_neg_mean_squared_error']))

print(neg_mean_squared_error_mean)
print(r2_mean)

1028797211.4538021
0.8253812671415945


In [None]:
X,y = total_df_copy.drop(['SalePrice'], axis=1), train_df_processed['SalePrice']
pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('model', LinearRegression())])
model = TransformedTargetRegressor(regressor=pipeline, transformer = MinMaxScaler())
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_validate(model, X,y, scoring = ('r2', 'neg_mean_squared_error'), cv = cv, n_jobs=1)


r2_mean = np.mean(scores['test_r2'])
neg_mean_squared_error_mean = np.mean(absolute(scores['test_neg_mean_squared_error']))

print(neg_mean_squared_error_mean)
print(r2_mean)

1.0880349242170358e+29
-1.7890261442630791e+19


In [None]:
from sklearn.linear_model import Ridge

X,y = total_df_copy.drop(['SalePrice'], axis=1), train_df_processed['SalePrice']
pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('model', Ridge(alpha=100, solver='cholesky', tol=0.0001, random_state=42))])
model = TransformedTargetRegressor(regressor=pipeline, transformer = MinMaxScaler())
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_validate(model, X,y, scoring = ('r2', 'neg_mean_squared_error'), cv = cv, n_jobs=1)


r2_mean = np.mean(scores['test_r2'])
neg_mean_squared_error_mean = np.mean(absolute(scores['test_neg_mean_squared_error']))

print(neg_mean_squared_error_mean)
print(r2_mean)

1489916818.3822799
0.7620748473034932


In [None]:


X,y = total_df_copy.drop(['SalePrice'], axis=1), train_df_processed['SalePrice']
pipeline = Pipeline(steps=[('normalize', MinMaxScaler()), ('model', Lasso(alpha=0.1, 
              precompute=True, 
#               warm_start=True, 
              positive=True, 
              selection='random',
              random_state=42))])
model = TransformedTargetRegressor(regressor=pipeline, transformer = MinMaxScaler())
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_validate(model, X,y, scoring = ('r2', 'neg_mean_squared_error'), cv = cv, n_jobs=1)
# scores = absolute(scores)
# s_mean = np.mean(scores)
# print(s_mean)


r2_mean = np.mean(scores['test_r2'])
neg_mean_squared_error_mean = np.mean(absolute(scores['test_neg_mean_squared_error']))

print(neg_mean_squared_error_mean)
print(r2_mean)

6311984871.911216
-0.004258855785411808
