### Pipeline :

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.read_csv("../rawdata/train.csv")
test = pd.read_csv("../rawdata/test.csv")

x = train.drop(["SalePrice", "Id"], axis=1).copy()
y = train["SalePrice"].copy()

x_train_full, x_val_full, y_train, y_val = train_test_split(x, y, train_size=0.8)

numerical_cols = [c for c in x_train_full.columns if x_train_full[c].dtype in ['int64', 'float64']]
low_cardinality_cols = [c for c in x_val_full.columns if x_val_full[c].nunique()<10 and x_val_full[c].dtype=="object"]

x_train = x_train_full[numerical_cols + low_cardinality_cols].copy()
x_val = x_val_full[numerical_cols + low_cardinality_cols].copy()



In [2]:
print(train.shape)
print(x.shape)
print(x_train_full.shape)
print(len(numerical_cols))
print(len(low_cardinality_cols))
print(x_train.shape)

(1460, 81)
(1460, 79)
(1168, 79)
36
40
(1168, 76)


#### Step 1 - Define preprocessing steps :

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

numerical_transformer = SimpleImputer()

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('nums', numerical_transformer, numerical_cols),
    ('cats', categorical_transformer, low_cardinality_cols)
])

#### Step 2 - Define the model :

In [4]:
from sklearn.ensemble import RandomForestRegressor
rfmodel = RandomForestRegressor(n_estimators=80)

#### Step 3 - Create and Evaluate the pipeline :

In [5]:
from sklearn.metrics import mean_absolute_error

#Bundle preprocessing and modeling code
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rfmodel)
])

my_pipeline.fit(x_train, y_train)

predicted = my_pipeline.predict(x_val)
print("MAE : ", mean_absolute_error(y_val, predicted))

MAE :  19015.459803082194


### Generating Kaggle Submission :

In [6]:
predicted = my_pipeline.predict(test)
submission = pd.DataFrame({"Id" : test.Id,
                          "SalePrice" : predicted})
submission.to_csv("../rawdata/submission_v1n1.csv", index=False)
#Kaggle Rank : 3360 / 5395 (Score : 0.15029)

### Cross Validation :

In [10]:
from sklearn.model_selection import cross_val_score

scores = -1 * cross_val_score(my_pipeline, x, y, cv=5, scoring = 'neg_mean_absolute_error')

print("MAE Scores :\n", scores)
print("Average MAE Score (across experiments) : ", scores.mean())

MAE Scores :
 [17766.04101027 17779.74670377 18341.38313356 16443.9916524
 19260.99178082]
Average MAE Score (across experiments) :  17918.430856164385


### Gradient Boosting (xgboost) :

In [14]:
from xgboost import XGBRegressor

xgbmodel = XGBRegressor()

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgbmodel)
])

In [15]:
xgb_pipeline.fit(x_train, y_train)

  if getattr(data, 'base', None) is not None and \




Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('nums', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0), ['MSSubClass', 'LotFrontage', 'LotArea',...lpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1))])

In [17]:
predicted = xgb_pipeline.predict(x_val)
print("MAE (xgboost, single ) : ",mean_absolute_error(y_val, predicted))

MAE (xgboost) :  17143.84357609161


In [19]:
score = -1 * cross_val_score(xgb_pipeline, x, y, cv=5, scoring='neg_mean_absolute_error')
print("CV Scores : ",score)
print("MAE(xgboost) CV Score : ",score.mean())

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


CV Scores :  [15860.38366866 17069.35680651 16689.0335777  15293.57029912
 17165.77492241]
MAE(xgboost) CV Score :  16415.623854880138


### Generating Kaggle Sunmission :

In [20]:
predicted = xgb_pipeline.predict(test)
submission = pd.DataFrame({"Id" : test.Id,
                          "SalePrice" : predicted})
submission.to_csv("../rawdata/submission_v1n1(2).csv", index=False)
#Kaggle Rank : 2883 / 5424 (Score : 0.13961)