In [1]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [2]:
# loading datasets
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
# cleaning dataset
# get numerical columns

# drop unnecessary columns 'Id', 'PID'
# drop target column 'SalePrice'
numerical_features = list(train._get_numeric_data().drop(['Id', 'PID', 'SalePrice'], axis=1))
features = numerical_features

# replace missing numerical values with -999
for feature in numerical_features:
    train[feature] = train[feature].fillna(-999)
    test[feature] = test[feature].fillna(-999)
    
# get categorical columns
categorical_features = list(train.select_dtypes(include='object').columns)

for col in categorical_features:
    # replace missing columns with 'N/A'
    train[col] = train[col].fillna('N/A')
    test[col] = test[col].fillna('N/A')
    
    # get unique columns names and sort
    train_values = sorted(list(train[col].unique()))
    test_values = sorted(list(test[col].unique()))
    
    categories = set(train_values + test_values)
    
    # create new dataframe (Represents a categorical variable)
    train[col] = pd.Categorical(train[col], categories=categories)
    test[col] = pd.Categorical(test[col], categories=categories)

# dummy categories
train_dummies = pd.get_dummies(train[categorical_features], drop_first=True)
test_dummies = pd.get_dummies(test[categorical_features], drop_first=True)

# create cleaned datasets
X_train = pd.concat([train_dummies, train[numerical_features]], axis=1)
X_test = pd.concat([test_dummies, test[numerical_features]], axis=1)
y = train['SalePrice']

In [4]:
# train/test split 8:2
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y, test_size=0.2, random_state=42)

In [5]:
X_train_split.shape, X_test_split.shape, y_train_split.shape

((1640, 272), (411, 272), (1640,))

In [6]:
param_grid = { 
    'n_estimators': [500, 1000, 1500],
    'max_depth' : [10, 15, 20],
    'criterion' : ['mse', 'mae']
}
rfc = RandomForestRegressor(random_state=42) 
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

In [7]:
CV_rfc.fit(X_train_split, y_train_split)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [500, 1000, 1500], 'max_depth': [10, 15, 20], 'criterion': ['mse', 'mae']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [11]:
print(CV_rfc.best_params_)
rf_model = RandomForestRegressor(random_state=42, criterion='mse', max_depth=20, n_estimators=500)
rf_model.fit(X_train_split, y_train_split)

{'criterion': 'mse', 'max_depth': 20, 'n_estimators': 500}


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [12]:
# create a .csv
price = rf_model.predict(X_test)
ids = test['Id']
df = pd.DataFrame({'Id': ids, 'SalePrice':price}) 
df.sort_values('Id', inplace=True)
df.to_csv('./test_submission.csv', index=False)