In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walking all sub-directories), then print one path per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [3]:
# Locations of the two competition CSV files, relative to the notebook's
# working directory. Note that the "validation" path points at test.csv.

training_data_file_path, validation_data_file_path = (
    '../input/house-prices-advanced-regression-techniques/train.csv',
    '../input/house-prices-advanced-regression-techniques/test.csv',
)

In [4]:
# Load both CSV files into DataFrames, in the order (training, validation).
# `validation_data` is read from the path that points at test.csv.

training_data, validation_data = (
    pd.read_csv(csv_path)
    for csv_path in (training_data_file_path, validation_data_file_path)
)

In [5]:
# Pull the prediction target (the 'SalePrice' column) out of the training data.

training_target_variable = training_data['SalePrice']

In [6]:
# One-hot encode the 'Foundation' feature for both datasets.
# Despite its name, `one_hot_encoded_test_foundation` holds the indicators for
# the TRAINING data.
#
# The two frames are encoded independently, so a Foundation category that
# occurs in only one of them would yield a different set of indicator columns
# and break (or silently misalign) prediction later on. The validation
# indicators are therefore reindexed to the training indicator columns:
# categories never seen in training are dropped, and categories absent from
# the validation data become all-zero columns.

one_hot_encoded_test_foundation = pd.get_dummies(training_data.Foundation, prefix='Foundation')

one_hot_encoded_validation_foundation = (
    pd.get_dummies(validation_data.Foundation, prefix='Foundation')
    .reindex(columns=one_hot_encoded_test_foundation.columns, fill_value=0)
)

In [7]:
# Swap the raw 'Foundation' column for its one-hot indicator columns in both
# frames: the original categorical column is removed and the indicator
# columns are appended on the right-hand side.

training_data = pd.concat(
    [training_data.drop(columns=['Foundation']), one_hot_encoded_test_foundation],
    axis=1,
)

validation_data = pd.concat(
    [validation_data.drop(columns=['Foundation']), one_hot_encoded_validation_foundation],
    axis=1,
)

In [8]:
# Collect the names of the columns whose name begins with 'Foundation'
# (the one-hot indicator columns) for each dataset.

def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


test_foundation_columns = _columns_with_prefix(training_data, 'Foundation')

validation_foundation_columns = _columns_with_prefix(validation_data, 'Foundation')

In [9]:
# Feature lists: 'OverallQual' followed by the Foundation indicator columns.
# `test_features` is the list later used to select the TRAINING features.

test_features = ['OverallQual', *test_foundation_columns]

validation_features = ['OverallQual', *validation_foundation_columns]

In [10]:
# Build the feature matrices for the training and validation datasets.
# Despite its name, `test_independent_variable` holds the TRAINING features
# that the model is fitted on.

test_independent_variable = training_data[test_features]

# Select the validation columns by the training feature list so this matrix has
# exactly the columns, in exactly the order, that the model is fitted on.
# A feature column missing from the validation data (e.g. a Foundation
# category that never occurs there) is created and filled with 0; columns that
# exist only in the validation data are left out.
validation_independent_variable = validation_data.reindex(columns=test_features, fill_value=0)

In [11]:
# Specify the model and its parameters.
# The target is a numeric sale price (a regression task), so a regressor is
# used: a classifier would treat every distinct price seen in training as a
# separate class label and could only ever predict one of those exact values.
# random_state is fixed so that a fresh "Restart & Run All" rebuilds the same
# forest and therefore the same submission.

model = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=0)

In [12]:
# Train the model: features come from `test_independent_variable` (which holds
# the training features), the target is the training SalePrice column.

model.fit(X=test_independent_variable, y=training_target_variable)

In [13]:
# In-sample prediction: predict sale prices for the same rows the model was
# fitted on. `prediction` is not referenced by any later cell shown here.

prediction = model.predict(X=test_independent_variable)

In [14]:
# Predict sale prices for the validation feature matrix; these predictions
# are what gets written to the submission file.

validation_prediction = model.predict(X=validation_independent_variable)

In [15]:
# Assemble the submission table (the validation 'Id' column plus the predicted
# 'SalePrice') and write it to submission.csv without the DataFrame index.

output = validation_data[['Id']].assign(SalePrice=validation_prediction)

output.to_csv('submission.csv', index=False)