In [39]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [40]:
# loading datasets
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [41]:
# cleaning dataset
# get numerical columns

# drop unnecessary columns 'Id', 'PID'
# drop target column 'SalePrice'
numerical_features = list(train._get_numeric_data().drop(['Id', 'PID', 'SalePrice'], axis=1))
features = numerical_features

# replace missing numerical values with -999
for feature in numerical_features:
    train[feature] = train[feature].dropna()
    test[feature] = test[feature].dropna()
    
# get categorical columns
categorical_features = list(train.select_dtypes(include='object').columns)

for col in categorical_features:
    # replace missing columns with 'N/A'
    train[col] = train[col].fillna('N/A')
    test[col] = test[col].fillna('N/A')
    
    # get unique columns names and sort
    train_values = sorted(list(train[col].unique()))
    test_values = sorted(list(test[col].unique()))
    
    categories = set(train_values + test_values)
    
    # create new dataframe (Represents a categorical variable)
    train[col] = pd.Categorical(train[col], categories=categories)
    test[col] = pd.Categorical(test[col], categories=categories)

# dummy categories
train_dummies = pd.get_dummies(train[categorical_features], drop_first=True)
test_dummies = pd.get_dummies(test[categorical_features], drop_first=True)

# create cleaned datasets
X_train = pd.concat([train_dummies, train[numerical_features]], axis=1)
X_test = pd.concat([test_dummies, test[numerical_features]], axis=1)


In [42]:
X_train.head()

Unnamed: 0,MS Zoning_RH,MS Zoning_RM,MS Zoning_C (all),MS Zoning_A (agr),MS Zoning_I (all),MS Zoning_RL,Street_Pave,Alley_Pave,Alley_N/A,Lot Shape_IR3,...,Garage Area,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold
0,0,0,0,0,0,1,1,0,1,0,...,475.0,0,44,0,0,0,0,0,3,2010
1,0,0,0,0,0,1,1,0,1,0,...,559.0,0,74,0,0,0,0,0,4,2009
2,0,0,0,0,0,1,1,0,1,0,...,246.0,0,52,0,0,0,0,0,1,2010
3,0,0,0,0,0,1,1,0,1,0,...,400.0,100,0,0,0,0,0,0,4,2010
4,0,0,0,0,0,1,1,0,1,0,...,484.0,0,59,0,0,0,0,0,3,2010


In [43]:
X = X_train
y = train['SalePrice']
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y, test_size=0.2, random_state=42)

X.shape, X_train_split.shape, X_test_split.shape

((2051, 272), (1640, 272), (411, 272))

In [37]:
rfc = RandomForestClassifier(random_state=42) 

param_grid = { 
    'n_estimators': [500],
    'max_features': ['auto'],
    'max_depth' : [3,4,5],
    'criterion' : ['gini']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train_split, y_train_split)
print(CV_rfc.best_params_)




{'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'n_estimators': 500}


In [50]:
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [49]:
rfc1

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [46]:
rfc1 = RandomForestClassifier(random_state=42, n_estimators= 500, criterion='gini')
rfc1_fitted = rfc1.fit(X_train_split, y_train_split)
print(mean_squared_error(y_test_split, rfc1_fitted.predict(X_test_split)) ** .5)
print(r2_score(y_test_split, rfc1_fitted.predict(X_test_split)))

32811.57182189475
0.8188005658598475


In [47]:
# random forest
rf = RandomForestClassifier(random_state=42, n_estimators=500)
rf_fitted = rf.fit(X_train_split, y_train_split)
print(mean_squared_error(y_test_split, rf_fitted.predict(X_test_split)) ** .5)

32811.57182189475


In [48]:
r2_score(y_test_split, rf_fitted.predict(X_test_split))

0.8188005658598475

In [10]:
# create a .csv
price = rf_fitted.predict(X_test)
ids = test['Id']
df = pd.DataFrame({'Id': ids, 'SalePrice':price}) 
df.sort_values('Id', inplace=True)
df.to_csv('./test_submission.csv', index=False)

In [44]:
train[train['Gr Liv Area']>4000].T


Unnamed: 0,960,1885
Id,1499,2181
PID,908154235,908154195
MS SubClass,60,20
MS Zoning,RL,RL
Lot Frontage,313,128
Lot Area,63887,39290
Street,Pave,Pave
Alley,,
Lot Shape,IR3,IR1
Land Contour,Bnk,Bnk
