In [33]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE


In [34]:
# loading datasets
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [35]:
# cleaning dataset
# get numerical columns

# drop unnecessary columns 'Id', 'PID'
# drop target column 'SalePrice'
numerical_features = list(train._get_numeric_data().drop(['Id', 'PID', 'SalePrice'], axis=1))
features = numerical_features

# replace missing numerical values with -999
for feature in numerical_features:
    train[feature] = train[feature].fillna(-999)
    test[feature] = test[feature].fillna(-999)

# get categorical columns
categorical_features = list(train.select_dtypes(include='object').columns)

for col in categorical_features:
    # replace missing columns with 'N/A'
    train[col] = train[col].fillna('N/A')
    test[col] = test[col].fillna('N/A')
    
    # get unique columns names and sort
    train_values = sorted(list(train[col].unique()))
    test_values = sorted(list(test[col].unique()))
    
    categories = set(train_values + test_values)
    
    # create new dataframe (Represents a categorical variable)
    train[col] = pd.Categorical(train[col], categories=categories)
    test[col] = pd.Categorical(test[col], categories=categories)

# dummy categories
train_dummies = pd.get_dummies(train[categorical_features], drop_first=True)
test_dummies = pd.get_dummies(test[categorical_features], drop_first=True)

# create cleaned datasets
X_train = pd.concat([train_dummies, train[numerical_features]], axis=1)
X_test = pd.concat([test_dummies, test[numerical_features]], axis=1)

# remove where 'Yr Garage' > 2010
print(train[train[features]['Garage Yr Blt'] > 2010].index)
X_train = X_train.drop(train[train[features]['Garage Yr Blt'] > 2010].index)
y = train['SalePrice']

Int64Index([1699], dtype='int64')


In [32]:
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y, random_state=42)

# Make Polynomial Features
pf = PolynomialFeatures()
X_train_split_poly = pf.fit_transform(X_train_split, 2)
X_test_split_poly = pf.fit_transform(X_test_split, 2)
X_test_poly = pf.fit_transform(X_test, 2)

# Put these in a DataFrame 
features = pf.get_feature_names(X_train.columns)
poly_df = pd.DataFrame(X_train_split_poly, columns=features)

# DF of all polynomial features with at least a .6 correlation with our target
poly_training_cleaned = poly_df.loc[:, poly_df.corrwith(train['SalePrice']).abs() >.4]

# modifying test.csv
poly_test_df = pd.DataFrame(X_test_poly, columns=features)
poly_test_cleaned = poly_test_df[poly_training_cleaned.columns]
X_test_split_df = pf.DataFrame(X_test_poly, columns=features)
X_test_split_cleaned = X_test_split_df[poly_training_cleaned.columns]

poly_training_cleaned.shape, poly_test_cleaned.shape, X_test_split_df.shape

ValueError: Found input variables with inconsistent numbers of samples: [2050, 2051]

In [5]:
# train-test split
X = poly_training_cleaned
y = train['SalePrice']
#poly_test_cleaned

In [6]:
# # random forest
# rf = RandomForestRegressor()
# rf_fitted = rf.fit(X_train_split, y_train_split)
# print(rf.score(X_test_split, y_test_split))
# print(mean_squared_error(y_test_split, rf.predict(X_test_split)) ** .5)
# print(mean_squared_error(y_train_split, rf.predict(X_train_split)) ** .5)


In [7]:
rfc = RandomForestClassifier(random_state=42) 

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}


In [8]:

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)



In [9]:
CV_rfc.fit(X, y)
print(CV_rfc.best_params_)



KeyboardInterrupt: 

In [37]:
rfc1 = RandomForestClassifier(random_state=42, max_features='auto', n_estimators= 500, max_depth=8, criterion='gini')

In [38]:
rfc1.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [39]:
# create a .csv
price = rfc1.predict(poly_test_cleaned)
ids = test['Id']
df = pd.DataFrame({'Id': ids, 'SalePrice':price}) 
df.sort_values('Id', inplace=True)
df.to_csv('./test_submission.csv', index=False)