# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [1]:
import pandas as pd

# Read the Iowa training set and print its summary statistics.
main_file_path = '../input/train.csv'
iowa_data = pd.read_csv(filepath_or_buffer=main_file_path)
summary_stats = iowa_data.describe()
print(summary_stats)

In [2]:
# Show the name of every column in the training data.
column_names = iowa_data.columns
print(column_names)

In [3]:
# Number of columns in the training data (shown as the cell output).
len(iowa_data.columns)

In [4]:
# Pull out a single column and preview its first rows.
SaleCondition = iowa_data['SaleCondition']
sale_condition_preview = SaleCondition.head()
print(sale_condition_preview)

In [5]:
# Select two columns by name, then print their summary statistics followed by their first rows.
columns_of_interest = ['Utilities', 'SalePrice']
two_columns_of_data = iowa_data[columns_of_interest]
for view in (two_columns_of_data.describe(), two_columns_of_data.head()):
    print(view)

In [6]:
# Target variable: the SalePrice column.
y = iowa_data['SalePrice']
y.head()

In [7]:
# Predictor data: a hand-picked subset of columns.
predictor_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = iowa_data[predictor_columns]
X.head()

In [8]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the full training data.
# random_state is pinned (same seed as the tree built in get_mae) so that
# re-running the cell reproduces the same model.
iowa_model = DecisionTreeRegressor(random_state=0)
iowa_model.fit(X, y)

In [9]:
# Compare the model's predictions with the actual prices for the first five rows.
first_rows = X.head()
print("Prediction for below 5 datapoints:")
print(first_rows)
print("The predictions are:")
print(iowa_model.predict(first_rows))
print("The actual prices are:")
print(y.head())

In [10]:
from sklearn.metrics import mean_absolute_error

# In-sample error: the model is scored on the same rows it was fitted on.
predicted_prices = iowa_model.predict(X)
in_sample_mae = mean_absolute_error(predicted_prices, y)
in_sample_mae

In [11]:
from sklearn.model_selection import train_test_split

# Hold out part of the data so the model is scored on rows it was not fitted on.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# The split is seeded, so seed the tree as well; otherwise the model is the only
# source of run-to-run variation in the printed score.
iowa_model = DecisionTreeRegressor(random_state=0)
iowa_model.fit(train_X, train_y)

predictions = iowa_model.predict(val_X)
# mean_absolute_error takes (y_true, y_pred), as in the later cells of this notebook.
print(mean_absolute_error(val_y, predictions))

In [12]:

def get_mae(leafnodes,train_X,val_X,train_y,val_y):
    """Return the validation MAE of a decision tree capped at `leafnodes` leaves.

    A DecisionTreeRegressor(max_leaf_nodes=leafnodes, random_state=0) is fitted on
    (train_X, train_y); its predictions for val_X are then scored against val_y.
    """
    capped_tree = DecisionTreeRegressor(random_state=0, max_leaf_nodes=leafnodes)
    capped_tree.fit(train_X, train_y)
    val_predictions = capped_tree.predict(val_X)
    validation_mae = mean_absolute_error(val_predictions, val_y)
    return validation_mae

# Re-create the train/validation split and report the validation MAE for a few tree sizes.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

candidate_leaf_counts = (10, 50, 100, 250, 500, 1000, 2500, 5000)
for leafnodes in candidate_leaf_counts:
    mae = get_mae(leafnodes, train_X, val_X, train_y, val_y)
    report_line = "Leaf Nodes: %d \t Mean Absolute Error: %d" % (leafnodes, mae)
    print(report_line)


In [13]:
# Sweep max_leaf_nodes over 10, 20, ..., 4990 and record the validation MAE for each.
# The rows are collected in a list and converted to a DataFrame once:
# DataFrame.append copied the whole frame on every iteration (quadratic work)
# and was removed in pandas 2.0.
mae_records = [
    {'LeafCount': leafnodes, 'MAE': get_mae(leafnodes, train_X, val_X, train_y, val_y)}
    for leafnodes in range(10, 5000, 10)
]
mae_progress = pd.DataFrame(mae_records, columns=['LeafCount', 'MAE'])
print(mae_progress.head())

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt

# Validation MAE across the whole leaf-count sweep.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(mae_progress['LeafCount'], mae_progress['MAE'])
ax.set(xlabel='max_leaf_nodes', ylabel='Validation MAE',
       title='Decision tree: validation MAE vs. leaf count');

In [15]:
# Zoom in on trees with fewer than 500 leaves. The filter is applied once and reused
# for both axes; labels and a title make the figure readable on its own.
small_trees = mae_progress[mae_progress['LeafCount'] < 500]
fig, ax = plt.subplots()
ax.plot(small_trees['LeafCount'], small_trees['MAE'])
ax.set(xlabel='max_leaf_nodes', ylabel='Validation MAE',
       title='Decision tree: validation MAE vs. leaf count (< 500 leaves)');

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split and report its validation MAE.
# random_state is pinned so the printed score (and the submission built from this
# model in the next cell) is reproducible across runs.
iowa_forestmodel = RandomForestRegressor(random_state=0)
iowa_forestmodel.fit(train_X, train_y)
predictions = iowa_forestmodel.predict(val_X)
print(mean_absolute_error(val_y, predictions))

In [17]:
# Predict the competition test set with the random forest and write the submission file.
test = pd.read_csv('../input/test.csv')
submission_features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
test_X = test[submission_features]

predictions = iowa_forestmodel.predict(test_X)

submission_columns = {'Id': test.Id, 'SalePrice': predictions}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)

In [18]:
# Display the most recently computed predictions as the cell output.
predictions

In [19]:
#Level 2 of the tutorial begins here (along with a lot of experimenting with python)

# One row per column of the training data: the column name ('index') and its dtype.
dtype_table = iowa_data.dtypes.to_frame(name='dtype').reset_index()
dtype_table.head(10)


In [20]:
# Count the missing values in each column and lay the counts out as a table.
missing_counts = iowa_data.isnull().sum()
missing_info = missing_counts.to_frame(name='Missing').reset_index()
print(missing_info.head())

In [21]:
# Keep only the columns that have at least one missing value, most-affected first.
has_missing = missing_info['Missing'] != 0
missing_vnames = missing_info[has_missing].sort_values(['Missing'], ascending=False)
print(missing_vnames)
#Variables used to train the model up to this point: 'LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath','BedroomAbvGr','TotRmsAbvGrd'

In [22]:
# Place each column's dtype next to its missing-value count; rows are matched on the
# row index and only rows present in both tables are kept.
missing_vtypes = pd.concat([dtype_table, missing_vnames], axis=1, join='inner')
missing_vtypes = missing_vtypes.sort_values(['Missing'], ascending=False)
missing_vtypes
#Next, the non-object variables from this table are tried in our model: 'LotFrontage','GarageYrBlt','MasVnrArea'


In [23]:
# Target variable.
y = iowa_data['SalePrice']

# New predictor data: the original seven columns plus three further columns.
extended_predictor_columns = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
    'LotFrontage',
    'GarageYrBlt',
    'MasVnrArea',
]
X2 = iowa_data[extended_predictor_columns]
X2.head()

In [24]:
# Split the extended predictor set, using the same seed as the earlier split of X.
train_X2, val_X2, train_y2, val_y2 = train_test_split(
    X2,
    y,
    random_state=0,
)
def score_dataset(train_X,test_X,train_y,test_y): #this function uses MAE of random forest predictions to score the model
    """Return the MAE of a random forest fitted on the training data and scored on the test data.

    Parameters
    ----------
    train_X, train_y : predictors and target used to fit the forest.
    test_X, test_y : predictors and target used to score it.

    Returns
    -------
    The mean absolute error of the forest's predictions for test_X against test_y.
    """
    # The forest is seeded so that differences between scores come from the datasets
    # being compared, not from run-to-run randomness in the model.
    rforest_model = RandomForestRegressor(random_state=0)
    rforest_model.fit(train_X, train_y)
    pred_y = rforest_model.predict(test_X)
    # mean_absolute_error takes (y_true, y_pred).
    return mean_absolute_error(test_y, pred_y)
# Baseline score using the original predictor columns.
baseline_score = score_dataset(train_X, val_X, train_y, val_y)
print("Score before including missing fields: ", baseline_score)

In [25]:
from sklearn.preprocessing import Imputer

# Fill missing values using the imputer's default settings. The imputer is fitted on
# the training rows only and then applied unchanged to the validation rows.
myimputer = Imputer()
imputed_trainX = pd.DataFrame(myimputer.fit_transform(train_X2), columns=train_X2.columns)
imputed_valX = pd.DataFrame(myimputer.transform(val_X2), columns=val_X2.columns)
imputed_score = score_dataset(imputed_trainX, imputed_valX, train_y, val_y)
print("Score after imputing missing fields: ", imputed_score)

In [26]:
# Preview the first rows of the imputed validation predictors.
imputed_valX.head()

In [27]:
# Impute as before, but first add a boolean "<column>_was_missing" indicator for every
# column of X2 that contains at least one missing value.
imputed_trainXplus = train_X2.copy()
imputed_valXplus = val_X2.copy()

missingv = [col for col in X2.columns if X2[col].isnull().any()]
for frame in (imputed_trainXplus, imputed_valXplus):
    for col in missingv:
        frame[col + '_was_missing'] = frame[col].isnull()

# From here on both names refer to the imputer's array output rather than DataFrames.
myimputer = Imputer()
imputed_trainXplus = myimputer.fit_transform(imputed_trainXplus)
imputed_valXplus = myimputer.transform(imputed_valXplus)
flagged_score = score_dataset(imputed_trainXplus, imputed_valXplus, train_y, val_y)
print("Score after imputing missing fields and adding extra columns: ", flagged_score)

In [28]:
# Repeat the max_leaf_nodes sweep (10, 20, ..., 4990) on the imputed data with
# missing-value indicator columns.
# This needs the five-argument decision-tree get_mae; a later cell rebinds that name
# to a two-argument helper, so run this cell before that one.
# The rows are collected in a list and converted to a DataFrame once:
# DataFrame.append copied the whole frame on every iteration (quadratic work)
# and was removed in pandas 2.0.
mae_records2 = [
    {'LeafCount': leafnodes,
     'MAE': get_mae(leafnodes, imputed_trainXplus, imputed_valXplus, train_y, val_y)}
    for leafnodes in range(10, 5000, 10)
]
mae_progress2 = pd.DataFrame(mae_records2, columns=['LeafCount', 'MAE'])
print(mae_progress2.head())

In [29]:
# Validation MAE across the whole leaf-count sweep on the imputed data.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(mae_progress2['LeafCount'], mae_progress2['MAE'])
ax.set(xlabel='max_leaf_nodes', ylabel='Validation MAE',
       title='Decision tree on imputed data: validation MAE vs. leaf count');

In [30]:
# Zoom in on trees with fewer than 500 leaves. The filter is applied once and reused
# for both axes; labels and a title make the figure readable on its own.
small_trees2 = mae_progress2[mae_progress2['LeafCount'] < 500]
fig, ax = plt.subplots()
ax.plot(small_trees2['LeafCount'], small_trees2['MAE'])
ax.set(xlabel='max_leaf_nodes', ylabel='Validation MAE',
       title='Decision tree on imputed data: validation MAE vs. leaf count (< 500 leaves)');

In [31]:
#when we keep all the numerical variables
X_allnum = iowa_data.select_dtypes(exclude=['object'])
y = X_allnum.SalePrice #target variable
X_allnum = X_allnum.drop(['SalePrice'], axis=1) #predictor data (with missing vars)
X_allnum_nom = X_allnum.dropna(axis=1) #predictor data (columns with missing values dropped)

X_allnum_imputed = X_allnum.copy()
myimputer = Imputer()
X_allnum_imputed = pd.DataFrame(myimputer.fit_transform(X_allnum_imputed)) #imputed predictor data
X_allnum_imputed.columns = X_allnum.columns

# Split both variants with the same seed so they are scored on the same rows.
trainXa, testXa, trainYa, testYa = train_test_split(X_allnum_nom, y, random_state=0)
trainXb, testXb, trainYb, testYb = train_test_split(X_allnum_imputed, y, random_state=0)

# Score the two splits built above. Previously both lines scored
# imputed_trainXplus/imputed_valXplus from an earlier cell, so they reported the same
# unrelated experiment instead of the dropped-vs-imputed comparison named in the messages.
print("Score with dropped missing fields and all num fields included: ",score_dataset(trainXa, testXa, trainYa, testYa))
print("Score after imputing missing fields and all num fields included: ",score_dataset(trainXb, testXb, trainYb, testYb))


In [32]:
from sklearn.model_selection import cross_val_score

def get_mae(X,y):
    """Return the mean cross-validated MAE of a 50-tree random forest on (X, y).

    NOTE: this definition rebinds the name get_mae, replacing the five-argument
    decision-tree helper defined earlier in the notebook. Cells that call the earlier
    version will fail if they are re-run after this cell.
    """
    # The forest is seeded so that the feature sets compared with this helper are
    # scored by the same model configuration on every run.
    forest = RandomForestRegressor(n_estimators=50, random_state=0)
    # The scorer returns negated MAE values, so the mean is multiplied by -1.
    neg_mae_per_fold = cross_val_score(forest, X, y, scoring='neg_mean_absolute_error')
    return -1 * neg_mae_per_fold.mean()

# Column means used for mean-value imputation (MVI), computed once and reused.
# numeric_only=True restricts the mean to numeric columns; without it pandas 2.0+
# raises a TypeError because iowa_data also contains string columns.
column_means = iowa_data.mean(numeric_only=True)
numeric_data = iowa_data.select_dtypes(exclude=['object'])

# 1. Numeric columns only; columns with missing values dropped.
X_excat = numeric_data.dropna(axis=1).drop(['SalePrice'],axis=1)
mae_excat = get_mae(X_excat, y)
print('Mean Absolute Error when Dropping Categoricals and missing: ' + str(mae_excat))

# 2. Numeric columns only; missing values filled with the column mean.
X_excat2 = numeric_data.fillna(column_means).drop(['SalePrice'],axis=1)
mae_excat2 = get_mae(X_excat2, y)
print('Mean Absolute Error with dropped Categoricals and MVI: ' + str(mae_excat2))

# 3. Categoricals one-hot encoded; a hand-picked list of columns dropped first.
X_incat = pd.get_dummies(iowa_data.drop(['SalePrice','LotFrontage','GarageYrBlt','MasVnrArea','PoolQC','MiscFeature','Alley'],axis=1)) #one hot encoding
mae_incat=get_mae(X_incat,y)
print('Mean Absolute Error with One-Hot Encoding and dropped missing: ' + str(mae_incat))

# 4. Categoricals one-hot encoded; three numeric columns dropped, remaining gaps mean-filled.
X_incat2 = pd.get_dummies(iowa_data.drop(['SalePrice','LotFrontage','GarageYrBlt','MasVnrArea'],axis=1)).fillna(column_means) #one hot encoding
mae_incat2=get_mae(X_incat2,y)
print('Mean Absolute Error with One-Hot Encoding and MVI: ' + str(mae_incat2))

# 5. Categoricals one-hot encoded; no predictor dropped, all gaps mean-filled.
X_incat3 = pd.get_dummies(iowa_data).fillna(column_means).drop(['SalePrice'],axis=1) #one hot encoding no variable dropping
mae_incat3=get_mae(X_incat3,y)
print('Mean Absolute Error with One-Hot Encoding and MVI for all variables: ' + str(mae_incat3))

In [41]:
test = pd.read_csv('../input/test.csv')
# One-hot encode the test set and mean-fill its gaps. numeric_only=True restricts the
# mean to numeric columns; without it pandas 2.0+ raises a TypeError on string columns.
test_X = pd.get_dummies(test).fillna(test.mean(numeric_only=True))
# Keep only the columns present in both the training and the test predictors.
final_train, final_test = X_incat2.align(test_X, join='inner', axis=1)
# NOTE(review): 'Id' survives the alignment (final_test.Id is read below), so it is
# also fed to the model as a feature - confirm that this is intended.

trX, vlX, trY, vlY = train_test_split(final_train, y, random_state=0)

# Seeded so the validation score and the submission file are reproducible.
rfmodel = RandomForestRegressor(random_state=0)
rfmodel.fit(trX, trY)
val_preds = rfmodel.predict(vlX)
print("MAE for train-val split: ",mean_absolute_error(vlY,val_preds))

predictions = rfmodel.predict(final_test)

my_submission = pd.DataFrame({'Id': final_test.Id, 'SalePrice': predictions})
my_submission.to_csv('submission2.csv', index=False)

In [52]:
from xgboost import XGBRegressor

def xgb_mae(n_est):
    """Return the validation MAE of an XGBRegressor built with n_estimators=n_est.

    The model is fitted on the notebook-level trX/trY and scored on vlX/vlY.
    """
    booster = XGBRegressor(n_estimators=n_est)
    booster.fit(trX, trY)
    val_predictions = booster.predict(vlX)
    validation_mae = mean_absolute_error(val_predictions, vlY)
    return validation_mae

# Sweep n_estimators over 20, 40, ..., 980 and record the validation MAE for each.
# The rows are collected in a list and converted to a DataFrame once:
# DataFrame.append copied the whole frame on every iteration (quadratic work)
# and was removed in pandas 2.0.
xgb_records = [
    {'n_estimators': n_est, 'MAE': xgb_mae(n_est)}
    for n_est in range(20, 1000, 20)
]
xgb_progress = pd.DataFrame(xgb_records, columns=['n_estimators', 'MAE'])
print(xgb_progress.head())

In [53]:
# Validation MAE across the n_estimators sweep.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(xgb_progress['n_estimators'], xgb_progress['MAE'])
ax.set(xlabel='n_estimators', ylabel='Validation MAE',
       title='XGBoost: validation MAE vs. n_estimators');

In [66]:
# First fit: up to 500 estimators, with early stopping (5 rounds) evaluated on the
# validation split.
xgb_model2=XGBRegressor(n_estimators=500)
xgb_model2.fit(trX,trY,early_stopping_rounds=5,eval_set=[(vlX,vlY)])
# Second fit: the name is rebound to a fresh model with a fixed 92 estimators; the
# early-stopped model above is discarded.
# NOTE(review): 92 is presumably the round count read off the early-stopping log of
# the first fit - confirm, and update it if the data or split changes.
xgb_model2=XGBRegressor(n_estimators=92)
xgb_model2.fit(trX,trY)
predictions=xgb_model2.predict(vlX)
# Validation MAE of the refitted model (shown as the cell output).
mean_absolute_error(predictions,vlY)

In [69]:
# First fit: up to 500 estimators at learning_rate=0.05, with early stopping
# (5 rounds) evaluated on the validation split.
xgb_model3=XGBRegressor(n_estimators=500,learning_rate=0.05)
xgb_model3.fit(trX,trY,early_stopping_rounds=5,eval_set=[(vlX,vlY)])
# Second fit: the name is rebound to a fresh model with a fixed 161 estimators; the
# early-stopped model above is discarded.
# NOTE(review): 161 is presumably the round count read off the early-stopping log of
# the first fit - confirm, and update it if the data or split changes.
xgb_model3=XGBRegressor(n_estimators=161,learning_rate=0.05)
xgb_model3.fit(trX,trY)
predictions=xgb_model3.predict(vlX)
# Validation MAE of the refitted model (shown as the cell output).
mean_absolute_error(predictions,vlY)

In [70]:
# Predict the aligned test set with xgb_model2 and write the third submission file.
predictions = xgb_model2.predict(final_test)

submission_columns = {'Id': final_test.Id, 'SalePrice': predictions}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission3.csv', index=False)