# House_prices_competition
## 주택 가격예측 경진대회
> Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence. With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

>주택 구매자에게 꿈의 집을 설명해 달라고 하면, 아마 지하실 천장 높이나 동서 철도와의 근접성부터 이야기하지는 않을 것입니다. 그러나 이 경진대회의 데이터 세트는 침실 수나 흰색 울타리보다 훨씬 더 많은 요소가 가격 협상에 영향을 미친다는 것을 증명합니다. Iowa주 Ames에 있는 주거용 주택의 거의 모든 측면을 설명하는 79가지 변수를 바탕으로, 이 경진대회는 각 주택의 최종 가격을 예측하는 것을 목표로 합니다.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import numpy as np

from pandas import DataFrame

In [None]:
# Read the training and test CSV files that sit one directory above this one.
train_data_path, test_data_path = '../train.csv', '../test.csv'
train, test = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
import matplotlib.pyplot as plt

# Apply the ggplot theme and a (10, 6) default figure size to later plots.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Summary statistics of the SalePrice column.
train['SalePrice'].describe()

In [None]:
# Print the skewness of the raw sale prices, then draw their histogram.
sale_price = train['SalePrice']
print("Skew is:", sale_price.skew())
plt.hist(sale_price, color='blue')
plt.show()

In [None]:
# Keep only the numeric columns of the training frame; the trailing
# expression displays their dtypes as the cell output.
numeric_features = train.select_dtypes(include=np.number)
numeric_features.dtypes

In [None]:
# Pairwise correlation matrix of the numeric columns (Pearson is the
# pandas default, spelled out here).
corr = numeric_features.corr(method='pearson')

In [None]:
# Sort the correlations with SalePrice once, then print the five largest
# and the five smallest values.
saleprice_corr = corr['SalePrice'].sort_values(ascending=False)
print(saleprice_corr.head(5), '\n')
print(saleprice_corr.tail(5))

In [None]:
# Distinct OverallQual values present in the training data.
train.OverallQual.unique()

# Median SalePrice for each OverallQual value. The string alias 'median'
# is used instead of np.median because recent pandas versions emit a
# FutureWarning when a NumPy reduction is passed as aggfunc.
quality_pivot = train.pivot_table(index='OverallQual',
                                  values='SalePrice', aggfunc='median')

quality_pivot

In [None]:
# Bar chart of the median sale price for each OverallQual value.
ax = quality_pivot.plot.bar(color='blue')
ax.set_xlabel('OverallQuality')
ax.set_ylabel('Median Sale Price')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# `target` was referenced here without ever being assigned, which raised a
# NameError. Define it as the log of SalePrice, the same transform the
# later cells use for plotting and for the regression target.
target = np.log(train.SalePrice)

# Living area against log sale price.
plt.scatter(x=train['GrLivArea'], y=target)
plt.ylabel('SalePrice')
plt.xlabel('Above grade (ground) living area square feet')
plt.show()

In [None]:
# `target` was referenced here without ever being assigned, which raised a
# NameError. Define it as the log of SalePrice, the same transform the
# later cells use for plotting and for the regression target.
target = np.log(train.SalePrice)

# Garage area against log sale price (before any outlier filtering).
plt.scatter(x=train['GarageArea'], y=target)
plt.ylabel('SalePrice')
plt.xlabel('Garage Area')
plt.show()


In [None]:
# Keep only rows whose GarageArea lies strictly between 0 and 1200.
garage_area = train['GarageArea']
train = train[(garage_area > 0) & (garage_area < 1200)]

In [None]:
# Re-plot garage area against log sale price after the filtering step,
# with a fixed x-range.
log_price = np.log(train['SalePrice'])
plt.scatter(x=train['GarageArea'], y=log_price)
plt.xlim(-200, 1600)
plt.xlabel('Garage Area')
plt.ylabel('SalePrice')
plt.show()

In [None]:
# Table of the 25 columns with the most missing values, largest first.
null_counts = train.isnull().sum().sort_values(ascending=False)
nulls = null_counts[:25].to_frame(name='Null Count')
nulls.index.name = 'Feature'

In [None]:
# Print the distinct values of each column inspected for missing data,
# in the same order as before.
for column in (
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'FireplaceQu',
    'BsmtFinType2',
    'BsmtExposure',
    'BsmtQual',
    'BsmtCond',
    'BsmtFinType1',
    'MasVnrType',
    'Electrical',
    'MasVnrArea',
):
    print(f"{column}'s Unique values are:", train[column].unique())


In [None]:
# Non-numeric columns of the training frame. The describe() result is not
# the last expression of the cell, so it is computed but not displayed.
categoricals = train.select_dtypes(exclude=np.number)
categoricals.describe()
print("original: \n")
print(train['Street'].value_counts(), "\n")

In [None]:
# One-hot encode Street (dropping the first category) separately for each
# frame. The test column used to be built from train.Street, which copied
# training labels onto test rows by index instead of encoding the test
# set's own Street values.
train['enc_street'] = pd.get_dummies(train.Street, drop_first=True)
test['enc_street'] = pd.get_dummies(test.Street, drop_first=True)

In [None]:
# Show how many training rows fall into each encoded Street value.
print('Encoded: \n')
print(train['enc_street'].value_counts())

In [None]:
# Median SalePrice per Street value, drawn as a bar chart. The string
# alias 'median' replaces np.median, which recent pandas versions flag
# with a FutureWarning when passed as aggfunc.
condition_pivot = train.pivot_table(index='Street',
                                    values='SalePrice', aggfunc='median')
condition_pivot.plot(kind='bar', color='blue')
plt.xlabel('Street')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
# Median SalePrice per SaleCondition value, drawn as a bar chart. The
# string alias 'median' replaces np.median, which recent pandas versions
# flag with a FutureWarning when passed as aggfunc.
condition_pivot = train.pivot_table(index='SaleCondition',
                                    values='SalePrice', aggfunc='median')
condition_pivot.plot(kind='bar', color='blue')
plt.xlabel('Sale Condition')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

In [None]:
def encode(x):
    """Return 1 when *x* equals the string 'Partial', otherwise 0."""
    if x == 'Partial':
        return 1
    return 0
# Flag 'Partial' sale conditions in each frame from that frame's own
# SaleCondition column. The test column used to be derived from
# train.SaleCondition, which copied training labels onto test rows by index.
train['enc_condition'] = train.SaleCondition.apply(encode)
test['enc_condition'] = test.SaleCondition.apply(encode)

In [None]:
# Median SalePrice for each encoded sale-condition flag, drawn as a bar
# chart. The string alias 'median' replaces np.median, which recent pandas
# versions flag with a FutureWarning when passed as aggfunc.
condition_pivot = train.pivot_table(index='enc_condition',
                                    values='SalePrice', aggfunc='median')
condition_pivot.plot(kind='bar', color='blue')
plt.xlabel('encoded Sale Condition')
plt.ylabel('Median Sale Price')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()


In [None]:
# Numeric columns only; gaps are filled by interpolation and any row that
# still contains a missing value is dropped.
data = train.select_dtypes(include=[np.number]).interpolate().dropna()

# Number of columns that still contain nulls (evaluated but not displayed,
# because it is not the last expression of the cell).
sum(data.isnull().sum() != 0)

# Take the target from `data`, not `train`: dropna() may have removed rows,
# and y must cover exactly the same rows as X for the split and the fit.
y = np.log(data.SalePrice)
X = data.drop(['SalePrice', 'Id'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation, with a fixed seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=42,
)

In [None]:
from sklearn import linear_model

# Fit an ordinary least-squares model on the training split. fit() returns
# the estimator itself, so `model` and `lr` refer to the same object.
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)
model = lr

In [None]:
# Score the fitted model on the held-out split, then keep its predictions
# for the error metric and the scatter plot.
r_squared = model.score(X_test, y_test)
print("R^2 is: \n", r_squared)

predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; take the square root so the printed
# number really is the RMSE the label promises.
print('RMSE is: \n', np.sqrt(mean_squared_error(y_test, predictions)))


# Predicted against actual values on the held-out split (both are log
# prices, since y was built with np.log).
actual_values = y_test
plt.scatter(predictions, actual_values, alpha=.75,
            color='b')
plt.xlabel('Predicted Price')
plt.ylabel('Actual Price')
plt.title('Linear Regression model')
plt.show()

In [None]:
# Fit a Ridge model for alpha = 0.01, 0.1, 1, 10, 100 and plot predicted
# against actual held-out values for each one.
for i in range(-2, 3):
    alpha = 10 ** i
    rm = linear_model.Ridge(alpha=alpha)
    ridge_model = rm.fit(X_train, y_train)
    preds_ridge = ridge_model.predict(X_test)

    plt.scatter(preds_ridge, y_test, alpha=.75, color='b')
    plt.xlabel('Predicted Price')
    plt.ylabel('Actual Price')
    plt.title('Ridge Regularization with alpha ={}'.format(alpha))
    # mean_squared_error returns the MSE; take the square root so the value
    # labelled RMSE is the root-mean-squared error.
    overlay = 'R^2 is : {}\n RMSE is: {}'.format(
        ridge_model.score(X_test, y_test),
        np.sqrt(mean_squared_error(y_test, preds_ridge)))
    # The annotation was commented out with the obsolete `s=` keyword, which
    # left `overlay` unused. Passing the text positionally works regardless
    # of whether the parameter is called `s` or `text`.
    plt.annotate(overlay, xy=(12.1, 10.6), size='x-large')
    plt.show()


In [None]:
# Start the submission frame with the test-set Ids.
submission = pd.DataFrame({'Id': test.Id})

# Same feature preparation as for training: numeric columns without Id,
# with gaps interpolated.
numeric_test = test.select_dtypes(include=[np.number])
feats = numeric_test.drop(['Id'], axis=1).interpolate()

# The model was fitted on log prices, so exponentiate its output to get
# prices back on the original scale.
predictions = model.predict(feats)
final_predictions = np.exp(predictions)

print("original", predictions[:], "\n")
print("Final", final_predictions)

In [None]:
# Attach the predicted prices and write the submission file without the
# index column. head() is evaluated but not displayed, because it is not
# the last expression of the cell.
submission = submission.assign(SalePrice=final_predictions)
submission.head()
submission.to_csv(path_or_buf='submission1.csv', index=False)