In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read data and split the data to training and test set

In [119]:
data=pd.read_csv("input/train.csv")
real_test=pd.read_csv("input/test.csv")
print('The shape of data is:', data.shape)
data.head()

The shape of data is: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [120]:
X=data.iloc[:, 0:-1]
y=data.iloc[:, -1]
y=y.apply(np.log)

frames=[X, real_test]
X=pd.concat(frames)


print("shape of X:", X.shape)
print("shape of y:", y.shape)


all_features=set(X.columns)
print("Total number of features:", len(all_features))

#Seperate number features and string features
g=X.columns.to_series().groupby(X.dtypes).groups
fgroup={k.name: v for k, v in g.items()}

int_feature=fgroup['int64']
float_feature=fgroup['float64']
int_feature.remove('Id')
int_feature.remove('YrSold')
int_feature.remove('MoSold')
string_feature=fgroup['object']

print('number of number features:',len(number_feature))
print('number of string features:', len(string_feature))


imr=Imputer(missing_values='NaN', strategy='mean', axis=0)
X_int=imr.fit_transform(X[int_feature])
#print('int_features:',int_feature)
#print('float_features:', float_feature)

X_int=pd.DataFrame(X_int, columns=int_feature)
X_int[['YearBuilt']]=2017-X_int[['YearBuilt']]
X_int[['YearRemodAdd']]=2017-X_int[['YearRemodAdd']]
X_int.iloc[1,:]
#X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.2, random_state

shape of X: (2919, 80)
shape of y: (1460,)
Total number of features: 80
number of number features: 26
number of string features: 43


MSSubClass         20
LotArea          9600
OverallQual         6
OverallCond         8
YearBuilt          41
YearRemodAdd       41
1stFlrSF         1262
2ndFlrSF            0
LowQualFinSF        0
GrLivArea        1262
FullBath            2
HalfBath            0
BedroomAbvGr        3
KitchenAbvGr        1
TotRmsAbvGrd        6
Fireplaces          1
WoodDeckSF        298
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
Name: 1, dtype: float64

In [121]:
# Scale the features in int_feature
scale_feature=['LotArea', 'YearBuilt', 'YearRemodAdd', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 
               'GrLivArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
              'PoolArea', 'MiscVal']
unscale_features=[item for item in int_feature if item not in scale_feature]
unscale_features.remove('MSSubClass')

X_scale=StandardScaler().fit_transform(X_int[scale_feature])
X_scale=pd.DataFrame(X_scale, columns=scale_feature)

In [122]:
#Onehotencoder the class
Mssclass=OneHotEncoder().fit_transform(X_int[['MSSubClass']]).toarray()
Mssclass_col=['MSSubClass'+str(i) for i in range(16)]
Mssclass=pd.DataFrame(Mssclass, columns=Mssclass_col)

In [123]:
X_scale=X_scale.join(Mssclass)
X_scale=X_scale.join(X_int[unscale_features])
X_scale.shape

(2919, 38)

In [124]:
X_new=X_scale.iloc[0:1460,:]
real_test_new=X_scale.iloc[1460:2920, :]

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=7)

In [126]:
from sklearn.linear_model import LinearRegression
lg=LinearRegression()
lg.fit(X_train, y_train)
print("log error of training:", np.sqrt(mean_squared_error(y_train, lg.predict(X_train))))
print("log error of test:", np.sqrt(mean_squared_error(y_test, lg.predict(X_test))))

log error of training: 0.15439864294
log error of test: 0.144936593355


In [127]:
y_pre=lg.predict(real_test_new)
y_pre=np.exp(y_pre)

In [130]:
submission=pd.DataFrame(real_test['Id'])
submission['SalePrice']=y_pre
submission.head()

submission.to_csv(path_or_buf= 'submission2.csv',index=False)

In [129]:
len(y_pre)

1459

In [23]:
test=pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]], columns=['A', 'B', 'C'])
test[['A','B']].values
test_n=StandardScaler().fit_transform(test[['A', 'B']])

In [24]:
test_n

array([[-1.22474487, -1.22474487],
       [ 0.        ,  0.        ],
       [ 1.22474487,  1.22474487]])

In [25]:
test

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9
