In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy.sparse
import pickle
import xgboost as xgb
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#from random import choice
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
#from sklearn.preprocessing import OneHotEncoder

In [2]:
data = pd.read_csv('train_new.csv',encoding='utf8')
del data['Id']
#data.info()

In [3]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]
y = np.log(y)

In [4]:
#使用随机森林筛选变量
forest = RandomForestRegressor(criterion='mse', random_state=1, n_jobs=-1)
para = {'n_estimators':[501,1001],'max_depth':[3,4,5,6]}
clf = GridSearchCV(forest, para ,cv=5,n_jobs=-1)
clf.fit(x,y)
print(clf.best_params_)

{'max_depth': 6, 'n_estimators': 1001}


In [5]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [6]:
#regr = RandomForestClassifier(n_estimators=501,max_depth=4,criterion='entropy',random_state=0)
forest = RandomForestRegressor(n_estimators=1001, max_depth=6,
                               criterion='mse', 
                               random_state=1, 
                               n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1001, n_jobs=-1,
           oob_score=False, random_state=1, verbose=0, warm_start=False)

In [7]:
y_train_pred = np.e**forest.predict(X_train)
y_test_pred = np.e**forest.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        pow(mean_squared_error(y_train, y_train_pred),0.5),
        pow(mean_squared_error(y_test, y_test_pred),0.5)))#这的公式好像错了，不过好在趋势是一致的。。。

MSE train: 191282.464, test: 186777.536


In [8]:
columns_imports = pd.DataFrame()
columns_imports['columns_imports'] = x.columns
columns_imports['importances'] = forest.feature_importances_ 
#columns_imports.sort_values(by='importances',ascending=False)#.head()#特征重要性分析

In [9]:
#筛选特征重要性大于零的特征
columns_imports = columns_imports[columns_imports['importances']>0]

In [10]:
#生成新变量
x_new = x[columns_imports['columns_imports'].values]
X_train, X_test, y_train, y_test = train_test_split(x_new, y, test_size=0.25, random_state=42)

In [27]:
xgb_model = xgb.XGBRegressor()
para = {'max_depth': [3,4], 'n_estimators': [2000,2200],'colsample_bytree':[0.4,0.5],'gamma':[0.001,0.01,0.1]}
clf = GridSearchCV(xgb_model, para ,cv=5,n_jobs=-1)
clf.fit(x,y)
print(clf.best_params_)

{'colsample_bytree': 0.4, 'gamma': 0.01, 'max_depth': 3, 'n_estimators': 2000}


In [11]:
xgb_model = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2000,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
#这里的参数有参考kaggle比赛kernal，电脑跑的比较慢，还没有挨个去做网格搜索。

In [12]:
#xgb_model = xgb.XGBRegressor(n_jobs=-1,max_depth=3,n_estimators=1000)
xgb_model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
       max_delta_step=0, max_depth=3, min_child_weight=1.7817,
       missing=None, n_estimators=2200, n_jobs=1, nthread=-1,
       objective='reg:linear', random_state=7, reg_alpha=0.464,
       reg_lambda=0.8571, scale_pos_weight=1, seed=None, silent=1,
       subsample=0.5213)

In [13]:
y_train_pred = np.e**xgb_model.predict(X_train)
y_test_pred = np.e**xgb_model.predict(X_test)

print('RMSE train: %.3f, test: %.3f' % (
        pow(mean_squared_error(y_train, y_train_pred),0.5),
        pow(mean_squared_error(y_test, y_test_pred),0.5)))

MSE train: 194938.464, test: 191021.275


In [163]:
###使用SVM
SVR_model = SVR()
para = {'kernel':['rbf','linear','sigmoid','poly'],'C':[1,10,100]}
clf_svr = GridSearchCV(SVR_model, para ,cv=5,n_jobs=-1)
clf_svr.fit(x,y)
print(clf_svr.best_params_)

{'C': 10, 'kernel': 'rbf'}


In [164]:
SVR_model = SVR(kernel='rbf',C=10)
SVR_model.fit(X_train,y_train)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [165]:
y_svr_train_pred = np.e**SVR_model.predict(X_train)
y_svr_test_pred = np.e**SVR_model.predict(X_test)

print('MSE train: %.3f, test: %.3f' % (
        pow(mean_squared_error(y_train, y_svr_train_pred),0.5),
        pow(mean_squared_error(y_test, y_svr_test_pred),0.5)))

MSE train: 181848.366, test: 179789.129


In [None]:
#多分类使用
# def appendmax(sr):#对每一行处理的函数  
#     one = sr.idxmax()#得到最大值的index名  
#     maxindex = pd.Series(one)  
#     #sr = sr.append(maxindex)#添加  
#     return maxindex  

In [None]:
# pd.DataFrame(y_train_pred,columns=y.columns).apply(lambda x:appendmax(x),axis=1)

In [None]:
# print(pow(mean_squared_error(y_train,y_train_pred),0.5))
# print(pow(mean_squared_error(y_test,y_test_pred),0.5))

In [48]:
# 保存模型
filename = 'forest.sav'
pickle.dump(forest, open(filename, 'wb'))

# 结果

In [17]:
result = pd.read_csv('test_new.csv',encoding='utf8')
data_test = result.drop('Id',axis=1)
data_test = pd.get_dummies(data_test)

In [18]:
#这两个function是为了当有新数据进来的时候把维度补全。
def filter_traincolumns(x):
    if x in x_dummies.columns:
        return False
    else:
        return True
def filter_testcolumns(x):
    if x in data_test.columns:
        return False
    else:
        return True

In [19]:
data_test = pd.concat([data_test, pd.DataFrame(columns=list(filter(filter_testcolumns,x.columns)))])#使用上面的function补全特征
data_test = data_test.fillna(0)

In [20]:
data_test = data_test[columns_imports['columns_imports'].values] #使用筛选后的变量测试

In [21]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 0 to 1458
Columns: 290 entries, LotFrontage to SaleCondition_Partial
dtypes: float64(32), int64(258)
memory usage: 3.2 MB


## xgboost

In [22]:
result['SalePrice'] = xgb_model.predict(data_test)
#result['SalePrice'] = pd.DataFrame(result_pre,columns=y.columns).apply(lambda x:appendmax(x),axis=1)

result = result[['Id','SalePrice']]

result['Id'] = result['Id'].astype('int')
result['SalePrice'] = result[['SalePrice']].apply(lambda x: np.e**x,axis=1)
result_xgb = result.copy()

In [23]:
result.to_csv('submission.csv',index=False,encoding='utf8')

## svr

In [177]:
result['SalePrice'] = SVR_model.predict(data_test)
#result['SalePrice'] = pd.DataFrame(result_pre,columns=y.columns).apply(lambda x:appendmax(x),axis=1)

result = result[['Id','SalePrice']]

result['Id'] = result['Id'].astype('int')
result['SalePrice'] = result[['SalePrice']].apply(lambda x: np.e**x,axis=1)
result_svr = result.copy()

In [172]:
result.to_csv('submission.csv',index=False,encoding='utf8')

## 两个模型求平均

In [182]:
result_blending = result_xgb.merge(result_svr,how='left',on='Id')
result_blending.head()

result_blending['SalePrice'] = result_blending.apply(lambda x: (x[1]+x[2])/2,axis=1)
del result_blending['SalePrice_x']
del result_blending['SalePrice_y']

result.to_csv('submission.csv',index=False,encoding='utf8')

Unnamed: 0,Id,SalePrice_x,SalePrice_y
0,1461,118677.117188,114548.688081
1,1462,148592.34375,147928.757995
2,1463,179035.84375,177781.001241
3,1464,181278.296875,190208.556243
4,1465,190123.203125,198570.832119
