In [77]:
import pandas as pd
import matplotlib.pyplot as plt 


In [78]:
df = pd.read_csv("house-prices.csv")
df = pd.concat((df,pd.get_dummies(df["Brick"]),pd.get_dummies(df["Neighborhood"])),axis=1)
del df["Brick"]
del df["Neighborhood"]
del df["No"]#因为Yes = 1-No
del df["North"]
del df["Home"]

In [79]:
#使用假设检验法
import statsmodels.api as sm
Y = df["Price"].values
X = df[["SqFt","Bedrooms","Bathrooms","Offers","Yes","East","West"]].values
X_ = sm.add_constant(X)
#使用最小平方法
result = sm.OLS(Y,X_)
#fit方法运行计算
summary = result.fit()
#调用summary2方法，打印出假设检验的系列信息
summary.summary2()
#名词解释：


0,1,2,3
Model:,OLS,Adj. R-squared:,0.861
Dependent Variable:,y,AIC:,2729.319
Date:,2017-11-10 16:04,BIC:,2752.1352
No. Observations:,128,Log-Likelihood:,-1356.7
Df Model:,7,F-statistic:,113.3
Df Residuals:,120,Prob (F-statistic):,8.25e-50
R-squared:,0.869,Scale:,100380000.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,2159.4982,8877.8097,0.2432,0.8082,-15417.9471,19736.9435
x1,52.9937,5.7342,9.2416,0.0000,41.6403,64.3471
x2,4246.7939,1597.9108,2.6577,0.0089,1083.0416,7410.5462
x3,7883.2785,2117.0354,3.7237,0.0003,3691.6957,12074.8613
x4,-8267.4883,1084.7768,-7.6214,0.0000,-10415.2709,-6119.7058
x5,17297.3495,1981.6164,8.7289,0.0000,13373.8870,21220.8120
x6,-1560.5791,2396.7654,-0.6511,0.5162,-6306.0079,3184.8496
x7,20681.0374,3148.9538,6.5676,0.0000,14446.3280,26915.7467

0,1,2,3
Omnibus:,3.026,Durbin-Watson:,1.921
Prob(Omnibus):,0.22,Jarque-Bera (JB):,2.483
Skew:,0.268,Prob(JB):,0.289
Kurtosis:,3.421,Condition No.:,20328.0


## 名词解释
###### Coef 回归系数
###### Std.Err 标准差
###### t 虚无假设成立时的t值
###### P>|t| 虚无假设成立时的概率值
###### [0.025 ,0.975] 97.5%置信估计
###### 要做假设性检验，首先要设置显著性标准。a.假设显著性标准是0.01 b.推翻虚无假设的标准是 p < 0.01 c.上面的SqFt的t=9.2416,P（>t） = 0.0000 < 0.01,因此虚无假设被推翻（这里的虚无假设是SqFt对price的回归系数为0，即SqFt与price不相关）

## F统计

###### 回归平方和 Regression Square Sum [RSS] :依变量的变化归咎于回归模型  A = sum((y-y*)^2)
###### 误差平方和 Error Square Sum [ESS] : 依变量的变化归咎于线性模型  B = sum((y-y')^2)
###### 总的平方和 Total Square Sum [TSS] : 依变量整体变化 C = A+B
###### 回归平方平均  Model Mean Square: =RSS/Regression d.f(k) k=自变数的数量
###### 误差平方平均 Error Mean Square:= ESS / Error d.f(n-k-1) n=观测值得数量
###### F统计 F = Model Mean Square / Error Mean Square 
### F值越大越好，Prob(F-statistic)越小越好


## R Square

###### 回归可以解释的变异比例，可以作为自变量预测因变量准确度的指标
###### SSE （残差平方和） = sum((y-y')^2)
###### SST （整体平方和） = sum((yi-yavg)^2)
###### R^2 = 1-SSE/SST 一般要大于0.6,0.7才算好

## Adjust R Square

###### R^2 = 1-SSE/SST   SSE最小，推导出R^2不会递减 
###### yi = b1x1 + b2x2 + .... bkxk + .... 增加任何一个变量还会增加R^2
###### Adj R^2 = 1-(1-R^2)*((n-1)/(n-p-1))
###### n为总体大小，p为回归因子个数


## AIC/BIC


###### AIC （The Akaike Information  Criterion）= 2K + nln(SSE/n) k是参数数量，n是观察数，SSE是残差平方和。 AIC鼓励数据拟合的优良性，但是尽量避免出现过拟合，所以优先考虑的模型应该是AIC最小的那一个，赤池信息量的准则是寻找可以最好的解释数据但是包含最少自由参数的模型
###### BIC (The Bayesain Information Criterion)

## 选择AIC最小的前10个数据组合

In [80]:
fields = ["SqFt","Bedrooms","Bathrooms","Offers","Yes","East","West"]
fields

['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'East', 'West']

In [81]:
import itertools
AICs = {}
for k in range(1,len(fields)+1):
    for varaibles in itertools.combinations(fields,k):
        predictors = X[list(varaibles)]
        predictors2 = sm.add_constant(predictors)
        ols = sm.OLS(Y,predictors2)
        res = ols.fit()
        AICs[varaibles] = res.aic
        
from collections import Counter
c = Counter(AICs)
c.most_common()[::-10]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [75]:
#这里选用('SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'West')属性作为多元线性回归的特征属性
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
model = linear.fit(X[['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'West']],Y)
model

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [69]:
res = model.predict(X[['SqFt', 'Bedrooms', 'Bathrooms', 'Offers', 'Yes', 'West']])

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices