# Sklearn学习之朴素贝叶斯

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [2]:
news=fetch_20newsgroups(subset='all')

In [3]:
# 进行数据分割
x_train,x_test,y_train,y_test=train_test_split(news.data,news.target,test_size=(0.25))

In [4]:
# 对数据进行特征抽取
tf=TfidfVectorizer()

In [5]:
#  以训练集当中的词的列表进行每篇文章重要性统计
x_train=tf.fit_transform(x_train)

In [6]:
# 注意这里不能再次fit
x_test=tf.transform(x_test)

In [7]:
# 进行朴素贝叶斯算法预测
mlt=MultinomialNB(alpha=1.0)
mlt.fit(x_train,y_train)

MultinomialNB()

In [8]:
y_predict=mlt.predict(x_test) 

In [9]:
mlt.score(x_test,y_test)

0.8508064516129032

In [12]:
# 产生分类报告
# 每个类别的精确率和召回率
print(classification_report(y_test,y_predict,target_names=news.target_names))

precision    recall  f1-score   support

             alt.atheism       0.87      0.77      0.82       198
           comp.graphics       0.86      0.73      0.79       244
 comp.os.ms-windows.misc       0.88      0.85      0.87       248
comp.sys.ibm.pc.hardware       0.79      0.84      0.82       264
   comp.sys.mac.hardware       0.95      0.87      0.91       243
          comp.windows.x       0.93      0.85      0.89       253
            misc.forsale       0.93      0.70      0.80       272
               rec.autos       0.93      0.90      0.91       267
         rec.motorcycles       0.95      0.95      0.95       256
      rec.sport.baseball       0.93      0.98      0.96       243
        rec.sport.hockey       0.94      0.97      0.95       239
               sci.crypt       0.69      0.99      0.81       236
         sci.electronics       0.85      0.82      0.84       236
                 sci.med       0.98      0.83      0.90       241
               sci.space       0.91

# sklearn学习之决策树

In [2]:
# 读取数据
titan=pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

In [3]:
x=titan[['pclass','age','sex']]
y=titan['survived']

In [4]:
# 处理缺失值
x['age'].fillna(x['age'].mean(),inplace=True)

In [5]:
# 分割数据集到训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

In [6]:
# 进行处理（特征处理），特征是类别时，用one_hot编码
dict=DictVectorizer(sparse=False)
x_train=dict.fit_transform(x_train.to_dict(orient='records'))
x_test=dict.transform(x_test.to_dict(orient='records'))

In [9]:
dict.get_feature_names()

['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male']

In [13]:
print(x_train)

[[31.19418104  0.          0.          1.          0.          1.        ]
 [23.          1.          0.          0.          1.          0.        ]
 [31.19418104  0.          1.          0.          1.          0.        ]
 ...
 [ 6.          1.          0.          0.          0.          1.        ]
 [31.19418104  0.          0.          1.          1.          0.        ]
 [25.          0.          1.          0.          1.          0.        ]]


In [21]:
# 用决策数进行预测
dec=DecisionTreeClassifier()
dec.fit(x_train,y_train)

DecisionTreeClassifier()

In [22]:
# 预测准确率
print('预测的准确率：',dec.score(x_test,y_test))

预测的准确率： 0.7872340425531915


In [25]:
# 导出树的结构
export_graphviz(dec,out_file='./tree.dot',feature_names=['age', 'pclass=1st', 'pclass=2nd', 'pclass=3rd', 'sex=female', 'sex=male'])

# Sklearn学习之随机森林预测

In [2]:
# 读取数据
titan=pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

x=titan[['pclass','age','sex']]
y=titan['survived']

# 处理缺失值
x['age'].fillna(x['age'].mean(),inplace=True)

# 分割数据集到训练集和测试集
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25)

# 进行处理（特征处理），特征是类别时，用one_hot编码
dict=DictVectorizer(sparse=False)
x_train=dict.fit_transform(x_train.to_dict(orient='records'))
x_test=dict.transform(x_test.to_dict(orient='records'))

In [3]:
# 随机森林预测
rf=RandomForestClassifier()

# 设置参数
param={'n_estimators':[120,200,300,500,800,1200],'max_depth':[5,8,15,25,30]}

# 网格搜索与交叉验证
gc=GridSearchCV(rf,param_grid=param,cv=10)

gc.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 8, 15, 25, 30],
                         'n_estimators': [120, 200, 300, 500, 800, 1200]})

In [4]:
print('预测准确率为：',gc.score(x_test,y_test))

预测准确率为： 0.8115501519756839


In [5]:
print('查看选择参数模型：',gc.best_params_)

查看选择参数模型： {'max_depth': 5, 'n_estimators': 300}
