In [None]:
# Load the breast cancer dataset and wrap its feature matrix in a DataFrame.
import pandas as pd
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# One column per feature, labelled with the dataset's own feature names.
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)


In [None]:
# Attach the dataset's label array to the feature table as a 'target' column.
cancer_df['target'] = cancer['target']


In [None]:
# Preview the first rows of the combined feature + target table.
cancer_df.head()


In [None]:
import matplotlib.pyplot as plt

# Histogram of the label values, drawn on an explicitly created axes.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(cancer.target)


In [None]:
# Display the raw label array to inspect its values.
cancer.target


In [None]:
import seaborn as sns

# Pairwise correlations between every column of the table.
corrmat = cancer_df.corr()

# Draw the matrix on an explicit 12x9 axes; the colour scale is capped at 0.8.
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax);


In [None]:
import numpy as np

goal_parameter = 'target'

# Number of variables shown in the heatmap: every column of the correlation
# matrix. This used to be derived from the row count of cancer_df, which only
# worked because nlargest() returns at most as many rows as exist.
k = len(corrmat.columns)

# Column names ordered from most to least correlated with the target.
cols = corrmat.nlargest(k, goal_parameter)[goal_parameter].index

# Recompute the correlation matrix in that order; after the transpose each
# row holds one variable, which is the layout np.corrcoef expects.
cm = np.corrcoef(cancer_df[cols].values.T)

plt.subplots(figsize=(18, 18))
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()


In [None]:
# Random Forest baseline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Work on a copy so the table built in the earlier cells is left untouched.
result = cancer_df.copy()

goal_parameter = 'target'

# Explanatory variables: every column except the target.
# `columns=` is used because drop() no longer accepts the axis as a
# positional argument in pandas 2.0 and later.
X = result.drop(columns=goal_parameter)

# Target variable.
Y = result[goal_parameter]

# Hold out 10% of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=5)

# Fit a 100-tree forest. The forest is seeded as well, so the reported
# accuracy is the same on every run.
random_forest = RandomForestClassifier(n_estimators=100, random_state=5)
random_forest.fit(X_train, y_train)

# score() returns the mean accuracy on the held-out rows.
accuracy_random_forest = random_forest.score(X_test, y_test)
print("ランダムフォレストの精度:",accuracy_random_forest * 100,"%")


In [None]:
# Try PCA on the standardized features.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize: the scaler is fitted on the training split only and then
# reused unchanged for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Open question from the author: the variance does not look concentrated in a
# few components, so dimensionality reduction may not help much here?
pca = PCA(n_components=30).fit(X_train_scaled)

# Bar chart of the explained-variance ratio per component, numbered from 1.
variance_ratios = pca.explained_variance_ratio_
component_numbers = list(range(1, len(variance_ratios) + 1))
plt.bar(component_numbers, variance_ratios)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Try LightGBM.
import lightgbm as lgb

# Base classifier. `boosting_type` is the scikit-learn wrapper's own parameter
# name; the previous `boosting=` spelling is an alias for the same setting.
estimator = lgb.LGBMClassifier(learning_rate=0.2, n_estimators=100, boosting_type='gbdt')

# Only num_leaves is searched. The grid used to include 'metric': ('l1', 'l2'),
# but that setting only selects what LightGBM reports on evaluation sets. No
# eval_set is passed and GridSearchCV scores with the classifier's own
# accuracy, so it could not change the selected model and merely doubled the
# number of fits.
params = {
    'num_leaves': list(range(30, 200, 10)),
}
gridsearch = GridSearchCV(estimator, params)

# Unfitted 2-component PCA for the optional pipeline step that is currently
# disabled below.
pca = PCA(n_components=2)

# Transformer / estimator objects.
standardizer = StandardScaler()

pl = Pipeline([
    ('standardize', standardizer),
    #('pca', pca),
    ('clf', gridsearch)
    ])

pl.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = pl.predict(X_test)

# predict() already returns class labels, so no thresholding is needed for
# scoring. y_result is still provided as the same list of booleans as before.
y_result = [bool(label > 0.5) for label in y_pred]

# Accuracy: fraction of held-out rows whose predicted label matches.
accuracy = (y_test == y_pred).mean()
print("LightGBMの精度",accuracy * 100,"%")
