# データ読み込み

- ファイルの確認
- ライブラリの読み込み
- データの読み込み
- 構造の把握
- 長さの把握
- 欠損の確認

In [1]:
!ls

beginner.ipynb	sub2.csv  sub5.csv	     test.csv
lightgbm.ipynb	sub3.csv  submit.csv	     train.csv
sub1.csv	sub4.csv  submit_sample.csv  重回帰分析.ipynb


In [2]:
# データ操作系
import pandas as pd
import numpy as np
import collections
# グラフ描画系
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# 前処理
import sklearn.preprocessing as sp
# データ分割
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
# 機械学習モデル
from sklearn.linear_model import LogisticRegression # ロジスティック回帰
from sklearn.ensemble import RandomForestClassifier # ランダムフォレスト
from sklearn.svm import LinearSVC # SVM
import lightgbm as lgb #LightGBM
# 制御系
import warnings
warnings.simplefilter('ignore')
import gc

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv("submit_sample.csv")

In [4]:
test.head()

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,0,35,technician,single,secondary,no,89043,no,no,cellular,7,feb,101,2,184,2,success
1,1,37,services,married,secondary,no,64372,yes,no,cellular,7,jul,158,3,241,0,unknown
2,2,31,services,single,secondary,no,31606,yes,no,unknown,15,may,152,2,47,0,unknown
3,3,31,admin.,married,secondary,no,94826,yes,no,cellular,27,may,345,2,490,0,unknown
4,4,32,services,married,secondary,no,100401,no,no,cellular,7,jan,126,1,686,0,other


In [5]:
sample.head()

Unnamed: 0,0,0.1
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [6]:
print(len(train))
print(len(test))
print(len(sample))

27100
18050
18049


In [7]:
train.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
test.isnull().sum()

id           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

# データの結合

- 同時に処理するため一時的にtrainとtestを結合する
- 長さの検算
- 欠損の検算

In [9]:
data = pd.concat([train, test], sort=False)

In [10]:
print(len(train), len(test), len(data))

27100 18050 45150


In [11]:
data.isnull().sum()

id               0
age              0
job              0
marital          0
education        0
default          0
balance          0
housing          0
loan             0
contact          0
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome         0
y            18050
dtype: int64

# 特徴量エンジニアリング

In [12]:
train.describe()

Unnamed: 0,id,age,balance,day,duration,campaign,pdays,previous,y
count,27100.0,27100.0,27100.0,27100.0,27100.0,27100.0,27100.0,27100.0,27100.0
mean,13549.5,36.073284,47682.901771,16.747565,229.325387,1.77583,432.482399,0.08572,0.077934
std,7823.240484,7.816417,31650.760036,8.569588,204.939958,0.950045,252.150648,0.365889,0.268072
min,0.0,22.0,-6847.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,6774.75,31.0,20015.75,8.0,121.0,1.0,214.0,0.0,0.0
50%,13549.5,33.0,47624.0,17.0,158.0,1.0,432.0,0.0,0.0
75%,20324.25,37.0,75330.0,26.0,345.0,2.0,650.0,0.0,0.0
max,27099.0,90.0,102121.0,31.0,3076.0,5.0,870.0,3.0,1.0


In [13]:
train.describe(include= 'O' )

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
count,27100,27100,27100,27100,27100,27100,27100,27100,27100
unique,11,3,4,2,2,2,3,11,4
top,blue-collar,married,secondary,no,yes,no,cellular,may,unknown
freq,5957,17565,15955,27090,15819,23651,19147,11232,23099


## age

## balance

## day

## duration

## campaign

## pdays

## previous

## job

## material

## education

## default

## housing

## loan

## contact

## month

## poutcome

## 整形完了後

In [None]:
data.head()

# 相関の確認

In [None]:
data.corr()

In [None]:
sns.heatmap(data.corr(),cmap="Reds")

In [None]:
sns.clustermap(data.corr())

# 採用するカラムの決定

In [None]:
data.columns

In [None]:
adopt_columns = []
categorical_features = []
adopt_data = data[adopt_columns]
adopt_data.head()

In [None]:
# 標準化？
scale = sp.StandardScaler()
scale.fit(adopt_data)

X_train = scale.transform(adopt_data)

## データをtrainとtestに戻す

In [None]:
train = adopt_data[:len(train)]
test = adopt_data[len(train):]

## データを分割する

In [None]:
y_train = train['y']
X_train = train.drop('y', axis=1)
X_test = test.drop('y', axis=1)

### ロジスティック回帰

In [None]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
clf = LogisticRegression(penalty='l2', solver="sag", random_state=0)
scoring = {
    'accuracy': 'accuracy',
    "recall": "recall_macro",
    "f":"f1_macro"
}
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=cv)

In [None]:
scores

In [None]:
print("精度(accuracy)")
print(scores["test_accuracy"].mean())
print("検出率(recall)")
print(scores["test_recall"].mean())
print("F値(f)")
print(scores["test_f"].mean())

In [None]:
clf.fit(X_train, y_train)
logistic_pred = clf.predict(X_test)

In [None]:
logistic_pred = list(map(int, logistic_pred))
collections.Counter(logistic_pred)

In [None]:
clf.coef_

### SVM

In [None]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
clf = LinearSVC()
scoring = {
    'accuracy': 'accuracy',
    "recall": "recall_macro",
    "f":"f1_macro"
}
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=cv)

In [None]:
scores

In [None]:
print("精度(accuracy)")
print(scores["test_accuracy"].mean())
print("検出率(recall)")
print(scores["test_recall"].mean())
print("F値(f)")
print(scores["test_f"].mean())

In [None]:
clf.fit(X_train, y_train)
svm_pred = clf.predict(X_test)

In [None]:
svm_pred = list(map(int, svm_pred))
collections.Counter(svm_pred)

In [None]:
clf.coef_

### LightGBT

In [None]:
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
scoring = {
    'accuracy': 'accuracy',
    "recall": "recall_macro",
    "f":"f1_macro"
}
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'objective': 'binary'
}

clf = lgb.LGBMClassifier(
    **params
)

clf.fit(X_train,y_train)
scores = cross_validate(clf, X_train, y_train, scoring=scoring, cv=cv)

In [None]:
scores

In [None]:
print("精度(accuracy)")
print(scores["test_accuracy"].mean())
print("検出率(recall)")
print(scores["test_recall"].mean())
print("F値(f)")
print(scores["test_f"].mean())

In [None]:
light_pred = clf.predict(X_test)

In [None]:
svm_pred = list(map(int, light_pred))
collections.Counter(light_pred)

In [None]:
clf.coef_

# 提出準備

In [None]:
test["y"] = light_pred

In [None]:
test["y"].value_counts()

In [None]:
test["y"].to_csv("sub5.csv", index=True, header=False)

In [None]:
!ls

In [None]:
!cat sub5.csv