# ニフティインターン 機械学習モデル開発3日間コース チュートリアルノートブック

## ライブラリのインポート

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

## データの取得

In [5]:
# 初回だけ実行すれば大丈夫です
! aws s3 cp s3://nifty-machine-learning-internship-2020/ ./input --recursive

download: s3://nifty-machine-learning-internship-2020/sample_submission.csv to input/sample_submission.csv
download: s3://nifty-machine-learning-internship-2020/test.csv to input/test.csv
download: s3://nifty-machine-learning-internship-2020/train.csv to input/train.csv
download: s3://nifty-machine-learning-internship-2020/service_use_history.csv to input/service_use_history.csv


## データの読み込み

In [5]:
train = pd.read_csv('input/train.csv')
test=pd.read_csv("input/test.csv")
train["prefecture"].unique()

array(['鹿児島', '北海道', '長野', '福島', '愛知', '神奈川', '大阪', '石川', '長崎', '山口',
       '埼玉', '東京', '三重', '宮城', '福岡', '新潟', '茨城', '青森', '兵庫', '千葉', '秋田',
       '香川', '京都', '山形', '岡山', '滋賀', '富山', '熊本', '佐賀', '群馬', '広島', '栃木',
       '沖縄', '静岡', '奈良', '島根', '大分', '徳島', '二府亭県', '高知', '愛媛', '宮崎', '岩手',
       '岐阜', '福井', '山梨', '和歌山', '鳥取'], dtype=object)

## データの確認

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 15 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   user_id          100000 non-null  object
 1   user_agent       100000 non-null  object
 2   entry_date       100000 non-null  object
 3   open_date        100000 non-null  object
 4   birthday         99459 non-null   object
 5   id_status        100000 non-null  object
 6   gender           100000 non-null  object
 7   blood_type       100000 non-null  object
 8   pay_method_type  100000 non-null  object
 9   course_name      100000 non-null  object
 10  price_type       100000 non-null  object
 11  entry_from       100000 non-null  object
 12  privilege_name   100000 non-null  object
 13  prefecture       100000 non-null  object
 14  close_flag       100000 non-null  int64 
dtypes: int64(1), object(14)
memory usage: 11.4+ MB


In [8]:
train.describe(include='all')

Unnamed: 0,user_id,user_agent,entry_date,open_date,birthday,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture,close_flag
count,100000,100000,100000,100000,99459,100000,100000,100000,100000,100000,100000,100000,100000,100000,100000.0
unique,100000,78651,2190,2337,18714,2,2,4,3,3,4,6,4,48,
top,USLTHP32419,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,2018-09-18,2020-01-07,1984-08-12,新規,男性,A,クレジットカード,@nifty光,マンション2年プラン,直販WEB,キャッシュバック還元,東京,
freq,1,76,76,72,22,80720,77015,38547,43436,35733,40704,24824,32455,10555,
mean,,,,,,,,,,,,,,,0.09988
std,,,,,,,,,,,,,,,0.299841
min,,,,,,,,,,,,,,,0.0
25%,,,,,,,,,,,,,,,0.0
50%,,,,,,,,,,,,,,,0.0
75%,,,,,,,,,,,,,,,0.0


## 使わないカラムの削除

In [9]:
datetime_columns = ['entry_date', 'open_date', 'birthday']
for column in datetime_columns:
    train = train.drop(column, axis=1)

#消し方    
train = train.drop('user_agent', axis=1)

train.head()

Unnamed: 0,user_id,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture,close_flag
0,YKJGBC19356,新規,男性,A,クレジットカード,@nifty光,マンション2年プラン,代理店取次,キャッシュバック還元,鹿児島,0
1,JMFEBB08712,新規,女性,O,クレジットカード,@nifty光,ホーム2年プラン,代理店取次,付帯サービス無料,北海道,0
2,EXLFTB26665,既存,女性,AB,クレジットカード,@nifty auひかり,ホーム2年プラン,ノジマ取次,月額割引還元,長野,0
3,MWEEMY19722,新規,男性,B,クレジットカード,@nifty光,マンション2年プラン,代理店取次,キャッシュバック還元,福島,0
4,WWTLYT31226,新規,男性,A,クレジットカード,@nifty auひかり,マンション3年プラン,WEB代理店,付帯サービス無料,愛知,0


## IDの処理

In [10]:
train = train.set_index('user_id')
train.head()

Unnamed: 0_level_0,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture,close_flag
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YKJGBC19356,新規,男性,A,クレジットカード,@nifty光,マンション2年プラン,代理店取次,キャッシュバック還元,鹿児島,0
JMFEBB08712,新規,女性,O,クレジットカード,@nifty光,ホーム2年プラン,代理店取次,付帯サービス無料,北海道,0
EXLFTB26665,既存,女性,AB,クレジットカード,@nifty auひかり,ホーム2年プラン,ノジマ取次,月額割引還元,長野,0
MWEEMY19722,新規,男性,B,クレジットカード,@nifty光,マンション2年プラン,代理店取次,キャッシュバック還元,福島,0
WWTLYT31226,新規,男性,A,クレジットカード,@nifty auひかり,マンション3年プラン,WEB代理店,付帯サービス無料,愛知,0


## カテゴリデータの処理

In [11]:
category_columns = ['id_status', 'gender', 'blood_type', 'pay_method_type', 'course_name', 'price_type', 'entry_from', 'privilege_name', 'prefecture']

encoders = {}
for column in category_columns:
    le = LabelEncoder()
    le.fit(train[column])
    train[column] = le.transform(train[column])
    encoders[column] = le
    
train.head()

Unnamed: 0_level_0,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture,close_flag
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
YKJGBC19356,0,1,0,1,2,2,3,0,47,0
JMFEBB08712,0,0,3,1,2,0,3,1,5,0
EXLFTB26665,1,0,1,1,0,0,2,2,41,0
MWEEMY19722,0,1,2,1,2,2,3,0,36,0
WWTLYT31226,0,1,0,1,0,3,0,1,25,0


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, YKJGBC19356 to VTAMKV51107
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   id_status        100000 non-null  int64
 1   gender           100000 non-null  int64
 2   blood_type       100000 non-null  int64
 3   pay_method_type  100000 non-null  int64
 4   course_name      100000 non-null  int64
 5   price_type       100000 non-null  int64
 6   entry_from       100000 non-null  int64
 7   privilege_name   100000 non-null  int64
 8   prefecture       100000 non-null  int64
 9   close_flag       100000 non-null  int64
dtypes: int64(10)
memory usage: 8.4+ MB


## データセットの分割

In [13]:
y = train['close_flag']
X = train.drop('close_flag', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(90000, 9) (10000, 9) (90000,) (10000,)


In [14]:
X_train.head()

Unnamed: 0_level_0,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
RHRRWA99067,0,1,1,0,1,1,5,0,25
MHWBZQ27906,0,1,0,1,2,2,3,1,8
SYAHAZ07444,0,1,0,1,1,2,5,1,12
AJVZVQ72485,0,1,0,0,1,3,5,1,8
IYSXCU88050,0,0,2,0,0,3,5,1,4


In [15]:
y_train.head()

user_id
RHRRWA99067    0
MHWBZQ27906    0
SYAHAZ07444    0
AJVZVQ72485    0
IYSXCU88050    0
Name: close_flag, dtype: int64

## モデルの学習と評価

### 全てのデータに対して0と予測

In [16]:
all_zero = [0 for _ in range(X_test.shape[0])]

print(f'accuracy_score: {accuracy_score(y_test, all_zero)}')
print(f'roc_auc_score: {roc_auc_score(y_test, all_zero)}')

accuracy_score: 0.9052
roc_auc_score: 0.5


### ロジスティック回帰

In [17]:
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)

y_pred_prob = lr.predict_proba(X_test)[:,1]
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

print(f'accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred_prob)}')

pd.DataFrame(confusion_matrix(y_test, y_pred), index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

accuracy_score: 0.9052
roc_auc_score: 0.5848192976911646


Unnamed: 0,predicted_0,predicted_1
actual_0,9052,0
actual_1,948,0


## XGboost

In [18]:
!pip install xgboost
import xgboost as xgb

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [25]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
xgb_params = {
        # 評価指標
        'eval_metric': 'logloss',
    }

bst = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=100,  
                    )
    
y_pred_proba = bst.predict(dtest)
    
y_pred = np.where(y_pred_proba > 0.5, 1, 0)
    
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)
pd.DataFrame(confusion_matrix(y_test, y_pred), index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

Accuracy: 0.9045


Unnamed: 0,predicted_0,predicted_1
actual_0,9041,11
actual_1,944,4


## lightGBM

In [27]:
! pip install lightgbm
import lightgbm as lgb


Collecting lightgbm
  Downloading lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.0 MB/s eta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [28]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)

In [29]:
params = {'metric': 'rmse',
          'max_depth' : 9}

In [30]:
gbm = lgb.train(params,
                lgb_train,
                valid_sets=lgb_eval,
                num_boost_round=10000,
                early_stopping_rounds=100,
                verbose_eval=50)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.282555
[100]	valid_0's rmse: 0.282326
[150]	valid_0's rmse: 0.282525
[200]	valid_0's rmse: 0.282842
Early stopping, best iteration is:
[101]	valid_0's rmse: 0.282311


In [33]:
predicted = gbm.predict(X_test)
print(predicted)
acc = accuracy_score(y_test, predicted)
print('Accuracy:', acc)

[0.03051317 0.07272898 0.12254629 ... 0.01557537 0.20353397 0.07646524]


ValueError: Classification metrics can't handle a mix of binary and continuous targets

### ランダムフォレスト

In [18]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

y_pred_prob = rf.predict_proba(X_test)[:,1]
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_prob]

print(f'accuracy_score: {accuracy_score(y_test, y_pred)}')
print(f'roc_auc_score: {roc_auc_score(y_test, y_pred_prob)}')

pd.DataFrame(confusion_matrix(y_test, y_pred), index=['actual_0', 'actual_1'], columns=['predicted_0', 'predicted_1'])

accuracy_score: 0.8835
roc_auc_score: 0.654315152396561


Unnamed: 0,predicted_0,predicted_1
actual_0,8783,269
actual_1,896,52


## test.csvの予測の作成

In [19]:
test = pd.read_csv('input/test.csv')
test.head()

Unnamed: 0,user_id,user_agent,entry_date,open_date,birthday,id_status,gender,blood_type,pay_method_type,course_name,price_type,entry_from,privilege_name,prefecture
0,HLSZTB50403,Mozilla/5.0 (Android 3.2.5; Mobile; rv:50.0) G...,2017-11-14,2018-03-07,1996-04-04,新規,女性,O,口座振替,@nifty ドコモ光,ホーム2年プラン,代理店取次,月額割引還元,福岡
1,FCQNUE35187,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_7_3 rv...,2016-09-24,2016-12-06,1976-04-08,新規,女性,A,口座振替,@nifty光,マンション2年プラン,ノジマ取次,付帯サービス無料,大阪
2,IYSBZQ82404,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,2016-09-05,2016-11-08,2002-05-18,新規,女性,AB,口座振替,@nifty auひかり,マンション2年プラン,代理店取次,付帯サービス無料,二府亭県
3,BPIREY57201,Mozilla/5.0 (Macintosh; PPC Mac OS X 10_7_1; r...,2018-03-13,2018-04-14,1995-04-15,新規,女性,A,クレジットカード,@nifty光,マンション2年プラン,キャリア取次,キャッシュバック還元,福岡
4,ISWWBF46615,Mozilla/5.0 (Windows CE) AppleWebKit/533.2 (KH...,2016-11-20,2017-02-02,2000-02-21,新規,男性,B,クレジットカード,@nifty auひかり,ホーム3年プラン,キャリア取次,キャッシュバック還元,愛知


In [7]:
test = test.set_index('user_id')

datetime_columns = ['entry_date', 'open_date', 'birthday']
for column in datetime_columns:
    test = test.drop(column, axis=1)
test = test.drop('user_agent', axis=1)

category_columns = ['id_status', 'gender', 'blood_type', 'pay_method_type', 'course_name', 'price_type', 'entry_from', 'privilege_name', 'prefecture']
for column in category_columns:
    le = encoders[column]
    le.fit(test[column])
    test[column] = le.transform(test[column])


NameError: name 'encoders' is not defined

In [21]:
! mkdir -v output

mkdir: created directory ‘output’


In [22]:
submission = pd.DataFrame({'user_id': test.index, 'close_flag': rf.predict_proba(test)[:,1]})
submission.head()

Unnamed: 0,user_id,close_flag
0,HLSZTB50403,0.13
1,FCQNUE35187,0.433556
2,IYSBZQ82404,0.1675
3,BPIREY57201,0.097595
4,ISWWBF46615,0.011667


In [23]:
submission.to_csv('output/my_submission.csv', index=False)
! head output/my_submission.csv

user_id,close_flag
HLSZTB50403,0.13
FCQNUE35187,0.4335555555555555
IYSBZQ82404,0.1675
BPIREY57201,0.09759523809523808
ISWWBF46615,0.011666666666666665
VDXVGA70627,0.0
BHPCVA00465,0.02
HNNPGY38481,0.63
OQNEPI63524,0.02


## Kaggleへ予測結果を提出

### Webサイトを利用して提出

1. 左のメニューから予測結果をダウンロードします  

<img src="images/download.png" align="left"/>

2. [コンペサイトの提出ページ](https://www.kaggle.com/c/competition-0819/submit)にアクセスします
    - コンペに参加していない場合は[こちら](https://www.kaggle.com/t/7155c36646d64acf93c04433208eb510)から参加してください

3. 予測結果を提出します
    - 予測結果をアップロード
        - チェックマークが表示されたらアップロード完了です  
    - 説明を記入する
    - ```Make Submission```を押して提出する
    
<img src="images/step1.png" align="left"/>
<img src="images/step2.png" align="left"/>
<img src="images/submit.png" align="left"/>

4. ```My Submissins```でスコアを確認できます

<img src="images/score.png" align="left"/>

### 応用編：APIを利用して提出

In [24]:
# 初回だけ実行すれば大丈夫です
! sudo pip install kaggle

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/62/ab/bb20f9b9e24f9a6250f95a432f8d9a7d745f8d24039d7a5a6eaadb7783ba/kaggle-1.5.6.tar.gz (58kB)
[K    100% |████████████████████████████████| 61kB 6.5MB/s ta 0:00:011
Collecting tqdm (from kaggle)
[?25l  Downloading https://files.pythonhosted.org/packages/28/7e/281edb5bc3274dfb894d90f4dbacfceaca381c2435ec6187a2c6f329aed7/tqdm-4.48.2-py2.py3-none-any.whl (68kB)
[K    100% |████████████████████████████████| 71kB 14.0MB/s ta 0:00:01
[?25hCollecting python-slugify (from kaggle)
  Downloading https://files.pythonhosted.org/packages/9f/42/e336f96a8b6007428df772d0d159b8eee9b2f1811593a4931150660402c0/python-slugify-4.0.1.tar.gz
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
[?25l  Downloading https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl (78kB)
[K    100% |████████████████████████████████| 81kB 15.0M

In [25]:
# 初回だけ実行すれば大丈夫です
# usernameとkeyはWebから入手できます。各自で書き換えてください
! mkdir ~/.kaggle
! echo '{"username":"","key":""}' > ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
! ls -l ~/.kaggle/kaggle.json

-rw------- 1 ec2-user ec2-user 25 Aug 19 05:01 /home/ec2-user/.kaggle/kaggle.json


In [26]:
! kaggle competitions submit -c competition-0819 -f output/my_submission.csv -m "example"

401 - Unauthorized


In [27]:
! kaggle competitions submissions -c competition-0819 | head -n 3

401 - Unauthorized
