# Read sample data

本講座で用いる主なサンプルデータの読み込みをまとめました。コードを順に実行し結果を確認してみましょう。

In [2]:
# Boston house-prices dataset (regression sample).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm that the installed scikit-learn version still provides it.
import pandas as pd
from sklearn.datasets import load_boston

dataset = load_boston()

# Features become X (one column per feature name); the target becomes the
# single-column DataFrame y, named 'MEDV'.
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['MEDV'])

# Report sections, each printed beneath a rule line:
#   1. shapes of X and y
#   2. summary statistics of the target
#   3. first rows of X joined with y
#   4. the description text bundled with the dataset
rule = '----------------------------------------------------------------------------------------'
sections = (
    ('X shape: (%i,%i)' % X.shape, 'y shape: (%i,%i)' % y.shape),
    (y.describe(),),
    (X.join(y).head(),),
    (dataset.DESCR,),
)
for section in sections:
    print(rule, *section, sep='\n')

----------------------------------------------------------------------------------------
X shape: (506,13)
y shape: (506,1)
----------------------------------------------------------------------------------------
             MEDV
count  506.000000
mean    22.532806
std      9.197104
min      5.000000
25%     17.025000
50%     21.200000
75%     25.000000
max     50.000000
----------------------------------------------------------------------------------------
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21

In [3]:
# Big Mart sales amount dataset (regression sample).
# https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
import pandas as pd
import numpy as np

# header=0 uses the first row as column names; pass header=None when the file
# has no header row.
df = pd.read_csv('./data/av_big_mart_sales_UWu5bXk.csv', header=0)
X = df.iloc[:, :-1]      # all rows, every column except the last -> X
y = df.iloc[:, [-1]]     # all rows, last column only -> y (list index keeps it a DataFrame)

# check the shape
print('----------------------------------------------------------------------------------------')
print('X shape: (%i,%i)' % X.shape)
print('y shape: (%i,%i)' % y.shape)
print('----------------------------------------------------------------------------------------')
print(y.describe())
# A learning algorithm cannot be trained when the target variable contains
# missing values, so check for them here.
# y is a one-column DataFrame, so isnull().sum() is a one-element Series.
# Take its single element explicitly instead of letting '%i' call int() on
# the Series, which recent pandas versions deprecate.
print('----------------------------------------------------------------------------------------')
print('Check the null count of the target variable: %i' % y.isnull().sum().iloc[0])
print('----------------------------------------------------------------------------------------')
print(X.join(y).head())

----------------------------------------------------------------------------------------
X shape: (8523,11)
y shape: (8523,1)
----------------------------------------------------------------------------------------
       Item_Outlet_Sales
count        8523.000000
mean         2181.288914
std          1706.499616
min            33.290000
25%           834.247400
50%          1794.331000
75%          3101.296400
max         13086.964800
----------------------------------------------------------------------------------------
Check the null count of the target variable: 0
----------------------------------------------------------------------------------------
  Item_Identifier  Item_Weight Item_Fat_Content  Item_Visibility  \
0           FDA15         9.30          Low Fat         0.016047   
1           DRC01         5.92          Regular         0.019278   
2           FDN15        17.50          Low Fat         0.016760   
3           FDX07        19.20          Regular         0.00000

In [4]:
# Iris dataset (multi-class classification sample).
import pandas as pd
from sklearn.datasets import load_iris

dataset = load_iris()

# Features become X (one column per feature name); the class labels become
# the single-column DataFrame y, named 'y'.
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['y'])

# Report sections, each printed beneath a rule line:
#   1. shapes of X and y
#   2. number of rows per class label
#   3. first rows of X joined with y
#   4. the description text bundled with the dataset
rule = '----------------------------------------------------------------------------------------'
sections = (
    ('X shape: (%i,%i)' % X.shape, 'y shape: (%i,%i)' % y.shape),
    (y.groupby('y').size(),),
    (X.join(y).head(),),
    (dataset.DESCR,),
)
for section in sections:
    print(rule, *section, sep='\n')

----------------------------------------------------------------------------------------
X shape: (150,4)
y shape: (150,1)
----------------------------------------------------------------------------------------
y
0    50
1    50
2    50
dtype: int64
----------------------------------------------------------------------------------------
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  y
0                5.1               3.5                1.4               0.2  0
1                4.9               3.0                1.4               0.2  0
2                4.7               3.2                1.3               0.2  0
3                4.6               3.1                1.5               0.2  0
4                5.0               3.6                1.4               0.2  0
----------------------------------------------------------------------------------------
Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in 

In [5]:
# Breast cancer Wisconsin dataset (binary classification sample).
import pandas as pd
from sklearn.datasets import load_breast_cancer

dataset = load_breast_cancer()

# Features become X (one column per feature name); the class labels become
# the single-column DataFrame y, named 'y'.
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(dataset.target, columns=['y'])

# Report sections, each printed beneath a rule line:
#   1. shapes of X and y
#   2. number of rows per class label
#   3. first rows of X joined with y
# The dataset description is not printed here; append (dataset.DESCR,) as a
# final section to show it as well.
rule = '----------------------------------------------------------------------------------------'
sections = (
    ('X shape: (%i,%i)' % X.shape, 'y shape: (%i,%i)' % y.shape),
    (y.groupby('y').size(),),
    (X.join(y).head(),),
)
for section in sections:
    print(rule, *section, sep='\n')

----------------------------------------------------------------------------------------
X shape: (569,30)
y shape: (569,1)
----------------------------------------------------------------------------------------
y
0    212
1    357
dtype: int64
----------------------------------------------------------------------------------------
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017  

In [6]:
# Loan screening dataset (classification sample).
import pandas as pd

# header=0 uses the first row of the CSV as column names.
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)
X = df.iloc[:, :-1]      # all rows, every column except the last -> X
y = df.iloc[:, [-1]]     # all rows, last column only -> y (list index keeps it a DataFrame)

# check the shape
print('----------------------------------------------------------------------------------------')
print('X shape: (%i,%i)' % X.shape)
print('y shape: (%i,%i)' % y.shape)
print('----------------------------------------------------------------------------------------')
print(y.groupby(['Loan_Status']).size())
print('----------------------------------------------------------------------------------------')
# y is a one-column DataFrame, so isnull().sum() is a one-element Series.
# Take its single element explicitly instead of letting '%i' call int() on
# the Series, which recent pandas versions deprecate.
print('Check the null count of the target variable: %i' % y.isnull().sum().iloc[0])
print('----------------------------------------------------------------------------------------')
print(X.join(y).head())

# Convert the string labels to numbers ('N' -> 1, 'Y' -> 0).
# The algorithms would convert them implicitly, but do it explicitly here.
class_mapping = {'N':1, 'Y':0}
y_new = y.copy()
# Replace the whole column rather than writing through .loc[:, col]: on
# pandas >= 2.0 the .loc form writes into the existing object-dtype column in
# place, so the mapped values would not end up with an integer dtype.
y_new['Loan_Status'] = y_new['Loan_Status'].map(class_mapping)
print('----------------------------------------------------------------------------------------')
# Show the converted labels next to the original ones (suffix '_org').
print(y_new.join(y, rsuffix='_org').head())
print('----------------------------------------------------------------------------------------')
print(y_new.groupby(['Loan_Status']).size())

----------------------------------------------------------------------------------------
X shape: (614,12)
y shape: (614,1)
----------------------------------------------------------------------------------------
Loan_Status
N    192
Y    422
dtype: int64
----------------------------------------------------------------------------------------
Check the null count of the target variable: 0
----------------------------------------------------------------------------------------
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849               