# Pythonのバージョン確認

In [1]:
!python -V

Python 3.6.6 :: Anaconda, Inc.


# ライブラリーのインポート

In [2]:
import os, gc, pickle, datetime, sys
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

# データの読み込み

In [5]:
INPUTPATH = '../input'
print(os.listdir(INPUTPATH))

['gender_submission.csv', 'test.csv', 'train.csv']


In [23]:
df = pd.read_csv(f'{INPUTPATH}/train.csv')
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


In [5]:
#データ型の確認
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
#欠損値の確認
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# データ型を変える

In [7]:
#データ型を倍精度から単精度に変える
dfare1 = df.Fare.nbytes
print('float64の時のFareのデータサイズは：{:.2f} KB'.format(dfare1/1024))
dfare2 = df.Fare.astype('float32').nbytes
print('float32の時のFareのデータサイズは：{:.2f} KB'.format(dfare2/1024))
print('データ型を変えることで{:.2f}%ファイルサイズを減らせました'.format(100*(1-dfare2/dfare1)))

float64の時のFareのデータサイズは：6.96 KB
float32の時のFareのデータサイズは：3.48 KB
データ型を変えることで50.00%ファイルサイズを減らせました


In [9]:
#データ型を変えると浮動小数精度が悪くなることも念頭に
print(df.Fare.max())
print(df.Fare.astype('float16').max())

512.3292
512.5


In [10]:
#データの最大、最小値に影響のない範囲でデータ型を変えないと、データ値そのものも変わってしまう
print(df.PassengerId.max())
print(df.PassengerId.astype('int8').max())

891
127


# read_csvで一気にデータ型を指定する

In [11]:
#dtypeの辞書のもとを作成する
dtype_dict=df.dtypes.to_dict()
dtype_dict

{'PassengerId': dtype('int64'),
 'Survived': dtype('int64'),
 'Pclass': dtype('int64'),
 'Name': dtype('O'),
 'Sex': dtype('O'),
 'Age': dtype('float64'),
 'SibSp': dtype('int64'),
 'Parch': dtype('int64'),
 'Ticket': dtype('O'),
 'Fare': dtype('float64'),
 'Cabin': dtype('O'),
 'Embarked': dtype('O')}

In [12]:
dtype_dict ={'PassengerId': np.dtype('int32'),
 'Survived': np.dtype('int32'),
 'Pclass': np.dtype('int32'),
 'Name': np.dtype('O'),
 'Sex': np.dtype('O'),
 'Age': np.dtype('float32'),
 'SibSp': np.dtype('int32'),
 'Parch': np.dtype('int32'),
 'Ticket': np.dtype('O'),
 'Fare': np.dtype('float32'),
 'Cabin': np.dtype('O'),
 'Embarked': np.dtype('O')}

In [13]:
dtype_dict

{'PassengerId': dtype('int32'),
 'Survived': dtype('int32'),
 'Pclass': dtype('int32'),
 'Name': dtype('O'),
 'Sex': dtype('O'),
 'Age': dtype('float32'),
 'SibSp': dtype('int32'),
 'Parch': dtype('int32'),
 'Ticket': dtype('O'),
 'Fare': dtype('float32'),
 'Cabin': dtype('O'),
 'Embarked': dtype('O')}

In [14]:
#データ型を変えることによって、データそのものに影響がないか最大最小を確認する
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
#ユーザー定義したデータ型辞書に基づいてデータを読み込む
df = pd.read_csv(f'{INPUTPATH}/train.csv',dtype=dtype_dict)
print(df.dtypes)

PassengerId      int32
Survived         int32
Pclass           int32
Name            object
Sex             object
Age            float32
SibSp            int32
Parch            int32
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object


# 便利な関数を使う

In [17]:
def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [24]:
df = pd.read_csv(f'{INPUTPATH}/train.csv')
df = reduce_mem_usage(df, use_float16=False)

Memory usage of dataframe is 0.08 MB
Memory usage after optimization is: 0.10 MB
Decreased by -17.0%


In [25]:
df.dtypes

PassengerId       int16
Survived           int8
Pclass             int8
Name           category
Sex            category
Age             float32
SibSp              int8
Parch              int8
Ticket         category
Fare            float32
Cabin          category
Embarked       category
dtype: object

# Category型を使う

In [27]:
df = pd.read_csv(f'{INPUTPATH}/train.csv')
dsex1 = df.Sex.nbytes
print('objectの時のFareのデータサイズは：{:.2f} KB'.format(dsex1/1024))
dsex2 = df.Fare.astype('category').nbytes
print('categoryの時のFareのデータサイズは：{:.2f} KB'.format(dsex2/1024))
print('データ型を変えることで{:.2f}%ファイルサイズを減らせました'.format(100*(1-dsex2/dsex1)))

objectの時のFareのデータサイズは：6.96 KB
categoryの時のFareのデータサイズは：3.68 KB
データ型を変えることで47.17%ファイルサイズを減らせました


In [28]:
#category型にした後だと欠損値補完でエラーが出る
df.Cabin = df.Cabin.astype('category')
df.Cabin = df.Cabin.fillna('null')

ValueError: fill value must be in categories

In [30]:
#先に欠損値補完してから、category型にする
df = pd.read_csv(f'{INPUTPATH}/train.csv')
df.Cabin = df.Cabin.fillna('null')
df.Cabin = df.Cabin.astype('category')

In [32]:
#もしくは、新たにカテゴリを追加する
df = pd.read_csv(f'{INPUTPATH}/train.csv')
df.Cabin = df.Cabin.astype('category')
df.Cabin = df.Cabin.cat.add_categories('null').fillna('null')

# Sparse Data Structureを使う

In [6]:
df = pd.read_csv(f'{INPUTPATH}/train.csv')
for var in df.dtypes[df.dtypes =='object'].index.tolist():
    print('Unique level of '+var+' is {:}'.format(len(df[var].unique())))

Unique level of Name is 891
Unique level of Sex is 2
Unique level of Ticket is 681
Unique level of Cabin is 148
Unique level of Embarked is 4


In [9]:
%%time
dummy_list = ['Ticket', 'Cabin', 'Embarked']
df_dummies = pd.get_dummies(df[dummy_list], dummy_na=True, sparse=False, prefix = dummy_list)
df_dummies.shape

CPU times: user 16.5 ms, sys: 0 ns, total: 16.5 ms
Wall time: 18.7 ms


(891, 834)

In [35]:
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Columns: 834 entries, Ticket_110152 to Embarked_nan
dtypes: uint8(834)
memory usage: 725.8 KB


In [36]:
print(sys.getsizeof(df_dummies)/1024)
print(df_dummies.memory_usage().sum()/1024)

725.826171875
725.802734375


In [38]:
df_dummies.head(1)

Unnamed: 0,Ticket_110152,Ticket_110413,Ticket_110465,Ticket_110564,Ticket_110813,Ticket_111240,Ticket_111320,Ticket_111361,Ticket_111369,Ticket_111426,...,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Cabin_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [39]:
df_dummies.Ticket_110152.sum()

3

In [10]:
%%time
df_dummies2 = pd.get_dummies(df[dummy_list], dummy_na=True, sparse=True, prefix = dummy_list)

CPU times: user 199 ms, sys: 155 µs, total: 200 ms
Wall time: 201 ms


In [41]:
df_dummies2.memory_usage().sum()/1024

13.1767578125

In [42]:
#Sparse Data Structureだと通常のDataFrameのメソッドがエラーになる
df_dummies2.Ticket_110152.sum()

TypeError: sum() got an unexpected keyword argument 'min_count'

In [43]:
#Sparseから元に戻す必要がある
np.asarray(df_dummies2.Ticket_110152).sum()

3

## groupbyするときの工夫

In [15]:
df_all = pd.concat([df, df_dummies], axis=1)
df_all.shape

(891, 846)

In [16]:
df_all.groupby('Sex')['Ticket_110152'].sum()

Sex
female    3
male      0
Name: Ticket_110152, dtype: uint8

In [17]:
df_all2 = pd.concat([df, df_dummies2], axis=1)
df_all2.shape

(891, 846)

In [18]:
#Sparse Data Structureにするとgroupbyでもエラーが出る
df_all2.groupby('Sex')['Ticket_110152'].sum()

TypeError: sum() got an unexpected keyword argument 'min_count'

In [25]:
df_tmp = df_all2[['Sex', 'Ticket_110152']].copy()
df_tmp.Ticket_110152 = np.asarray(df_tmp.Ticket_110152)
df_tmp.groupby('Sex')['Ticket_110152'].sum()

Sex
female    3
male      0
Name: Ticket_110152, dtype: int64