In [1]:
from google.colab import files
files.upload() # kaggle.jsonをアップロード
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
import pandas as pd
import numpy as np

# データの準備
def prepare():
    !kaggle datasets download -d rajanand/rainfall-in-india
    !unzip rainfall-in-india.zip

# データの読み込み
def preprocess():
    # CSVファイルを読み込んでデータフレームに格納
    df = pd.read_csv('rainfall in india 1901-2015.csv')
    # データフレームから特定の列のみを抽出
    #（1月～12月のデータ）
    data = df.iloc[:,2:14]
    col = df.columns.values[2:14]
    df = pd.DataFrame(data, columns=col)
    print('Original Data:\t%d' % len(df))
    return df

# 欠損値の削除
def drop_missing_data(data): 
  # 欠損している要素数を列ごとに確認
    data.isnull().sum()
    data.info()
    # リストワイズ法で欠損値を含む行を削除する
    print('Missing Value Removed Data:\t%d' \
    % len(data.dropna()))
    # 欠損値を別のデータで埋める
    print('\n===== Fill by FILL =====')
    print( data.fillna('FILL')[20:30] )
    print('\n===== Fill by 100 =====')
    print(data.fillna(100)[20:30])

# 代表値による代入
def insert_typical_value(df, comp_type):
    print('\n===== Fill by {} ====='.format(comp_type))
    for f in df.columns.values: 
        if comp_type == 'mean':
            # 平均値で穴埋め
            df[f].fillna(df[f].mean(), inplace=True)
        elif comp_type == 'median':
            # 中央値で穴埋め
            df[f].fillna(df[f].median(), inplace=True) 
        elif comp_type == 'mode':
            # 最頻値で穴埋め
            df[f].fillna(df[f].mode()[0], inplace=True) 
    return df

def main():
    prepare()
    data = preprocess()
    # 欠損値の除去
    drop_missing_data(data) 
    for comp_type in ['mean', 'median', 'mode']:
        dt = preprocess()
        res = insert_typical_value(dt, comp_type)
        print(res[20:30])

if __name__ == '__main__':
    main()


Downloading rainfall-in-india.zip to /content
  0% 0.00/187k [00:00<?, ?B/s]
100% 187k/187k [00:00<00:00, 72.3MB/s]
Archive:  rainfall-in-india.zip
  inflating: district wise rainfall normal.csv  
  inflating: rainfall in india 1901-2015.csv  
Original Data:	4116
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4116 entries, 0 to 4115
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   JAN     4112 non-null   float64
 1   FEB     4113 non-null   float64
 2   MAR     4110 non-null   float64
 3   APR     4112 non-null   float64
 4   MAY     4113 non-null   float64
 5   JUN     4111 non-null   float64
 6   JUL     4109 non-null   float64
 7   AUG     4112 non-null   float64
 8   SEP     4110 non-null   float64
 9   OCT     4109 non-null   float64
 10  NOV     4105 non-null   float64
 11  DEC     4106 non-null   float64
dtypes: float64(12)
memory usage: 386.0 KB
Missing Value Removed Data:	4090

===== Fill by FILL =====
      J