In [1]:
import pandas as pd
pd.options.display.max_columns = 200

import numpy as np
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

# Exploring the data

Data: https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/

In [2]:
problem = 'PumpItUp'
train = pd.read_csv('data/' + problem + '/train.csv')
labels = pd.read_csv('data/' + problem + '/train_labels.csv')
test = pd.read_csv('data/' + problem + '/test.csv')

# Working frames used by every preprocessing cell below. The raw `train`,
# `labels` and `test` frames stay untouched so the previews remain valid.
# `df` carries both `id` and `status_group`, which the modelling cell drops.
df = train.merge(labels, on='id')
df_test = test.copy()

# Every training row must have exactly one label.
assert len(df) == len(train), 'train/labels merge changed the number of rows'

In [3]:
# Show the first row of each raw frame to check the column layout.
display(train.head(1), test.head(1), labels.head(1))

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other


Unnamed: 0,id,status_group
0,69572,functional


In [5]:
# Per-column count of missing values in each raw frame.
train_nulls = train.isnull().sum()
test_nulls = test.isnull().sum()

# Keep only the columns that actually contain missing values
# (counts are non-negative, so "> 0" selects the same rows as "!= 0").
train_with_nulls = train_nulls[train_nulls > 0]
test_with_nulls = test_nulls[test_nulls > 0]

nulls = pd.concat([train_with_nulls, test_with_nulls], axis=1)
nulls.columns = ['train', 'test']
nulls

Unnamed: 0,train,test
funder,3635,869
installer,3655,877
subvillage,371,99
public_meeting,3334,821
scheme_management,3877,969
scheme_name,28166,7092
permit,3056,737


# Preprocessing

In [6]:
def reduce_factor_levels(df, column_name, limit=None, top=None, name=None):
    """Collapse the rare levels of a column into one catch-all level, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column_name : str
        Column whose levels are reduced.
    limit : int, optional
        Number of most frequent levels to keep. Used only when `top` is None.
    top : collection, optional
        Explicit levels to keep, e.g. the value returned by an earlier call
        on the training frame, so that train and test share the same levels.
    name : optional
        Replacement value for every row not in `top` (missing values
        included). Defaults to '<column_name>_OTHER'.

    Returns
    -------
    The kept levels: `top` itself when it was passed, otherwise the index of
    the `limit` most frequent values.
    """
    assert (limit is not None or top is not None), 'Specify limit or top'
    if top is None:
        # value_counts() sorts by frequency, so the first `limit` index
        # entries are the most frequent levels (positional slice).
        top = df[column_name].value_counts().index[:limit]
    if name is None:
        name = '%s_OTHER' % column_name

    rare = ~df[column_name].isin(top)
    # Writing a string label into a numeric column relies on silent upcasting,
    # which pandas has deprecated; make the conversion explicit instead.
    if (rare.any() and isinstance(name, str)
            and pd.api.types.is_numeric_dtype(df[column_name])):
        df[column_name] = df[column_name].astype(object)
    df.loc[rare, column_name] = name
    return top

In [None]:
# Keep the 10 most frequent levels found in the training frame and apply
# the same kept levels to the test frame.
for column in ('funder', 'installer'):
    top = reduce_factor_levels(df, column, 10)
    reduce_factor_levels(df_test, column, top=top)

In [None]:
# Columns removed from both frames before modelling.
drop = [
    'wpt_name', 'num_private', 'subvillage', 'region_code', 'district_code',
    'lga', 'ward', 'recorded_by', 'scheme_name',
]

for frame in (df, df_test):
    frame.drop(drop, axis=1, inplace=True)

In [None]:
# Treat both missing values and the literal string 'None' as "no scheme
# management". The same rule is applied to train and test so the two frames
# are encoded consistently.
for frame in (df, df_test):
    no_scheme = frame.scheme_management.isnull() | (frame.scheme_management == 'None')
    frame.loc[no_scheme, 'scheme_management'] = ''

In [None]:
# Flag rows whose construction year is positive (1) versus not (0).
for frame in (df, df_test):
    frame['construction_date_known'] = (frame.construction_year > 0).astype(np.int32)

In [None]:
# Despite its name, `min_year` is a decade offset: one less than the decade
# of the earliest positive construction year in the training frame.
known_years = df.construction_year[df.construction_year > 0]
min_year = known_years.min() // 10 - 1

# Decade index relative to that offset; anything that comes out negative
# is mapped to 0.
for frame in (df, df_test):
    frame['construction_decade'] = frame.construction_year // 10 - min_year
    frame.loc[frame.construction_decade < 0, 'construction_decade'] = 0

In [None]:
# Keep the 20 most frequent construction years of the training frame and
# reuse the same kept years for the test frame. The remaining rows receive the
# string label 'construction_year_OTHER', so the column no longer holds only
# numbers after this cell.
# NOTE(review): the column then mixes numbers and one string label - confirm
# the downstream encoder treats that mix as intended.
top = reduce_factor_levels(df, 'construction_year', limit=20)
reduce_factor_levels(df_test, 'construction_year', top=top);

In [None]:

# Fold the 'other - mkulima/shinyanga' level into 'other'. The same rule is
# applied to both frames so train and test share one set of levels.
for frame in (df, df_test):
    frame.loc[frame.extraction_type == 'other - mkulima/shinyanga', 'extraction_type'] = 'other'

In [None]:
# Bin edges every 500 units, starting at -1 and extending past the largest
# training height; the bins are labelled 0, 1, 2, ...
heights = np.arange(-1, df.gps_height.max() + 500, 500)
height_labels = list(range(heights.size - 1))

# Replace the raw height with its bin label in both frames, using the edges
# derived from the training frame.
for frame in (df, df_test):
    frame['gps_height_rounded'] = pd.cut(frame.gps_height, bins=heights, labels=height_labels)
    frame.drop(['gps_height'], axis=1, inplace=True)

In [None]:
# Bin edges every 500 people, starting at -1 and extending past the largest
# training population; the bins are labelled 0, 1, 2, ...
pops = np.arange(-1, df.population.max() + 500, 500)
pops_labels = list(range(pops.size - 1))

# Replace the raw population with its bin label in both frames, using the
# edges derived from the training frame.
for frame in (df, df_test):
    frame['pop_rounded'] = pd.cut(frame.population, bins=pops, labels=pops_labels)
    frame.drop(['population'], axis=1, inplace=True)

In [None]:
# The recording date is not used as a feature; remove it from both frames.
for frame in (df, df_test):
    frame.drop(['date_recorded'], axis=1, inplace=True)

In [None]:
# Missing `public_meeting` values are filled with True.
# The result is assigned back to the frame: `df.public_meeting.fillna(...,
# inplace=True)` works on an intermediate Series and, under pandas
# Copy-on-Write, leaves the frame itself unchanged.
df['public_meeting'] = df['public_meeting'].fillna(True)
df_test['public_meeting'] = df_test['public_meeting'].fillna(True)

In [None]:
# Missing `permit` values are filled with True.
# The result is assigned back to the frame: `df.permit.fillna(...,
# inplace=True)` works on an intermediate Series and, under pandas
# Copy-on-Write, leaves the frame itself unchanged.
df['permit'] = df['permit'].fillna(True)
df_test['permit'] = df_test['permit'].fillna(True)

In [None]:
# Rows without a height bin are put into bin 0.
# The result is assigned back to the frame: `df.gps_height_rounded.fillna(...,
# inplace=True)` works on an intermediate Series and, under pandas
# Copy-on-Write, leaves the frame itself unchanged.
df['gps_height_rounded'] = df['gps_height_rounded'].fillna(0)
df_test['gps_height_rounded'] = df_test['gps_height_rounded'].fillna(0)

# Визуализация

https://public.tableau.com/profile/calin.uioreanu#!/vizhome/DataMiningtheWaterTableDrivenData_com/Bubblestatusquantity

# Модели

In [None]:
# Feature frame and target for training; `id` is an identifier, not a feature.
X = df.drop(['id', 'status_group'], axis=1)
y = df.status_group

# Test features with the identifier removed in the same way.
X_test = df_test.drop(['id'], axis=1)

In [None]:
# Preview the first row of the training feature frame.
X.head(1)

<div class="panel panel-warning">
    <div class="panel-heading">
        <h3 class="panel-title">Обратите внимание</h3> 
    </div>
</div>

Вот эта функция ниже - опять мои штуки-дрюки, и можно кодировать данные по-своему.

In [None]:
def prepare(X_train, X_test):
    """Turn the train and test feature frames into dense model matrices.

    Columns of `X_train` with object dtype are converted to per-row dicts and
    encoded with a DictVectorizer; all remaining columns are standardised
    with a StandardScaler. Both transformers are fitted on `X_train` only and
    then applied to `X_test`, which is split using the column lists taken
    from `X_train`.

    Returns a `(train, test)` pair of arrays in which the encoded columns
    come first and the scaled columns follow.
    """
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.preprocessing import StandardScaler

    # Split the columns by dtype, as seen in the training frame.
    object_cols = X_train.select_dtypes(include=['O']).columns.values
    other_cols = X_train.select_dtypes(exclude=['O']).columns.values

    # Object-dtype columns: DictVectorizer over row dicts, dense output.
    vectorizer = DictVectorizer(sparse=False)
    train_records = X_train[object_cols].to_dict(orient='records')
    test_records = X_test[object_cols].to_dict(orient='records')
    encoded_train = vectorizer.fit_transform(train_records)
    encoded_test = vectorizer.transform(test_records)

    # Everything else: standardise with statistics taken from the train data.
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train[other_cols])
    scaled_test = scaler.transform(X_test[other_cols])

    train_matrix = np.hstack((encoded_train, scaled_train))
    test_matrix = np.hstack((encoded_test, scaled_test))
    return train_matrix, test_matrix

In [None]:
# Build the model matrices from the train and test feature frames.
x_train, x_test = prepare(X, X_test)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the class names as integers. The encoder is fitted on the original
# label column rather than on `y` itself, so re-running this cell gives the
# same result and `y_encoder.classes_` keeps the original class names
# (needed to map predictions back with `inverse_transform`).
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(df.status_group)

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задание 1.</h3> 
    </div>
</div>

Возьмите тетрадку с сегодняшнего занятия и, руководствуясь советами по настройке, заделайте лучший GBM в мире! Не забудьте отправлять результаты на drivendata и хвастаться результатами в чате.

In [None]:
# Ваш код здесь

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задание 2.</h3> 
    </div>
</div>

Выберите любой из сторонних фреймворков по своему усмотрению:
* XGBoost
* LightGBM
* H2O
* CatBoost

Установите, прокачайте его, побейте GBM от sklearn.

In [None]:
# Ваш код здесь

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задание 3 (опционально).</h3> 
    </div>
</div>

Возьмите __hyperopt__ или его порт для классификаторов sklearn, называющийся __hyperopt-sklearn__. Установите его, попробуйте найти оптимальные гиперпараметры с помощью байесовской оптимизации. Помните, что это не silver bullet, и сценарий подбора оптимальных параметров все еще актуален. Но на этапе подбора параметров деревьев байесовская оптимизация может дать выигрыш во времени, уделав классический GridSearchCV.

In [None]:
# Ваш код здесь