In [112]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest
from category_encoders.binary import BinaryEncoder
from functools import reduce
from time import time

pd.options.display.max_columns = 999

In [81]:
%%time

gh_raw_prefix = 'https://raw.githubusercontent.com/quinn-dougherty/well/master/'

csv_local = ['train_features.csv', 'test_features.csv', 'train_labels.csv', 'sample_submission.csv']
csv_github = {x: gh_raw_prefix + x for x in csv_local}

def cleanski(df): 
    START_ = time()

    df = df.fillna('NOT_KNOWN')    
    boolski = ['public_meeting', 'permit']
    for feat in boolski: 
        df[feat] = df[feat].map({True: 1, False: 0, 'NOT_KNOWN': 0.4})
    to_drop = ['wpt_name', 'region', 'ward', 'scheme_name', 'district_code']
    cats = ['region_district'] + list(df.select_dtypes(include='object').drop(to_drop[:-1], axis=1).columns)
    nums = list(df.select_dtypes(exclude='object').drop(['id', 'district_code'], axis=1).columns)

    insigs = ['funder', 'installer', 'subvillage', 'ward']
    
    def insignificant(features, k=3): 
        cumula = 0
        for feat in features: 
            j = time()
            cumula += j - START_
            valcts = df[feat].str.lower().value_counts()

            df[feat] = [val if valcts[val] > k else "OTHER" for val in df[feat].str.lower()]
            continue
        #print(f'{cumula:.3}')
        pass

    df['date_recorded'] = pd.to_datetime(df['date_recorded']).apply(lambda x: x.toordinal())
    
    assert df.region.isna().sum() == df.district_code.isna().sum() == 0
    df['region_district'] = df.apply(lambda row: f'{row.region}_{row.district_code}', axis=1)
    
    insignificant(insigs)
    
    assert all([x==0 for x in df.isna().sum()])
    return ((df.drop(to_drop, axis=1)
              ), cats, nums)

df_train, cats, nums = cleanski(pd.read_csv('train_features.csv'))
df_test = cleanski(pd.read_csv('test_features.csv'))[0]
test_indices = df_test.id.to_numpy()

target_train = pd.read_csv('train_labels.csv')


assert all([x==0 for x in df_test.isna().sum()])

print(df_train.shape)

N = df_train.shape[0]
N_test = df_test.shape[0]
bb = 10
j = 3

sample_submission = pd.read_csv('sample_submission.csv')
submit_rows = sample_submission.id.to_numpy()
assert (submit_rows == test_indices).all()

(59400, 36)
CPU times: user 10.2 s, sys: 80 ms, total: 10.2 s
Wall time: 10.6 s


In [82]:
df_train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,num_private,basin,subvillage,region_code,lga,population,public_meeting,recorded_by,scheme_management,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,region_district
0,69572,6000.0,734210,roman,1390,roman,34.938093,-9.856322,0,Lake Nyasa,mnyusi b,11,Ludewa,109,1.0,GeoData Consultants Ltd,VWC,0.0,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,Iringa_5
1,8776,0.0,734933,grumeti,1399,grumeti,34.698766,-2.147466,0,Lake Victoria,nyamara,20,Serengeti,280,0.4,GeoData Consultants Ltd,Other,1.0,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,Mara_2
2,34310,25.0,734924,lottery club,686,world vision,37.460664,-3.821329,0,Pangani,majengo,21,Simanjiro,250,1.0,GeoData Consultants Ltd,VWC,1.0,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,Manyara_4
3,67743,0.0,734896,unicef,263,unicef,38.486161,-11.155298,0,Ruvuma / Southern Coast,mahakamani,90,Nanyumbu,58,1.0,GeoData Consultants Ltd,VWC,1.0,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,Mtwara_63
4,19728,0.0,734331,OTHER,0,artisan,31.130847,-1.825359,0,Lake Victoria,OTHER,18,Karagwe,0,1.0,GeoData Consultants Ltd,NOT_KNOWN,1.0,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,Kagera_1


(14358, 36)

In [129]:
# imputer obsolete now, cuz datacleaning 

numerical_transformer = Pipeline(steps=[ 
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('encode', BinaryEncoder())
])

preprocessor = ColumnTransformer(
     transformers=[
         ('num', numerical_transformer, nums),
         ('cat', categorical_transformer, cats)])


# # Append classifier to preprocessing pipeline.
# # Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('reduc', TruncatedSVD()),
                      ('select', SelectKBest()),
                      ('classifier', SGDClassifier(loss='log', tol=np.exp(-bb), max_iter=int(np.exp(bb))))])

kmax = preprocessor.fit_transform(df_train).shape[1]

grid_params = {
    'classifier__alpha': [np.exp(k) for k in range(-bb, bb-j, j)], 
    'select__k': range(kmax-bb**(j-1)+bb, kmax, j**j+j) 
    #'reduc__n_components': [bb, bb**j] ##range(N_test//(bb+j), N_test//(bb-j), j**(j+j))
}

cv_ = 4
search = GridSearchCV(clf, param_grid=grid_params, iid=False, 
                      cv=cv_, return_train_score=True, verbose=7, 
                      n_jobs=3)

NUMBER_OF_JOBS = cv_ * reduce(lambda x,y: x*y, [len(x) for x in grid_params.values()])#* search.get_params['cv']
print(NUMBER_OF_JOBS)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


72


In [110]:
%%time
search.fit(df_train.drop('id', axis=1), target_train.drop('id', axis=1))


Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   29.7s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  4.3min
[Parallel(n_jobs=3)]: Done  66 tasks      | elapsed: 10.8min
[Parallel(n_jobs=3)]: Done  72 out of  72 | elapsed: 12.3min finished
  y = column_or_1d(y, warn=True)
 121 126 130 133 137] are constant.
  f = msb / msw
  y = column_or_1d(y, warn=True)


CPU times: user 18.4 s, sys: 671 ms, total: 19.1 s
Wall time: 12min 23s


GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=...e,
       tol=4.5399929762484854e-05, validation_fraction=0.1, verbose=0,
       warm_start=False))]),
       fit_params=None, iid=False, n_jobs=3,
       param_grid={'classifier__alpha': [4.5399929762484854e-05, 0.0009118819655545162, 0.01831563888873418, 0.36787944117144233, 7.38905609893065, 148.4131591025766], 'select__k': range(52, 142, 30)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=7)

In [130]:
print(search.best_score_)

x = search.best_estimator_ 

x.steps
best_k = 52
best_alpha = np.exp(-4)

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [131]:

def write_submit(gscv, name='submission.csv'): 
    '''s is a GridSearchCV instance after running `.fit(train)` on it'''
    
    prediction = {x: s for x,s in zip(df_test.id.to_numpy(), gscv.predict(df_test))}

    submit_df = (pd.DataFrame.from_dict(prediction, orient='index')
                 .reset_index()
                 .rename(mapper={'index': 'id', 0: 'status_group'}, axis=1))

    submit_df.to_csv(name, index=False)
    
    !kaggle competitions submit -c ds1-predictive-modeling-challenge -f submission.csv -m "lots of cleaning and selectkbest, not very good tho"
    return submit_df

write_submit(search)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

In [132]:
numerical_transformer = Pipeline(steps=[ 
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encode', BinaryEncoder())
])

preprocessor = ColumnTransformer(
     transformers=[
         ('num', numerical_transformer, nums),
         ('cat', categorical_transformer, cats)])


# # Append classifier to preprocessing pipeline.
# # Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('reduc', TruncatedSVD()),
                      ('select', SelectKBest()),
                      ('classifier', SGDClassifier(loss='log', tol=np.exp(-bb), max_iter=int(np.exp(bb))))])

kmax = preprocessor.fit_transform(df_train).shape[1]

grid_params = {
    'classifier__alpha': [np.exp(k) for k in [-5, -4, -3, 2]],
    'select__k': range(best_k-bb, best_k+bb, j)
    #'reduc__n_components': [bb, bb**j] ##range(N_test//(bb+j), N_test//(bb-j), j**(j+j))
}

cv_ = 8
search_2 = GridSearchCV(clf, param_grid=grid_params, iid=False, 
                      cv=cv_, return_train_score=True, verbose=7, 
                      n_jobs=3)

NUMBER_OF_JOBS = cv_ * reduce(lambda x,y: x*y, [len(x) for x in grid_params.values()])#* search.get_params['cv']
print(NUMBER_OF_JOBS)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


224


In [133]:
%%time
search_2.fit(df_train.drop('id', axis=1), target_train.drop('id', axis=1))

print(search_2.best_score_)

x2 = search_2.best_estimator_ 

x2.steps

Fitting 8 folds for each of 28 candidates, totalling 224 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   27.4s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:  1.5min


KeyboardInterrupt: 

In [None]:
write_submit(search_2)