In [31]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest
from category_encoders.binary import BinaryEncoder
from functools import reduce
from time import time
from scipy.special import comb

# Show (almost) every column when a wide frame is displayed.
pd.set_option('display.max_columns', 999)

In [32]:
%%time

# Base URL under which the same CSV files are published on GitHub.
gh_raw_prefix = 'https://raw.githubusercontent.com/quinn-dougherty/well/master/'

# Local file names, in the order: train features, test features, train labels,
# sample submission. Later code indexes this list positionally.
csv_local = [
    'train_features.csv',
    'test_features.csv',
    'train_labels.csv',
    'sample_submission.csv',
]

# File name -> raw GitHub URL for that file.
csv_github = dict((fname, gh_raw_prefix + fname) for fname in csv_local)

def cleanski(df): 
    '''Clean a raw feature frame and classify its surviving columns.

    Steps, in order:
      * every missing value is replaced by the string 'NOT_KNOWN';
      * `public_meeting` and `permit` are mapped to 1 / 0, and 0.4 when unknown;
      * `date_recorded` is parsed and replaced by its integer date ordinal;
      * columns are split into categorical (object dtype) and numerical lists;
      * `region` and `district_code` are joined into a new `region_district`;
      * `funder`, `installer` and `subvillage` are lower-cased and any value
        occurring 3 times or fewer is collapsed into "OTHER";
      * `wpt_name`, `region`, `ward`, `scheme_name`, `district_code` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features. Must contain `id`, `date_recorded`, `public_meeting`,
        `permit`, `funder`, `installer`, `subvillage` and the five dropped
        columns listed above. The caller's frame is not modified (the first
        step works on the new frame returned by `fillna`).

    Returns
    -------
    tuple
        `(cleaned, cats, nums)` where `cleaned` is the cleaned frame (still
        containing `id`), `cats` is the list of categorical column names
        starting with 'region_district', and `nums` is the list of numerical
        column names (excluding `id` and `district_code`).

    Raises
    ------
    AssertionError
        If `region` / `district_code` contain missing values, or if any
        missing value survives the cleaning.
    '''
    to_drop = ['wpt_name', 'region', 'ward', 'scheme_name', 'district_code']
    boolski = ['public_meeting', 'permit']
    insigs = ['funder', 'installer', 'subvillage', 'ward']

    def insignificant(features, k=3): 
        '''Lower-case each feature and replace values seen `k` times or fewer with "OTHER".'''
        for feat in features: 
            lowered = df[feat].str.lower()
            # Per-row frequency of that row's own value, computed in one vectorised pass.
            occurrences = lowered.map(lowered.value_counts())
            df[feat] = lowered.where(occurrences > k, "OTHER")

    df = df.fillna('NOT_KNOWN')
    for feat in boolski: 
        df[feat] = df[feat].map({True: 1, False: 0, 'NOT_KNOWN': 0.4})

    # Convert the date BEFORE classifying columns: it ends up as an integer
    # ordinal, so it belongs in `nums`, not among the categorical columns.
    df['date_recorded'] = pd.to_datetime(df['date_recorded']).apply(lambda x: x.toordinal())

    # 'region_district' does not exist yet, hence it is prepended by hand.
    # `district_code` is excluded from to_drop here because it is handled in `nums`.
    cats = ['region_district'] + list(df.select_dtypes(include='object').drop(to_drop[:-1], axis=1).columns)
    nums = list(df.select_dtypes(exclude='object').drop(['id', 'district_code'], axis=1).columns)

    assert df.region.isna().sum() == df.district_code.isna().sum() == 0
    df['region_district'] = df['region'].astype(str) + '_' + df['district_code'].astype(str)

    # Columns that are about to be dropped (i.e. 'ward') need no rare-value collapsing.
    insignificant([feat for feat in insigs if feat not in to_drop])

    assert not df.isna().any().any()
    return (df.drop(to_drop, axis=1), cats, nums)

# Clean the train and test feature files. Only the train call's column lists
# (`cats`, `nums`) are kept; they drive the ColumnTransformer built later.
# NOTE(review): each file is cleaned independently, so the rare-value ("OTHER")
# thresholds are computed per file rather than from the training data alone.
df_train, cats, nums = cleanski(pd.read_csv(csv_local[0]))
df_test = cleanski(pd.read_csv(csv_local[1]))[0]
test_indices = df_test.id.to_numpy()

target_train = pd.read_csv(csv_local[2])

# The labels are later passed to `fit` positionally next to `df_train`, so the
# two files must list the same ids in the same order.
assert np.array_equal(target_train.id.to_numpy(), df_train.id.to_numpy())

assert not df_test.isna().any().any()

print(df_train.shape)

N = df_train.shape[0]
N_test = df_test.shape[0]

# The submission file must cover exactly the test rows, in the same order.
sample_submission = pd.read_csv(csv_local[3])
submit_rows = sample_submission.id.to_numpy()
assert np.array_equal(submit_rows, test_indices)


def write_submit(gscv, name='submission.csv'): 
    '''s is a GridSearchCV instance after running `.fit(train)` on it'''
    
    prediction = {x: s for x,s in zip(df_test.id.to_numpy(), gscv.predict(df_test))}

    submit_df = (pd.DataFrame.from_dict(prediction, orient='index')
                 .reset_index()
                 .rename(mapper={'index': 'id', 0: 'status_group'}, axis=1))

    submit_df.to_csv(name, index=False)
    
    !kaggle competitions submit -c ds1-predictive-modeling-challenge -f submission.csv -m "polynomial features and selectkbest"
    return submit_df


(59400, 36)
CPU times: user 10.6 s, sys: 63.9 ms, total: 10.6 s
Wall time: 10.2 s


In [33]:
# --- Tunable constants -------------------------------------------------------
# Defined before anything that reads them, so the cell runs on a fresh kernel.
d = 2        # degree of the polynomial feature expansion
bb = 10      # SGD uses tol=exp(-bb), max_iter=exp(bb); also scales the k window
j = 3        # scales the k window and the k step below
best_k = 60  # from a previous selectkbest
cv_ = 12     # number of cross-validation folds

# --- Preprocessing -----------------------------------------------------------
numerical_transformer = Pipeline(steps=[ 
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('encode', BinaryEncoder())
])

preprocessor = ColumnTransformer(
     transformers=[
         ('num', numerical_transformer, nums),
         ('cat', categorical_transformer, cats)])

# --- Full prediction pipeline: preprocess -> polynomial terms -> select -> SGD
# NOTE(review): loss='log' was renamed 'log_loss' in later scikit-learn
# releases; kept as-is to match the version this notebook was run with.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('poly_gen', PolynomialFeatures(degree=d)), 
                      ('select', SelectKBest()),
                      ('classifier', SGDClassifier(loss='log', tol=np.exp(-bb), max_iter=int(np.exp(bb))))])

# --- Feature-count bookkeeping ----------------------------------------------
# Number of columns after encoding, and after the degree-d polynomial expansion.
encoded_full = preprocessor.fit_transform(df_train).shape[1]
poly = comb(encoded_full+d, d, exact=True)
# NOTE(review): the recorded run shows 142 encoded columns growing to 10296
# polynomial columns, and the fit cell that follows ended in MemoryError with
# n_jobs=-1. Consider fewer parallel workers or selecting features before the
# expansion.

# Window of SelectKBest sizes around the previously found best_k.
kmin = best_k-j*bb
kmax = best_k+j**j*bb
# j**j**j is 3**(3**3); with j=3 that step is far larger than kmax-kmin, so the
# range below contains only kmin. NOTE(review): confirm a single k is intended.
k_step = j**j**j

grid_params = {
    'classifier__alpha': [np.exp(-5), np.exp(-6.5), np.exp(-4)],
    'select__k': range(kmin, kmax, k_step)
}

# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in
# newer scikit-learn releases; kept to match the version this notebook ran on.
search = GridSearchCV(clf, param_grid=grid_params, iid=False, 
                      cv=cv_, return_train_score=True, verbose=7, 
                      n_jobs=-1)

# Total fits = folds x (product of the number of candidates per parameter).
NUMBER_OF_JOBS = cv_ * reduce(lambda x,y: x*y, [len(x) for x in grid_params.values()])
print(NUMBER_OF_JOBS)

encoded_full, poly, (kmin, kmax)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


36


(142, 10296, (30, 330))

In [34]:
%%time
# Fit the grid search defined above. Features and labels both carry an `id`
# column that must not be learned from; the remaining label column is squeezed
# to a 1-D Series, which is the shape scikit-learn expects for `y`.
search.fit(df_train.drop('id', axis=1),
           target_train.drop('id', axis=1).squeeze(axis=1))

# Report on the object that was just fitted (`search`), not on a name left over
# from another session.
print(search.best_score_)

x = search.best_estimator_ 

x.steps

Fitting 12 folds for each of 3 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


MemoryError: 

In [35]:


# Write and submit predictions from `search`. The search must have been fitted
# first; on an unfitted GridSearchCV, predict raises NotFittedError.
write_submit(search)

NotFittedError: This GridSearchCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

330