In [33]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest
from category_encoders.binary import BinaryEncoder
from functools import reduce

pd.options.display.max_columns = 999

In [34]:
%%time

gh_raw_prefix = 'https://raw.githubusercontent.com/quinn-dougherty/well/master/'

csv_local = ['train_features.csv', 'test_features.csv', 'train_labels.csv', 'sample_submission.csv']
csv_github = {x: gh_raw_prefix + x for x in csv_local}

def cleanski(df): 
    df = df.fillna('NOT_KNOWN')
    cats = [df.select_dtypes(include='object').drop(['region'], axis=1).columns] + ['region_district']
    nums = df.select_dtypes(exclude='object').drop(['id', 'district_code'], axis=1).columns

    insigs = ['funder', 'installer', 'wpt_name', 'subvillage', 'scheme_name', 'ward']
    def insignificant(features, k=2): 
        
        for feat in features:
            
            def new_feat_mapper(x):
                valcountsbool = (df[feat].value_counts()<=k)
                if x in valcountsbool[valcountsbool.to_numpy() == True].index: 
                    return 'OTHER'
                else: return x
            df[feat] = df[feat].apply(new_feat_mapper)
        pass

    df['date_recorded'] = pd.to_datetime(df['date_recorded']).apply(lambda x: x.toordinal())
    
    boolski = ['public_meeting', 'permit']
    for feat in boolski: 
        df[feat] = df[feat].map({True: 1, False: 0, 'NOT_KNOWN': 0.4})
    
    assert df.region.isna().sum() == df.district_code.isna().sum() == 0
    df['region_district'] = df.apply(lambda row: f'{row.region}_{row.district_code}', axis=1)
    
    insignificant(insigs)
    
    return ((df.drop(['region', 'district_code'], axis=1)
              ), cats[0], nums)

df_train, cats, nums = cleanski(pd.read_csv('train_features.csv'))
df_test = cleanski(pd.read_csv('test_features.csv'))[0]
test_indices = df_test.id.to_numpy()

target_train = pd.read_csv('train_labels.csv')

print(df_train.shape)

N = df_train.shape[0]
N_test = df_test.shape[0]
bb = 10
j = 3

sample_submission = pd.read_csv('sample_submission.csv')
submit_rows = sample_submission.id.to_numpy()
assert (submit_rows == test_indices).all()

KeyboardInterrupt: 

'd'

In [35]:
# imputer obsolete now, cuz datacleaning 

numerical_transformer = Pipeline(steps=[ 
    #('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    #('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('encode', BinaryEncoder())
])

preprocessor = ColumnTransformer(
     transformers=[
         ('num', numerical_transformer, nums),
         ('cat', categorical_transformer, cats)])


# # Append classifier to preprocessing pipeline.
# # Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      #('reduc', TruncatedSVD()),
                      ('select', SelectKBest),
                      ('classifier', SGDClassifier(loss='log', tol=np.exp(-bb), max_iter=int(np.exp(bb))))])

grid_params = {
    'classifier__alpha': [np.exp(k) for k in range(-bb, bb-j, j)], 
    'select__k': range(N_test//(bb+j), N_test//(bb-j), j**(j+j))
    #'reduc__n_components': [bb, bb**j] ##range(N_test//(bb+j), N_test//(bb-j), j**(j+j))
}

cv_ = 7
search = GridSearchCV(clf, param_grid=grid_params, iid=False, 
                      cv=cv_, return_train_score=True, verbose=7, 
                      n_jobs=3)

NUMBER_OF_JOBS = cv_ * reduce(lambda x,y: x*y, [len(x) for x in grid_params.values()])#* search.get_params['cv']
print(NUMBER_OF_JOBS)

84


In [36]:
%%time
search.fit(df_train.drop('id', axis=1), target_train.drop('id', axis=1))


TypeError: get_params() missing 1 required positional argument: 'self'

In [37]:
print(search.best_score_)

x = search.best_estimator_ 

x


AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [None]:

def write_submit(gscv, name='submission.csv'): 
    '''s is a GridSearchCV instance after running `.fit(train)` on it'''
    
    prediction = {x: s for x,s in zip(df_test.id.to_numpy(), gscv.predict(df_test))}

    submit_df = (pd.DataFrame.from_dict(prediction, orient='index')
                 .reset_index()
                 .rename(mapper={'index': 'id', 0: 'status_group'}, axis=1))

    submit_df.to_csv(name, index=False)
    
    !kaggle competitions submit -c ds1-predictive-modeling-challenge -f submission.csv -m "lots of cleaning and selectkbest"
    return submit_df

write_submit(search)

In [31]:
len(range(N_test//(bb+j), N_test//(bb-j), (j-1)**(j+j))), len([np.exp(k) for k in range(-bb, bb-j, j)])

(15, 6)