In [46]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD

# Raise pandas' display limit so wide frames show up to 999 columns instead of being truncated.
pd.options.display.max_columns = 999

In [97]:

def cleanski(df): 
    '''Return a cleaned copy of a features frame with date/boolean columns made numeric.

    - `date_recorded` is parsed with `pd.to_datetime` and replaced by each
      date's proleptic Gregorian ordinal (an integer).
    - `public_meeting` and `permit` are mapped True -> 1, False -> 0; any
      other value (including missing) becomes NaN.

    The input frame is NOT modified: the work is done on a copy, so calling
    this twice on the same raw frame gives the same result instead of
    re-parsing already-converted ordinals as timestamps.

    Parameters
    ----------
    df : pandas.DataFrame containing the columns `date_recorded`,
        `public_meeting` and `permit`.

    Returns
    -------
    pandas.DataFrame : a new frame with the three columns converted.
    '''
    boolski = ['public_meeting', 'permit']
    cleaned = df.copy()
    cleaned['date_recorded'] = (
        pd.to_datetime(cleaned['date_recorded']).apply(lambda stamp: stamp.toordinal())
    )
    for col in boolski:
        cleaned[col] = cleaned[col].map({True: 1, False: 0})
    return cleaned

# Load and clean the feature tables.
df_train = cleanski(pd.read_csv('train_features.csv'))
df_test = cleanski(pd.read_csv('test_features.csv'))
test_indices = df_test.id.to_numpy()

# Training labels, one row per training id.
target_train = pd.read_csv('train_labels.csv')

print(df_train.shape)

# The model is later fitted on features and labels paired by row position
# (both have `id` dropped), so the two files must list the same ids in the
# same order. np.array_equal also copes with differing lengths, where an
# elementwise `==` would fail with an unrelated error.
assert np.array_equal(target_train.id.to_numpy(), df_train.id.to_numpy()), \
    'train_labels.csv ids are not aligned with train_features.csv ids'

# Column groups for the preprocessing pipeline: object columns are treated as
# categorical, everything else (except the row identifier) as numeric.
cats = df_train.select_dtypes(include='object').columns

nums = df_train.select_dtypes(exclude='object').drop('id', axis=1).columns

N = df_train.shape[0]
N_test = df_test.shape[0]
# Tuning knobs used by the model cell: `bb` sets the SGD tolerance exp(-bb)
# and iteration cap exp(bb); `bb` and `j` together define the range of
# `n_components` values that is searched.
bb = 10
j = 3

# The submission must contain exactly the test ids, in the same order.
sample_submission = pd.read_csv('sample_submission.csv')
submit_rows = sample_submission.id.to_numpy()
assert np.array_equal(submit_rows, test_indices), \
    'sample_submission.csv ids do not match test_features.csv ids'


(59400, 40)


In [107]:
# OneHotEncoder(handle_unknown='ignore').fit(df_train[cats])

# Report numeric columns that still contain missing values. The previous
# check asserted a list comprehension, and a non-empty list is always truthy,
# so it could never fail. The numeric imputer below is kept, so any column
# listed here is mean-imputed inside the pipeline.
numeric_nan_counts = df_train[nums].isna().sum()
print(numeric_nan_counts[numeric_nan_counts > 0])

# Numeric columns: fill gaps with the column mean, then scale robustly
# (median / IQR based) so outliers do not dominate.
numerical_transformer = Pipeline(steps=[ 
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

# Categorical columns: missing values become their own 'not_known' level,
# then one-hot encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
     transformers=[
         ('num', numerical_transformer, nums),
         ('cat', categorical_transformer, cats)])


# Full prediction pipeline: preprocessing -> dimensionality reduction ->
# logistic-loss linear classifier trained with SGD.
# random_state is fixed on the two stochastic steps so that re-running the
# notebook reproduces the same scores.
# NOTE(review): `loss='log'` was renamed to 'log_loss' in newer scikit-learn
# releases — adjust if the environment is upgraded.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('reduc', TruncatedSVD(random_state=42)),
                      ('classifier', SGDClassifier(loss='log',
                                                   tol=np.exp(-bb),
                                                   max_iter=int(np.exp(bb)),
                                                   random_state=42))])

# Search grid: three regularisation strengths (exp(-5), exp(-1), exp(3)) and
# an `n_components` range derived from the test-set size, `bb` and `j`.
grid_params = {
    'classifier__alpha': [np.exp(k) for k in range(-5, 6, 4)], 
    'reduc__n_components': range(N_test//(bb+j), N_test//(bb-j), j**(j+j))
}

# NOTE(review): the `iid` argument was removed from GridSearchCV in newer
# scikit-learn releases — drop it if the environment is upgraded.
search = GridSearchCV(clf, param_grid=grid_params, iid=False, cv=4, return_train_score=True, verbose=7)



In [108]:
%%time 
#X_train, X_test, y_train, y_test = train_test_split(df_train, target_train)

# Fit on every column except the row identifier. The labels are flattened to
# a 1-D array: passing the single-column DataFrame makes scikit-learn flatten
# it itself and emit a column-vector conversion warning on every fit.
search.fit(df_train.drop('id', axis=1),
           target_train.drop('id', axis=1).to_numpy().ravel())
print(search.best_score_)

# not working, we'll have to open up in colab

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV] classifier__alpha=0.006737946999085467, reduc__n_components=1104 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


MemoryError: 

In [19]:

def write_submit(gscv, name='submission.csv'): 
    '''s is a GridSearchCV instance after running `.fit(train)` on it'''
    
    prediction = {x: s for x,s in zip(df_test.id.to_numpy(), gscv.predict(df_test))}

    submit_df = (pd.DataFrame.from_dict(prediction, orient='index')
                 .reset_index()
                 .rename(mapper={'index': 'id', 0: 'status_group'}, axis=1))

    submit_df.to_csv(name, index=False)
    
    !kaggle competitions submit -c ds1-predictive-modeling-challenge -f submission.csv -m "basic pipeline"
    return submit_df

# write_submit(search)

0.7102190280728671

In [35]:
# Write the predictions from `search` to disk, submit them, and preview the first rows.
write_submit(search).head()

1485.0

In [106]:
# Sanity check: how many `reduc__n_components` candidates the grid's range expression yields.
len(range(N_test//(bb+j), N_test//(bb-j), j**(j+j)))

2