In [25]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer

# Let wide frames display up to 999 columns instead of being truncated.
pd.set_option('display.max_columns', 999)

In [31]:

def cleanski(df):
    """Return a numeric-friendly copy of a raw features frame.

    Converts ``date_recorded`` to proleptic Gregorian ordinals (ints) and maps
    the boolean columns ``public_meeting`` and ``permit`` to 1/0. Values in
    those two columns that are neither ``True`` nor ``False`` (e.g. missing
    entries) come out as NaN, because ``Series.map`` leaves unmapped values NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``date_recorded``, ``public_meeting`` and
        ``permit`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns converted; all other columns are
        carried over unchanged.
    """
    bool_columns = ['public_meeting', 'permit']
    # Work on a copy so the caller's frame stays intact: converting in place
    # made a second call on the same frame re-parse the ordinals as timestamps.
    cleaned = df.copy()
    cleaned['date_recorded'] = (
        pd.to_datetime(cleaned['date_recorded']).apply(lambda ts: ts.toordinal())
    )
    for column in bool_columns:
        cleaned[column] = cleaned[column].map({True: 1, False: 0})
    return cleaned

# Load the feature tables; `cleanski` turns the date and boolean columns into numbers.
df_train = cleanski(pd.read_csv('train_features.csv'))
df_test = cleanski(pd.read_csv('test_features.csv'))
test_indices = df_test.id.to_numpy()

target_train = pd.read_csv('train_labels.csv')

print(df_train.shape)

# Labels are matched to features purely by row position when the model is fit,
# so the two files must list the same ids in the same order.
assert np.array_equal(target_train.id.to_numpy(), df_train.id.to_numpy()), (
    'train_labels.csv ids do not line up with train_features.csv ids')

# Column groups by dtype: object columns are treated as categorical, everything
# else (minus the `id` key) as numeric.
cats = df_train.select_dtypes(include='object').columns
nums = df_train.select_dtypes(exclude='object').drop('id', axis=1).columns

N = df_train.shape[0]

# The submission must contain exactly the test ids, in the sample file's order.
sample_submission = pd.read_csv('sample_submission.csv')
submit_rows = sample_submission.id.to_numpy()
assert np.array_equal(submit_rows, test_indices), (
    'sample_submission.csv ids do not line up with test_features.csv ids')


# df_train[nums].head()


(59400, 40)


In [30]:
# Numeric columns keep their imputer: `cleanski` maps the boolean columns through
# `Series.map`, which leaves NaN wherever the raw value was missing, so the numeric
# block cannot be assumed to be complete. (An `assert [ ... ]` on a list would not
# verify that anyway -- a non-empty list is always truthy.)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', RobustScaler())
])

# Categorical columns: missing values become their own 'not_known' level, and
# levels unseen during fit are ignored (all-zero row) at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='not_known')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, nums),
        ('cat', categorical_transformer, cats)])

# Full prediction pipeline: preprocessing followed by a logistic-loss linear
# model trained with SGD.
# NOTE(review): `loss='log'` is spelled 'log_loss' in newer scikit-learn releases;
# kept as-is to match the version this notebook was written against -- confirm
# before upgrading.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(
        loss='log',
        tol=np.exp(-10),
        max_iter=100000,
        random_state=42,  # fixed seed so SGD shuffling (and the results) are reproducible
    )),
])

# Regularisation strengths on a log scale: e^-5, e^-3, ..., e^5.
grid_params = {
    'classifier__alpha': [np.exp(k) for k in range(-5, 6, 2)],
}

# NOTE(review): `iid` was removed from GridSearchCV in later scikit-learn
# releases; drop the argument when upgrading -- confirm against the installed version.
search = GridSearchCV(
    clf,
    param_grid=grid_params,
    iid=False,
    cv=7,
    return_train_score=True,
    n_jobs=3,
    verbose=7,
)

In [28]:
%%time 
#X_train, X_test, y_train, y_test = train_test_split(df_train, target_train)

search.fit(df_train.drop('id', axis=1), target_train.drop('id', axis=1))

prediction = {x: s for x,s in zip(df_test.id.to_numpy(), search.predict(df_test))}


# target_train.to_numpy().reshape(-1,1).T.shape

# target_train.columns



Fitting 7 folds for each of 4 candidates, totalling 28 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   29.0s
[Parallel(n_jobs=3)]: Done  28 out of  28 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  28 out of  28 | elapsed:  1.1min finished
  y = column_or_1d(y, warn=True)


CPU times: user 9.36 s, sys: 307 ms, total: 9.66 s
Wall time: 1min 12s


In [33]:
# Turn the {id: predicted label} mapping into the two-column submission table.
# The dict keys land in the index, the labels in a column named 0; the index is
# then moved into a regular column and both columns get their final names.
submit_df = pd.DataFrame.from_dict(prediction, orient='index')
submit_df = submit_df.reset_index()
submit_df = submit_df.rename(columns={'index': 'id', 0: 'status_group'})

submit_df.head()

#sample_submission.head()



Unnamed: 0,id,status_group
0,50785,non functional
1,51630,functional
2,17168,non functional
3,45559,non functional
4,49871,functional


In [34]:
def write_submit(model, name='submission.csv'): 
    model.to_csv(name, index=False)
    !kaggle competitions submit -c ds1-predictive-modeling-challenge -f submission.csv -m "basic pipeline"
    pass 

write_submit(submit_df)

100%|█████████████████████████████████████████| 255k/255k [00:01<00:00, 239kB/s]
Successfully submitted to DS1 Predictive Modeling Challenge