In [None]:
import pandas as pd 

from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

In [None]:
def create_folds(df, split=7, random_state=42):
    """Shuffle ``df`` and label every row with a stratified fold number.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``target`` column, which is used as the
        stratification label. The frame itself is not modified.
    split : int, default 7
        Number of folds passed to ``StratifiedKFold`` as ``n_splits``.
    random_state : int or None, default 42
        Seed for the row shuffle. With a fixed seed, repeated calls on the
        same frame produce the same row order and the same fold labels, so
        callers that invoke this once per fold all see one consistent
        partition. Pass ``None`` for a fresh random shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a fresh 0..n-1 index and an extra
        integer ``kfold`` column holding the validation fold of each row.
    """
    # Seeded shuffle: without a seed each call yields a different partition.
    data = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    data["kfold"] = -1
    skf = model_selection.StratifiedKFold(n_splits=split)
    for fold, (_, valid_idx) in enumerate(skf.split(data, data["target"])):
        # The index was reset above, so the positional indices returned by
        # split() coincide with the row labels that .loc expects.
        data.loc[valid_idx, "kfold"] = fold
    return data

In [None]:
def run(fold, split, data_dir="/kaggle/input/cat-in-the-dat-ii"):
    """Fit a one-hot-encoded logistic regression on every fold except ``fold``.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir``, assigns folds with
    ``create_folds``, treats every column other than ``id``, ``kfold`` and
    ``target`` as a categorical feature, and trains on the rows whose
    ``kfold`` differs from ``fold``. The held-out fold is only excluded from
    training; it is not scored here.

    Fold assignments are recomputed by ``create_folds`` on every call, so
    they only line up across calls if ``create_folds`` is deterministic.

    Parameters
    ----------
    fold : int
        Fold number to hold out; must satisfy ``0 <= fold < split``.
    split : int
        Total number of folds, forwarded to ``create_folds``.
    data_dir : str, default "/kaggle/input/cat-in-the-dat-ii"
        Directory containing ``train.csv`` and ``test.csv``.

    Returns
    -------
    tuple
        ``(model, test_ohe)``: the fitted ``LogisticRegression`` and the
        one-hot-encoded test features it can predict on.

    Raises
    ------
    ValueError
        If ``fold`` is outside ``[0, split)``; such a value would otherwise
        silently train on every row.
    """
    if not 0 <= fold < split:
        raise ValueError(f"fold must be in [0, {split}), got {fold}")

    df = pd.read_csv(f"{data_dir}/train.csv")
    test = pd.read_csv(f"{data_dir}/test.csv")

    df = create_folds(df, split)

    features = [col for col in df.columns if col not in ("id", "kfold", "target")]

    # astype(str) turns NaN into the literal string "nan", so a fillna()
    # applied afterwards never fires. Mask on the original nulls instead so
    # missing values really become the "none" category.
    for frame in (df, test):
        for feature in features:
            column = frame[feature]
            frame[feature] = column.astype(str).where(column.notna(), "none")

    # Fit the encoder on train + test together so that every category seen
    # in either file has a column and transform() never meets an unknown.
    all_data = pd.concat([df[features], test[features]], axis=0)
    ohe = preprocessing.OneHotEncoder().fit(all_data[features])

    train = df[df.kfold != fold].reset_index(drop=True)
    train_ohe = ohe.transform(train[features])
    test_ohe = ohe.transform(test[features])

    # Logistic Regression
    lgr = linear_model.LogisticRegression(max_iter=800)
    lgr.fit(train_ohe, train.target)

    return lgr, test_ohe

In [None]:
def evaluate(models):
    """Gather each model's positive-class probabilities and average them.

    Parameters
    ----------
    models : iterable of (model, test_ohe) pairs
        Each ``model`` must expose ``predict_proba``; column 1 of its output
        for ``test_ohe`` is taken as that model's prediction.

    Returns
    -------
    pandas.DataFrame
        One column per model, labelled by its position in ``models``, plus a
        ``"mean"`` column holding the row-wise average of those columns.
    """
    per_model = {
        position: estimator.predict_proba(encoded_test)[:, 1]
        for position, (estimator, encoded_test) in enumerate(models)
    }
    predictions = pd.DataFrame(per_model)
    predictions["mean"] = predictions.mean(axis=1)
    return predictions

In [None]:
# Train one model per held-out fold, then blend their test-set predictions.
split = 7
models = [run(fold, split) for fold in range(split)]
sub_vals = evaluate(models)

In [None]:
# Load the sample submission, set its `target` column to the averaged
# predictions, and write the result to the working directory.
submission = pd.read_csv(
    "/kaggle/input/cat-in-the-dat-ii/sample_submission.csv"
).assign(target=sub_vals["mean"])
submission.to_csv('./submission.csv', index=False)