In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the competition's train and test CSVs into two separate frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2021/train.csv',
        '../input/tabular-playground-series-apr-2021/test.csv',
    )
)

In [3]:
def label_encoder(c):
    """Return the values of ``c`` encoded as integer labels.

    A brand-new ``LabelEncoder`` is fitted on every call, so the integer
    assigned to a value depends only on the values present in ``c`` itself;
    nothing is remembered between calls.
    """
    return LabelEncoder().fit_transform(c)
def preprocess(df):
    """Build the training feature table (target column included) from a raw frame.

    The caller's frame is left untouched: all cleaning happens on a copy, so
    the cell that calls this function can be re-run safely. (Previously the
    input was modified in place, which meant a second call on the same frame
    log-transformed ``Fare`` twice and collapsed every ``Ticket`` to 'X'.)

    Steps, all computed from the rows of ``df`` itself:
      * Age      - missing values get the mean age of the passenger's Pclass.
      * Cabin    - reduced to its first character; missing becomes 'X'.
      * Ticket   - first whitespace-separated token when the ticket has more
                   than one token, otherwise 'X' (missing is also 'X').
      * Fare     - missing values get the column mean, then natural log for
                   positive values and 0 for everything else.
      * Embarked - missing becomes 'X'.
      * Name     - the text before the first comma.
      * Family   - SibSp + Parch + 1; Alone is "Yes" when Family == 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Age, Pclass, Cabin, Ticket, Fare, Embarked, Name, Sex,
        SibSp, Parch and Survived.

    Returns
    -------
    pandas.DataFrame
        Numerical columns (including ``Survived``), label-encoded Name and
        Ticket, and the ``pd.get_dummies`` expansion of the categorical
        columns, concatenated column-wise in that order.
    """
    # Copy first so none of the assignments below leak into the caller's frame.
    df = df.copy()

    # Mean age per passenger class, used to impute missing ages.
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df['Age'] = df['Age'].fillna(df['Pclass'].map(age_map['Age']))

    df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())
    df['Ticket'] = df['Ticket'].fillna('X').map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X'
    )

    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    # log is undefined for non-positive fares; those are mapped to 0 instead.
    df['Fare'] = df['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

    df['Embarked'] = df['Embarked'].fillna('X')
    df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

    df["Family"] = df["SibSp"] + df["Parch"] + 1
    df["Alone"] = df["Family"].map(lambda x: "Yes" if x == 1 else "No")

    label_cols = ['Name', 'Ticket']
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked', "Alone"]
    numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'Survived', "Family"]

    # NOTE(review): if Pclass is stored as a number, get_dummies presumably
    # passes it through unchanged rather than one-hot encoding it - confirm
    # this is the intended treatment.
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    # Each column gets its own freshly fitted encoder (see label_encoder).
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

def test_preprocess(df):
    """Build the feature table for a frame that has no ``Survived`` column.

    Applies the same cleaning steps as the training preprocessing, but the
    returned frame carries no target. The caller's frame is left untouched:
    all cleaning happens on a copy, so the calling cell can be re-run safely.
    (Previously the input was modified in place, which meant a second call on
    the same frame log-transformed ``Fare`` twice and collapsed every
    ``Ticket`` to 'X'.)

    Steps, all computed from the rows of ``df`` itself:
      * Age      - missing values get the mean age of the passenger's Pclass.
      * Cabin    - reduced to its first character; missing becomes 'X'.
      * Ticket   - first whitespace-separated token when the ticket has more
                   than one token, otherwise 'X' (missing is also 'X').
      * Fare     - missing values get the column mean, then natural log for
                   positive values and 0 for everything else.
      * Embarked - missing becomes 'X'.
      * Name     - the text before the first comma.
      * Family   - SibSp + Parch + 1; Alone is "Yes" when Family == 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain Age, Pclass, Cabin, Ticket, Fare, Embarked, Name, Sex,
        SibSp and Parch.

    Returns
    -------
    pandas.DataFrame
        Numerical columns, label-encoded Name and Ticket, and the
        ``pd.get_dummies`` expansion of the categorical columns, concatenated
        column-wise in that order.
    """
    # Copy first so none of the assignments below leak into the caller's frame.
    df = df.copy()

    # Mean age per passenger class, used to impute missing ages.
    age_map = df[['Age', 'Pclass']].dropna().groupby('Pclass').mean().to_dict()
    df['Age'] = df['Age'].fillna(df['Pclass'].map(age_map['Age']))

    df['Cabin'] = df['Cabin'].fillna('X').map(lambda x: x[0].strip())
    df['Ticket'] = df['Ticket'].fillna('X').map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else 'X'
    )

    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    # log is undefined for non-positive fares; those are mapped to 0 instead.
    df['Fare'] = df['Fare'].map(lambda i: np.log(i) if i > 0 else 0)

    df['Embarked'] = df['Embarked'].fillna('X')
    df['Name'] = df['Name'].map(lambda x: x.split(',')[0])

    df["Family"] = df["SibSp"] + df["Parch"] + 1
    df["Alone"] = df["Family"].map(lambda x: "Yes" if x == 1 else "No")

    label_cols = ['Name', 'Ticket']
    onehot_cols = ['Pclass', 'Sex', 'Cabin', 'Embarked', "Alone"]
    numerical_cols = ['Age', 'SibSp', 'Parch', 'Fare', "Family"]

    # NOTE(review): if Pclass is stored as a number, get_dummies presumably
    # passes it through unchanged rather than one-hot encoding it - confirm
    # this is the intended treatment.
    onehot_encoded_df = pd.get_dummies(df[onehot_cols])
    # Each column gets its own freshly fitted encoder (see label_encoder).
    label_encoded_df = df[label_cols].apply(label_encoder)
    numerical_df = df[numerical_cols]
    return pd.concat([numerical_df, label_encoded_df, onehot_encoded_df], axis=1)

In [4]:
# Build the training features and split the target off into `y`.
train = preprocess(df=train_df)
y = train.pop("Survived")

# get_dummies runs separately on the train and test frames, so the two tables
# are not guaranteed to share the same columns or column order. Reindex the
# test features to train's columns: a category absent from test becomes an
# all-zero column, and a column the model never saw is dropped.
test_features = test_preprocess(df=test_df).reindex(columns=train.columns, fill_value=0)

# Fit the scaler on the training data only and reuse those statistics for the
# test data. Fitting a second time on test (as before) standardised the two
# sets with different means/variances, so the model was scored on features
# that were on a different scale from the ones it was trained on.
# NOTE(review): Name/Ticket are still label-encoded independently per frame,
# so their integer codes are not comparable between train and test.
scaler = StandardScaler()
x = scaler.fit_transform(train.values)
test = scaler.transform(test_features.values)


In [5]:
 # Keyword arguments for the LogisticRegression fitted in the next cell.
 # NOTE(review): solver/max_iter/C look like the output of a tuning run
 # (optuna is imported above) - confirm where they came from.
 params = {
     'solver': 'sag',
     'class_weight': None,
     'max_iter': 703,
     'C': 1481.8092814948996,
     # 'sag' shuffles the data while fitting; pin the seed so that
     # Restart & Run All reproduces the same coefficients and predictions.
     'random_state': 42,
 }

In [6]:
# Instantiate the classifier from `params` and train it on the scaled
# training matrix; the fitted estimator is the cell's displayed value.
clf = LogisticRegression(**params)
clf.fit(X=x, y=y)

LogisticRegression(C=1481.8092814948996, max_iter=703, solver='sag')

In [7]:
# Predicted class label for every row of the prepared test matrix.
preds = clf.predict(X=test)

In [8]:
# Submission table: the test ids next to the predicted label for each row.
submit = test_df[["PassengerId"]].assign(Survived=preds)
# Show the first ten rows as a quick check.
submit.head(n=10)

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
5,100005,0
6,100006,1
7,100007,0
8,100008,1
9,100009,0


In [9]:
# Write the submission without the index column.
submit.to_csv(path_or_buf="submission.csv", index=False)