In [None]:
import pandas as pd
import csv


In [None]:
import warnings
# NOTE(review): with no category/module arguments this installs an 'ignore'
# filter that matches every warning, so any warning raised by later cells is
# hidden for the rest of the session. Consider narrowing it to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Read the two CSV files from the relative '../input/...' directory into
# DataFrames named `train` and `test`.
train = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_train.csv')
test = pd.read_csv('../input/hr-analytics-job-change-of-data-scientists/aug_test.csv')
# Last expression of the cell: display the first rows of the training frame.
train.head()

In [None]:
# Remove the 'enrollee_id' column from the training frame.
# `train` is rebound to the result, so running this cell a second time would
# otherwise raise KeyError (the column is already gone); errors='ignore'
# makes the cell safe to re-run.
train = train.drop(columns='enrollee_id', errors='ignore')

In [None]:
def describe_data(df: pd.DataFrame) -> None:
    """Print a quick structural summary of a DataFrame.

    Writes four labelled sections to stdout: the per-column dtypes, the
    ``(rows, columns)`` shape, the column index, and the fraction of
    missing values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is reported via ``print``.
    """
    print("Data Types:")
    print(df.dtypes)
    print("Rows and Columns:")
    print(df.shape)
    print("Column Names:")
    print(df.columns)
    print("Null Values:")
    # The mean of the boolean null mask is the per-column *fraction* of
    # missing values (0.0-1.0, not a count). Computed vectorised instead of
    # running Python's builtin sum() over every cell of every column.
    print(df.isnull().mean())

In [None]:
# Print dtypes, shape, column names and per-column null fractions for `train`.
describe_data(train)

In [None]:
# Numeric predictors: the int64/float64 columns, excluding the 'target' column.
numeric_features = (
    train
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['target'])
    .columns
)
# Categorical predictors: every object-dtype column.
categorical_features = (
    train
    .select_dtypes(include=['object'])
    .columns
)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer
# Route each column group through its own sub-pipeline:
# 'num' handles numeric_features, 'cat' handles categorical_features.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the predictors from the 'target' column.
X = train.drop('target', axis=1)
y = train['target']
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the split (and any score computed on it) is the
# same on every Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
# End-to-end model: the column preprocessor followed by a logistic-regression
# classifier using the lbfgs solver.
lr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(solver='lbfgs')),
    ]
)

In [None]:
# Fit the whole pipeline on the training split, then report the pipeline's
# score on the held-out split, formatted to three decimals.
lr.fit(X_train, y_train)
print(f"model score: {lr.score(X_test, y_test):.3f}")

In [None]:
# Copy of the test frame without the 'enrollee_id' column; `test` itself is
# left untouched so the ids remain available later.
test_no_id = test.drop(columns='enrollee_id')

In [None]:
# Run the fitted pipeline on the id-less test frame to get one predicted
# label per row.
test_predictions = lr.predict(test_no_id)

In [None]:
# Pair each test row's id with its predicted label in a two-column frame.
enrollee_id = test['enrollee_id']
submission_df_1 = pd.DataFrame(
    data={
        "enrollee_id": enrollee_id,
        "target": test_predictions,
    }
)

In [None]:
# Write the submission frame to 'submission_1.csv' in the working directory,
# without the DataFrame index column.
submission_df_1.to_csv('submission_1.csv', index=False)