<a href="https://www.kaggle.com/code/poushalimukherjee/logreg-pg-series-s3e3?scriptVersionId=116814456" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import ( OneHotEncoder,
                                    StandardScaler )
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import ( accuracy_score,
                              confusion_matrix,
                              precision_score, recall_score, f1_score,
                              precision_recall_curve  )

## Prepare Train / Test Data

In [2]:
# Load the train and test CSVs from the Kaggle input directory
# (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e3/train.csv',
        '/kaggle/input/playground-series-s3e3/test.csv',
    )
)

In [3]:
# Use 'id' as the row index; the prediction cell later reads the ids back
# from X_test.index.
# The guard makes this cell safe to re-run: once 'id' is the index it is no
# longer a column, and an unconditional set_index('id') would raise KeyError.
if df_train.index.name != 'id':
    df_train = df_train.set_index('id')
if df_test.index.name != 'id':
    df_test = df_test.set_index('id')

# Separate the feature columns from the 'Attrition' target.
X = df_train.drop(columns='Attrition')
y = df_train['Attrition']

In [4]:
# Hold out 30% of the labelled rows for validation.
# stratify=y keeps the Attrition class ratio identical in both splits; the
# notebook uses balanced class weights further down, i.e. the target is
# treated as imbalanced, so an unstratified split can skew the validation
# metrics.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# The test frame is used unchanged as the feature matrix for the final predictions.
X_test = df_test

## Pre-Process Data

In [5]:
# Columns routed to the one-hot encoder.
features_cat = [
    'Gender',
    'BusinessTravel',
    'Department',
    'EducationField',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'MaritalStatus',
    'OverTime',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance',
]

# Columns routed to the impute-then-scale pipeline.
features_num = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'HourlyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [6]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

In [7]:
# Categorical branch: one-hot encode; handle_unknown='ignore' lets transform
# accept categories that were not present when the encoder was fitted.
# NOTE: the variable name is misspelled ("categotical"), but the
# ColumnTransformer cell references it under exactly this name, so it is kept.
categotical_transformer = OneHotEncoder(handle_unknown='ignore')

In [8]:
# Send each feature group through its own transformer. No `remainder` is
# given, so scikit-learn's default applies to columns in neither list.
# verbose_feature_names_out=False keeps output names free of the
# "num__" / "cat__" prefixes.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, features_num),
        ('cat', categotical_transformer, features_cat),
    ],
    verbose_feature_names_out=False,
)

## Build Model

In [9]:
# Hand-picked alternative weighting; defined here but not the one passed to
# the model (the model cell uses `class_weights`).
class_weights_manual = {0: 1., 1: 5.}

# 'balanced' weights, returned in the same order as `classes`.
unique_classes = np.unique(y_train)
class_weights_array = class_weight.compute_class_weight(
    'balanced',
    classes=unique_classes,
    y=y_train,
)

# Key the mapping by the actual class label rather than by its position in
# `unique_classes`. Keying by position only works when the labels happen to be
# exactly 0..k-1; with any other labels the estimator would receive weights
# for classes that do not exist. `.tolist()` yields plain Python scalars as keys.
class_weights = dict(zip(unique_classes.tolist(), class_weights_array))

In [10]:
# Number of training rows (kept from the original cell; not used by the model definition).
samples = X_train.shape[0]

# L2-regularised logistic regression with per-class weights.
# The 'saga' solver shuffles the data, so the fit is stochastic: random_state
# is pinned (same seed as the train/valid split) so that a Restart & Run All
# reproduces the same coefficients, metrics and submission.
log_reg = LogisticRegression(
    penalty='l2',
    C=0.25,
    solver='saga',
    class_weight=class_weights,
    max_iter=500,
    tol=0.0001,
    random_state=42,
    verbose=0,
)

# Full model: preprocessing followed by the classifier, so the preprocessing
# statistics are learned only from the data passed to fit().
model_log_reg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('logreg', log_reg),
    ]
)

In [11]:
# Fit preprocessing and classifier on the training split; the fitted pipeline
# is the cell's displayed value.
model_log_reg.fit(X=X_train, y=y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['Age', 'DailyRate',
                                                   'DistanceFromHome',
                                                   'HourlyRate',
                                                   'MonthlyIncome',
                                                   'MonthlyRate',
                                                   'NumCompaniesWorked',
                                                   'PercentSalaryHike',
                                                   'TotalWorkingYears',
         

## Performance Analysis

In [12]:
# Hard labels and per-class probabilities for the validation split.
y_pred = model_log_reg.predict(X_valid)
y_pred_proba = model_log_reg.predict_proba(X_valid)

# Validation metrics computed from the hard labels.
conf_matrix = confusion_matrix(y_true=y_valid, y_pred=y_pred)
score_f1 = f1_score(y_true=y_valid, y_pred=y_pred)
score_accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)

## Predicting Test-Data

In [13]:
# Hard labels and per-class probabilities for the test set.
y_pred_test = model_log_reg.predict(X=X_test)
y_pred_proba_test = model_log_reg.predict_proba(X=X_test)

# predict_proba returns an (n_samples, 2) array with one column per class, in
# the order of the classifier's classes_ (0, then 1). The submission needs
# P(Attrition = 1), i.e. column 1.
# Assigning the whole 2-D array to a single DataFrame column and then taking
# `1 - value` only works if pandas silently keeps the first column; that is
# version-dependent and may raise instead. Selecting the column explicitly
# gives the same probability without relying on that behaviour.
df_pred_test = pd.DataFrame({
    'id': X_test.index,
    'Attrition_label': y_pred_test,
    'Attrition': y_pred_proba_test[:, 1],
})

## Submission

In [14]:
# Write only the id and the predicted probability; the hard label is dropped.
submission = df_pred_test.drop(columns='Attrition_label')
submission.to_csv('submission.csv', index=False)