In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training set, the test set and the sample submission, in that order.
df_train, df_test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-nov-2021/train.csv',
        '../input/tabular-playground-series-nov-2021/test.csv',
        '../input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

Let's have an overview of the data

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after each summary.
df_train.info()
print('*****')
df_test.info()

In [None]:
# (rows, columns) of the train set, then of the test set, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Select the feature columns by their "f" name prefix. A substring test
# ('f' in col) would also pick up any other column whose name merely contains
# the letter "f".
# NOTE(review): assumes every feature column name starts with "f" (e.g. f0, f1, ...)
# and that no non-feature column does — confirm against df_train.columns.
features = [col for col in df_train.columns if col.startswith('f')]

In [None]:
# Show the selected feature column names as a sanity check.
print(features)

### Observing the target distribution in the train data

In [None]:
import seaborn as sns

# Bar chart of how many training rows fall into each target class.
sns.countplot(data=df_train, x='target')

### Conclusion
The number of rows with target = 0 and with target = 1 is roughly the same, so the classes are balanced. This is one less thing to worry about :)

### Observing the distribution of data in the train and test set

In [None]:
import matplotlib.pyplot as plt

# Overlay the train and test histograms of every feature so their
# distributions can be compared visually, one figure per feature.
for feature in features:
    train_values = df_train[feature]
    test_values = df_test[feature]

    # Give both histograms the same value range so they share identical bin
    # edges; with independently chosen bins the bars have different widths and
    # their heights cannot be compared.
    value_range = (
        min(train_values.min(), test_values.min()),
        max(train_values.max(), test_values.max()),
    )

    plt.hist(train_values, bins=30, range=value_range, alpha=0.5, label='Train set')
    plt.hist(test_values, bins=30, range=value_range, alpha=0.5, label='Test set')
    plt.title(feature + " Train/Test")
    plt.xlabel(feature)
    plt.ylabel('Frequency')

    plt.legend()
    plt.show()

### Conclusion

The train and test data distributions are similar. This is great!!!

In [None]:
# Total number of missing cells across the feature columns: train first, then test.
for frame in (df_train, df_test):
    missing_cells = frame[features].isna().sum().sum()
    print(missing_cells)

### No missing values in train and test data. That is one less thing to worry about

In [None]:
# Work on copies so later transformations leave the loaded frames untouched.
# X / y hold the training features and labels; x_test holds the features of
# the test set loaded from test.csv.
y = df_train['target'].copy()
X = df_train[features].copy()
x_test = df_test[features].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the training features only,
# then apply that same transform to both the training and the test features.
# NOTE(review): the scaler is fit on all training rows before the later
# train_test_split, so the held-out rows also contribute to the statistics.
scaler = StandardScaler()
scaler.fit(X[features])
X[features] = scaler.transform(X[features])
x_test[features] = scaler.transform(x_test[features])

In [None]:
# Plot every scaled feature against the target, one line per class, so the
# value ranges covered by the two classes can be compared.
# The class masks do not depend on the feature, so they are built once here
# rather than on every loop iteration.
target = df_train['target']
is_class_0 = target == 0
is_class_1 = target == 1

for feature in features:
    plt.plot(X.loc[is_class_0, feature], target[is_class_0], label='target = 0')
    plt.plot(X.loc[is_class_1, feature], target[is_class_1], label='target = 1')
    plt.xlabel(feature)
    plt.ylabel('target')

    # The lines need labels: without them plt.legend() has nothing to show
    # and emits a warning instead of drawing a legend.
    plt.legend()
    plt.show()

The range of values for each feature is the same for target = 0 and target = 1.

## Implementing Logistic Regression model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 33% of the training rows for validation (fixed seed for a
# repeatable split). Note that X_test / y_test are this held-out part of the
# training data, not the x_test frame built from test.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit on the training part, then predict class labels for the held-out part.
model = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the held-out labels against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix : \n", cm)

### True positive + True negative = 71488 + 74826 = 146314
### False positive + False negative = 26309 + 25377 = 51686

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the true label.
validation_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy : ", validation_accuracy)

In [None]:
# Predict the probability of the positive class for the test set rather than
# hard 0/1 labels: a ranking metric such as ROC AUC needs continuous scores,
# and thresholded labels throw that information away.
# NOTE(review): assumes the competition is scored on ROC AUC — confirm on the
# competition's evaluation page.
# Column 1 of predict_proba corresponds to model.classes_[1], which is the
# class 1 for a 0/1 target.
logistic_prediction = model.predict_proba(x_test)[:, 1]

In [None]:
# Assemble the submission table: each test row's id next to its prediction.
logistic_csv = pd.DataFrame({
    'id': df_test['id'],
    'target': logistic_prediction,
})

In [None]:
# Write the submission (without the index column). The file name carries a
# .csv extension so the output is recognisable as a CSV file; the original
# name 'logistic' had no extension at all.
logistic_csv.to_csv('logistic.csv', index=False)