In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e8/train.csv",
        "/kaggle/input/playground-series-s5e8/test.csv",
    )
)

# Preview the first rows of the training data.
train.head(5)

**DATA PREPROCESSING**

In [None]:
# Inspect the 'job' feature: which categories occur and how many there are.
job_column = train['job']

# Distinct job categories
unique_jobs = job_column.unique()

# Count of distinct job categories
num_unique_jobs = job_column.nunique()

# Report the count first, then the categories themselves
print("Number of unique job values:", num_unique_jobs)
print("Unique job values:", unique_jobs)


In [None]:
# Frequency table: how many training rows fall into each job category.
job_column = train['job']
job_counts = job_column.value_counts()

print(job_counts)


In [None]:
# Average of the target 'y' within each job category.
# With y coded as 0/1 this average is the share of subscribers per job.
job_target_mean = train['y'].groupby(train['job']).mean()

# Highest subscription rate first
job_target_sorted = job_target_mean.sort_values(ascending=False)

# Show the ranking under a heading
print(
    "Jobs sorted by likelihood to subscribe to a bank term deposit:",
    job_target_sorted,
    sep="\n",
)


In [None]:
# ----------------------------
# TARGET ENCODING FOR 'job'
# ----------------------------
# Replaces each job category with the mean of 'y' observed for that category in
# the training set (job_target_sorted, computed in an earlier cell).

# Fallback for rows whose job has no entry in the mapping (a category that only
# occurs in the test set, or a missing value): the overall training mean of 'y'.
# Without it .map() leaves NaN, which LogisticRegression refuses to fit/predict on.
job_fallback_rate = train['y'].mean()

# Step 1: Encode the training set
train['job_encoded'] = train['job'].map(job_target_sorted).fillna(job_fallback_rate)

# Step 2: Encode the test set with the SAME training-derived mapping
# (test targets are unknown, so the mapping must come from train only)
test['job_encoded'] = test['job'].map(job_target_sorted).fillna(job_fallback_rate)

# NOTE(review): each training row is encoded with a mean that includes its own
# target, so metrics measured on the training data are optimistic. Out-of-fold
# target encoding would remove that leakage.

# Step 3: Check the result
# Display first 10 rows of original job, encoded value, and target
print(train[['job', 'job_encoded', 'y']].head(10))


In [None]:
# Inert cell: the triple-quoted string below is never executed as code. It keeps the
# ordinal (rank-based) label-encoding alternative for 'job' together with a pasted
# sample of its output, for comparison with the target encoding used above.
'''
# Step 1: Calculate subscription probability per job
job_target_mean = train.groupby('job')['y'].mean()

# Step 2: Sort jobs by probability in ascending order (less likely = 0, most likely = highest number)
job_sorted = job_target_mean.sort_values().index  # returns job names sorted by subscription probability

# Step 3: Create a mapping from job name to integer label
job_label_mapping = {job: idx for idx, job in enumerate(job_sorted)}

# Step 4: Apply this mapping to train and test sets
train['job_encoded'] = train['job'].map(job_label_mapping)
test['job_encoded'] = test['job'].map(job_label_mapping)

# Step 5: Check result
print(train[['job', 'job_encoded', 'y']].head(10))




output:

           job  job_encoded  y
0   technician            5  0
1  blue-collar            0  0
2  blue-collar            0  0
3      student           11  0
4   technician            5  1
5       admin.            4  0
6  blue-collar            0  0
7       admin.            4  0
8  blue-collar            0  0
9   management            8  0
'''

*Target encoding captures subtle differences between categories (e.g., 0.118321 vs 0.116453 is meaningful).*

*Label encoding does not: blue-collar = 0.067 vs entrepreneur = 0.081 → they get the integers 0 and 1, even though the real difference is very small.*

*So we choose target encoding.*

In [None]:
# Inspect the 'marital' feature.
marital_column = train['marital']

# Distinct categories and how many of them there are
marital_unique = marital_column.unique()
marital_count = marital_column.nunique()

# Categories first, then their count
print("Unique marital values:", marital_unique)
print("Number of unique marital values:", marital_count)


In [None]:
# ----------------------------
# TARGET ENCODING FOR 'marital'
# ----------------------------
# Replaces each marital status with the mean of 'y' observed for it in training.

# Step 1: Calculate mean subscription probability for each marital status
marital_target_mean = train.groupby('marital')['y'].mean()

# Step 2: Sort marital statuses by probability of subscribing (for inspection only;
# .map() looks values up by label, so the order does not affect the encoding)
marital_target_sorted = marital_target_mean.sort_values(ascending=False)

# Fallback for values with no entry in the mapping (test-only category or missing
# value): the overall training mean of 'y'. Without it .map() leaves NaN, which
# LogisticRegression refuses to fit/predict on.
marital_fallback_rate = train['y'].mean()

# Step 3: Map the probabilities to the training set
train['marital_encoded'] = train['marital'].map(marital_target_sorted).fillna(marital_fallback_rate)

# Step 4: Apply the same training-derived mapping to the test set
test['marital_encoded'] = test['marital'].map(marital_target_sorted).fillna(marital_fallback_rate)

# NOTE(review): training rows are encoded with a mean that includes their own target;
# out-of-fold encoding would avoid that leakage.

# Step 5: Check the result
print(train[['marital', 'marital_encoded', 'y']].head(10))


In [None]:
# Preview the first five rows of the training frame in its current state.
train.head(5)

In [None]:
# The raw 'job' and 'marital' text columns are replaced by their encoded versions,
# so remove them from both frames.
already_encoded = ['job', 'marital']
train, test = (frame.drop(columns=already_encoded) for frame in (train, test))

# Confirm the columns are gone
print(train.head())


In [None]:
# Inert cell: the triple-quoted string below is explanatory prose only. It walks
# through how `train.groupby('marital')['y'].mean()` produces a per-category
# mean of the target; nothing in it is executed.
'''how does this works 
   # Step 1: Calculate mean subscription probability for each marital status
   marital_target_mean = train.groupby('marital')['y'].mean()


Explanation:

train.groupby('marital')

This groups the dataset by the values in the marital column.

For example, all rows where marital = married are grouped together, all rows with marital = single are another group, etc.

['y']

We select only the target column y (which indicates whether a client subscribed: 1 = yes, 0 = no).

.mean()

Calculates the average of y for each marital group.

Since y is 0 or 1, the mean is essentially the probability of subscription for that marital status.

Example: if 100 married people are in the dataset and 15 subscribed (y=1), the mean is 15/100 = 0.15.

'''

In [None]:
# Inspect the 'education' feature.
education_column = train['education']

# Distinct levels and how many of them there are
education_unique = education_column.unique()
education_count = education_column.nunique()

print("Unique education values:", education_unique)
print("Number of unique education values:", education_count)

# Row count per education level
print(education_column.value_counts())


In [None]:
# ----------------------------
# TARGET ENCODING FOR 'education'
# ----------------------------
# Replaces each education level with the mean of 'y' observed for it in training.

# Step 1: Calculate mean subscription probability for each education level
education_target_mean = train.groupby('education')['y'].mean()

# Step 2: Sort education levels by probability of subscribing (for inspection only;
# .map() looks values up by label, so the order does not affect the encoding)
education_target_sorted = education_target_mean.sort_values(ascending=False)

# Fallback for values with no entry in the mapping (test-only category or missing
# value): the overall training mean of 'y'. Without it .map() leaves NaN, which
# LogisticRegression refuses to fit/predict on.
education_fallback_rate = train['y'].mean()

# Step 3: Map the probabilities to the training set
train['education_encoded'] = train['education'].map(education_target_sorted).fillna(education_fallback_rate)

# Step 4: Apply the same training-derived mapping to the test set
test['education_encoded'] = test['education'].map(education_target_sorted).fillna(education_fallback_rate)

# NOTE(review): training rows are encoded with a mean that includes their own target;
# out-of-fold encoding would avoid that leakage.

# Step 5: Check the result
print(train[['education', 'education_encoded', 'y']].head(10))


In [None]:
# Inspect the 'default' feature.
default_column = train['default']

# Distinct values and how many of them there are
default_unique = default_column.unique()
default_count = default_column.nunique()

print("Unique default values:", default_unique)
print("Number of unique default values:", default_count)

# Row count per value
print(default_column.value_counts())


In [None]:
# Binary encoding for 'default': 'no' -> 0, 'yes' -> 1.
# The encoded column is appended and the original text column removed in one chain.
default_codes = {'no': 0, 'yes': 1}

train = train.assign(default_encoded=train['default'].map(default_codes)).drop(columns=['default'])
test = test.assign(default_encoded=test['default'].map(default_codes)).drop(columns=['default'])

# Show the encoded flag next to the target
print(train[['default_encoded', 'y']].head(10))


In [None]:
# 'education_encoded' now carries this information, so remove the raw 'education'
# column from both frames.
train, test = (frame.drop(columns=['education']) for frame in (train, test))


In [None]:
# Inspect the numeric 'balance' feature: cardinality and missing values.
balance_column = train['balance']

# How many distinct balances occur
balance_unique_count = balance_column.nunique()
print("Number of unique balance values:", balance_unique_count)

# The distinct balances themselves (kept in a variable; printing is left disabled
# because the list may be very long)
balance_unique_values = balance_column.unique()
#print("Unique balance values:", balance_unique_values)

# How many rows have no balance recorded
balance_null_count = balance_column.isna().sum()
print("Number of null values in balance:", balance_null_count)


In [None]:
# Inspect the 'housing' feature.
housing_column = train['housing']

# Distinct values
housing_unique = housing_column.unique()
print("Unique housing values:", housing_unique)

# Count of distinct values
print("Number of unique housing values:", housing_column.nunique())

# Row count per value
print(housing_column.value_counts())


In [None]:
# Binary encoding for 'housing': 'no' -> 0, 'yes' -> 1.
# The encoded column is appended and the original text column removed in one chain.
housing_codes = {'no': 0, 'yes': 1}

train = train.assign(housing_encoded=train['housing'].map(housing_codes)).drop(columns=['housing'])
test = test.assign(housing_encoded=test['housing'].map(housing_codes)).drop(columns=['housing'])

# Show the encoded flag next to the target
print(train[['housing_encoded', 'y']].head(10))


In [None]:
# Inspect the 'loan' feature.
loan_column = train['loan']

# Distinct values
loan_unique = loan_column.unique()
print("Unique loan values:", loan_unique)

# Count of distinct values
print("Number of unique loan values:", loan_column.nunique())

# Row count per value
print(loan_column.value_counts())


In [None]:
# Binary encoding for 'loan': 'no' -> 0, 'yes' -> 1.
# The encoded column is appended and the original text column removed in one chain.
loan_codes = {'no': 0, 'yes': 1}

train = train.assign(loan_encoded=train['loan'].map(loan_codes)).drop(columns=['loan'])
test = test.assign(loan_encoded=test['loan'].map(loan_codes)).drop(columns=['loan'])

# Show the encoded flag next to the target
print(train[['loan_encoded', 'y']].head(10))


In [None]:
# Inspect the 'contact' feature.
contact_column = train['contact']

# Distinct values
contact_unique = contact_column.unique()
print("Unique contact values:", contact_unique)

# Count of distinct values
print("Number of unique contact values:", contact_column.nunique())

# Row count per value
print(contact_column.value_counts())


In [None]:
# One-hot encode 'contact' for both frames, then align the two dummy frames on
# their columns so each has the full union of categories (absent ones filled with 0).
train_contact_ohe, test_contact_ohe = pd.get_dummies(train['contact'], prefix='contact').align(
    pd.get_dummies(test['contact'], prefix='contact'),
    join='outer',
    axis=1,
    fill_value=0,
)

# Swap the raw 'contact' column for its indicator columns (appended at the end)
train = pd.concat([train.drop(columns=['contact']), train_contact_ohe], axis=1)
test = pd.concat([test.drop(columns=['contact']), test_contact_ohe], axis=1)

# Check first few rows
print(train.head(5))


In [None]:
# Inspect the 'day' feature.
day_column = train['day']

# Distinct values
day_unique = day_column.unique()
print("Unique day values:", day_unique)

# Count of distinct values
print("Number of unique day values:", day_column.nunique())

# Row count per value
print(day_column.value_counts())


In [None]:
# Look at raw 'day' values: the first ten rows, then the whole column
# (pandas abbreviates the printout of a long Series).
day_column = train['day']

print(day_column.iloc[:10])
print(day_column)


In [None]:
# Size of the training set, in rows
total_rows = len(train)
print("Total number of rows:", total_rows)


In [None]:
# Inspect the 'month' feature.
month_column = train['month']

# Distinct values
month_unique = month_column.unique()
print("Unique month values:", month_unique)

# Count of distinct values
print("Number of unique month values:", month_column.nunique())

# Row count per month
print(month_column.value_counts())


What is Cyclical Encoding?

Some features are cyclical, meaning the first and last values are close to each other in meaning:

Months: Dec → Jan

Hours: 23 → 0

Days of the week: Sun → Mon

If you treat them as numeric (1–12 for months), the model might think Dec (12) is far from Jan (1), which is misleading.

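Cyclical encoding fixes this using sine and cosine transformations: for a value $x$ with period $T$ (e.g. $T = 12$ for months), $x_{\sin} = \sin(2\pi x / T)$ and $x_{\cos} = \cos(2\pi x / T)$, so the last and first values of the cycle end up next to each other. Note that the following cell does not apply this transform; it target-encodes `month` instead.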

In [None]:
# ----------------------------
# TARGET ENCODING FOR 'month'
# ----------------------------
# Replaces each month label with the mean of 'y' observed for that month in training.

# Step 1: Calculate mean subscription probability for each month
month_target_mean = train.groupby('month')['y'].mean()

# Step 2: Sort months by subscription probability (for inspection only)
month_target_sorted = month_target_mean.sort_values(ascending=False)
print("Months sorted by likelihood to subscribe:")
print(month_target_sorted)

# Fallback for values with no entry in the mapping (a month label that only occurs
# in the test set, or a missing value): the overall training mean of 'y'. Without
# it .map() leaves NaN, which LogisticRegression refuses to fit/predict on.
month_fallback_rate = train['y'].mean()

# Step 3: Map the target mean to create 'month_encoded' in training set
train['month_encoded'] = train['month'].map(month_target_mean).fillna(month_fallback_rate)

# Step 4: Apply the same training-derived mapping to the test set
# (test targets are unknown, so the mapping must come from train only)
test['month_encoded'] = test['month'].map(month_target_mean).fillna(month_fallback_rate)

# NOTE(review): training rows are encoded with a mean that includes their own target;
# out-of-fold encoding would avoid that leakage.

# Step 5: Check the result
print(train[['month', 'month_encoded', 'y']].head(10))


In [None]:
# Inspect the numeric 'duration' feature.
duration_column = train['duration']

# Summary statistics (count, mean, spread, quartiles)
print(duration_column.describe())

# Cardinality
print("Number of unique duration values:", duration_column.nunique())

# First ten raw values
print(duration_column.iloc[:10])


In [None]:
# Inspect the 'campaign' feature.
campaign_column = train['campaign']

# Cardinality
print("Number of unique campaign values:", campaign_column.nunique())

# The distinct values themselves
print("Unique campaign values:", campaign_column.unique())

# Frequency of each value, ordered by the value rather than by frequency
print("Value counts for 'campaign':", campaign_column.value_counts().sort_index(), sep="\n")


In [None]:
# Inspect the 'pdays' feature.
pdays_column = train['pdays']

# Summary statistics
print(pdays_column.describe())

# Cardinality
print("Number of unique pdays values:", pdays_column.nunique())

# Only the first 20 distinct values, since there may be many
print("Unique pdays values (sample):", pdays_column.unique()[:20])

# Frequency of each value, ordered by the value rather than by frequency
print("Value counts for 'pdays':", pdays_column.value_counts().sort_index(), sep="\n")


In [None]:
# Inspect the 'previous' feature.
previous_column = train['previous']

# Cardinality
print("Number of unique values in 'previous':", previous_column.nunique())

# The distinct values themselves
print("Unique 'previous' values:", previous_column.unique())

# Frequency of each value, ordered by the value rather than by frequency
print("Value counts for 'previous':", previous_column.value_counts().sort_index(), sep="\n")


In [None]:
# Inspect the 'poutcome' feature.
poutcome_column = train['poutcome']

# Distinct categories
print("Unique 'poutcome' values:", poutcome_column.unique())

# Cardinality
print("Number of unique 'poutcome' values:", poutcome_column.nunique())

# Frequency of each category
print("Value counts for 'poutcome':", poutcome_column.value_counts(), sep="\n")


In [None]:
# One-hot encode 'poutcome' in both frames.
poutcome_dummies_train = pd.get_dummies(train['poutcome'], prefix='poutcome')
poutcome_dummies_test = pd.get_dummies(test['poutcome'], prefix='poutcome')

# get_dummies only creates columns for categories present in the frame it is given.
# Align the two dummy frames so train and test end up with the same indicator
# columns (a category absent from one side becomes an all-zero column there);
# otherwise the model would be fitted and scored on different feature sets.
# This mirrors what the 'contact' one-hot cell does.
poutcome_dummies_train, poutcome_dummies_test = poutcome_dummies_train.align(
    poutcome_dummies_test, join='outer', axis=1, fill_value=0
)

# Append the indicator columns to each frame
train = pd.concat([train, poutcome_dummies_train], axis=1)
test = pd.concat([test, poutcome_dummies_test], axis=1)

# The raw text column is no longer needed. Reassigning (rather than inplace=True)
# matches how the other cells of this notebook drop columns.
train = train.drop(columns=['poutcome'])
test = test.drop(columns=['poutcome'])

# Check the first few rows to see the one-hot encoded columns
print(train.head())


In [None]:
# Rich (table) preview of the first five rows after all encodings so far.
train.head(5)

In [None]:
# Show the column index of each frame so the two can be compared by eye.
print("Training set columns:", train.columns, sep="\n")

# Leading newline in the heading separates the two listings
print("\nTest set columns:", test.columns, sep="\n")


In [None]:
# Column names currently present in the training frame
train_column_index = train.columns
print(train_column_index)

In [None]:
# 'month_encoded' replaces the raw 'month' text column, so remove it from both frames.
train, test = (frame.drop(columns=['month']) for frame in (train, test))

# Columns left in the training frame
print(train.columns)


In [None]:
# Target vector: the 'y' column of the training frame
y_train = train['y']

# Training features: everything except the row identifier and the target
X_train = train.drop(['id', 'y'], axis=1)

# Test features: the test frame has no target, so only the identifier is removed
X_test = test.drop(['id'], axis=1)


In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that hold raw numeric measurements and therefore get standardised
numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

# Learn each column's mean and standard deviation from the training features only
scaler = StandardScaler().fit(X_train[numeric_cols])

# Apply those training statistics to both feature sets
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Inspect the scaled training features
print(X_train.head())


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Step 0: Hold-out validation.
# A score computed on the rows the model was fitted on says little about unseen data,
# so fit a throwaway model on 80% of the training rows and score it on the other 20%.
# stratify keeps the class balance of 'y' the same in both parts.
# NOTE(review): the target encodings and the scaler were fitted on ALL training rows
# in earlier cells, so this estimate is still somewhat optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
validation_model = LogisticRegression(max_iter=1000, random_state=42)
validation_model.fit(X_fit, y_fit)
validation_roc_auc = roc_auc_score(y_val, validation_model.predict_proba(X_val)[:, 1])
print(f"Validation ROC-AUC Score: {validation_roc_auc:.4f}")

# Step 1: Initialize the final model
model = LogisticRegression(max_iter=1000, random_state=42)

# Step 2: Train the final model on the full training data
model.fit(X_train, y_train)

# Step 3: Predict probabilities on the training set (column 1 = positive class)
y_train_pred_proba = model.predict_proba(X_train)[:, 1]

# Step 4: ROC-AUC on the training set (an upper bound; compare with the validation score)
roc_auc = roc_auc_score(y_train, y_train_pred_proba)
print(f"Training ROC-AUC Score: {roc_auc:.4f}")

# Step 5: Predict probabilities on the test set for submission
y_test_pred_proba = model.predict_proba(X_test)[:, 1]

# Step 6: Create submission DataFrame (one predicted probability per test id)
submission = pd.DataFrame({
    'id': test['id'],
    'y': y_test_pred_proba
})

# Step 7: Save submission file to the working directory
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")
