In [1]:
import pandas as pd

# Load the datasets
X_train_path = "train_X.csv"
y_train_path = "train_y.csv"

X_train = pd.read_csv(X_train_path)
y_train = pd.read_csv(y_train_path)

# Display basic info about the datasets
X_train.info(), y_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20413 entries, 0 to 20412
Data columns (total 28 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   ID                                     20413 non-null  int64  
 1   DIVISION_NUMBER                        20413 non-null  int64  
 2   PRODUCT_NUMBER                         20413 non-null  int64  
 3   PURCHASE_ORDER_DUE_DATE                20413 non-null  object 
 4   COMPANY_VENDOR_NUMBER                  20413 non-null  int64  
 5   SHIP_FROM_VENDOR                       20413 non-null  int64  
 6   ORDER_DATE                             20413 non-null  object 
 7   ORDER_DAY_OF_WEEK                      20413 non-null  int64  
 8   PRODUCT_CLASSIFICATION                 20413 non-null  int64  
 9   PURCHASE_ORDER_TYPE                    20413 non-null  int64  
 10  DISTANCE_IN_MILES                      20413 non-null  float64
 11  DI

(None, None)

## Instructions {-}

1. This template serves as the required format for your code and report submission for the Prediction Problem assignment.
2. You may modify the template to improve readability or add relevant details, but it must include all requested information.
3. Ensure that your work is reproducible, meaning your code should consistently yield a metric value close to your Kaggle leaderboard score despite inherent randomness in data science.

## 1) Exploratory Data Analysis (EDA)

* Summarize key insights obtained from the dataset.
* Discuss trends, correlations, or anomalies that influenced your modeling decisions.
* Provide relevant data visualizations (e.g., histograms, scatter plots, correlation matrices) to support your findings.

## 2) Data Preprocessing

Describe any preprocessing steps performed on your dataset. This may include imputing missing values, creating dummy variables, combining levels of categorical variable(s), discarding predictors that are not useful, etc.

In [None]:
# Put your data preparation code with comments here
# The code should end when you obtain the data used for the model in Question 4

In [2]:
# Check missing values
missing_values = X_train.isnull().sum()
missing_values = missing_values[missing_values > 0]

# Summarize categorical features
categorical_features = X_train.select_dtypes(include=['object', 'int64']).nunique()

# Display missing values and categorical feature summary
missing_values, categorical_features


(AVERAGE_DAILY_DEMAND_CASES          41
 AVERAGE_VENDOR_ORDER_CYCLE_DAYS    339
 AVERAGE_ORDER_CYCLE_DAYS           339
 AVERAGE_ORDER_CYCLE_CASES          339
 dtype: int64,
 ID                                 20413
 DIVISION_NUMBER                        3
 PRODUCT_NUMBER                      4498
 PURCHASE_ORDER_DUE_DATE              137
 COMPANY_VENDOR_NUMBER                344
 SHIP_FROM_VENDOR                     371
 ORDER_DATE                            89
 ORDER_DAY_OF_WEEK                      6
 PRODUCT_CLASSIFICATION                35
 PURCHASE_ORDER_TYPE                    2
 DIVISION_CODE                          3
 PURCHASE_FROM_VENDOR                 465
 DAYS_BETWEEN_ORDER_AND_DUE_DATE       65
 DUE_DATE_WEEKDAY                       7
 PRODUCT_MARKET                      5160
 RESERVABLE_INDICATOR                   1
 PRODUCT_STATUS                         1
 dtype: int64)

- Handle missing values.
- Convert date columns into numerical features.
- Encode categorical variables.
- Scale numerical features and create interaction terms.

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from datetime import datetime

# Drop non-informative columns
X_train = X_train.drop(columns=['RESERVABLE_INDICATOR', 'PRODUCT_STATUS'])

# Convert date columns to datetime format
X_train['PURCHASE_ORDER_DUE_DATE'] = pd.to_datetime(X_train['PURCHASE_ORDER_DUE_DATE'])
X_train['ORDER_DATE'] = pd.to_datetime(X_train['ORDER_DATE'])

# Create new date-related features
X_train['ORDER_YEAR'] = X_train['ORDER_DATE'].dt.year
X_train['ORDER_MONTH'] = X_train['ORDER_DATE'].dt.month
X_train['ORDER_DAY'] = X_train['ORDER_DATE'].dt.day

X_train['DUE_YEAR'] = X_train['PURCHASE_ORDER_DUE_DATE'].dt.year
X_train['DUE_MONTH'] = X_train['PURCHASE_ORDER_DUE_DATE'].dt.month
X_train['DUE_DAY'] = X_train['PURCHASE_ORDER_DUE_DATE'].dt.day

# Create a feature for the number of days between order and due date
X_train['ORDER_TO_DUE_DAYS'] = (X_train['PURCHASE_ORDER_DUE_DATE'] - X_train['ORDER_DATE']).dt.days

# Drop original date columns
X_train = X_train.drop(columns=['PURCHASE_ORDER_DUE_DATE', 'ORDER_DATE'])

# Define categorical and numerical columns
categorical_cols = ['DIVISION_NUMBER', 'PURCHASE_ORDER_TYPE', 'DIVISION_CODE']
numerical_cols = [col for col in X_train.columns if col not in categorical_cols + ['ID']]

# Define preprocessing pipeline
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Apply transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Fit and transform the data
X_processed = preprocessor.fit_transform(X_train)

# Convert target variable to binary format
y_train = y_train['ON_TIME_AND_COMPLETE'].astype(int)

# Display transformed data shape
X_processed.shape


(20413, 35)

## 3) Feature Engineering

* List and explain the new features you created (if any).
* Justify why these features were added and how they contribute to improving the model.

Mention the logical/intuitive steps you took to obtain the final model. This may include identifying transformations, significant interactions, variable selection, etc. You do not need to put any code here.

## 4) Model Selection and Training

Put your model here.

In [2]:
# Put the code that develops the model using the data you processed in Question 2, 
# and then uses the developed model on test data for prediction.

### Model 1

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and validation sets
X_train_final, X_val, y_train_final, y_val = train_test_split(X_processed, y_train, test_size=0.2, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_final, y_train_final)

# Make predictions
y_pred = log_reg.predict(X_val)

# Evaluate model performance
accuracy = accuracy_score(y_val, y_pred)
classification_rep = classification_report(y_val, y_pred)

accuracy, classification_rep


(0.6882194464854274,
 '              precision    recall  f1-score   support\n\n           0       0.66      0.80      0.72      2077\n           1       0.74      0.57      0.64      2006\n\n    accuracy                           0.69      4083\n   macro avg       0.70      0.69      0.68      4083\nweighted avg       0.70      0.69      0.68      4083\n')

# Model 2 Adding Interaction Terms

In [5]:
from sklearn.preprocessing import PolynomialFeatures

# Generate interaction terms
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_processed)

# Split data again after generating interaction terms
X_train_final, X_val, y_train_final, y_val = train_test_split(X_interactions, y_train, test_size=0.2, random_state=42)

# Train Logistic Regression with interactions
log_reg_inter = LogisticRegression(max_iter=1000, random_state=42)
log_reg_inter.fit(X_train_final, y_train_final)

# Make predictions
y_pred_inter = log_reg_inter.predict(X_val)

# Evaluate model
accuracy_inter = accuracy_score(y_val, y_pred_inter)
classification_rep_inter = classification_report(y_val, y_pred_inter)

accuracy_inter, classification_rep_inter


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.7844722018123929,
 '              precision    recall  f1-score   support\n\n           0       0.76      0.83      0.80      2077\n           1       0.81      0.73      0.77      2006\n\n    accuracy                           0.78      4083\n   macro avg       0.79      0.78      0.78      4083\nweighted avg       0.79      0.78      0.78      4083\n')

Address Convergence Warning:

Increase max_iter or switch to the saga solver.
Apply feature scaling more aggressively to improve convergence.
Regularization Tuning:

Optimize C (regularization strength) in Logistic Regression.
Compare L1 (Lasso) and L2 (Ridge) regularization to improve generalization.
Feature Selection:

Reduce dimensionality by selecting the most impactful features.
Remove redundant interaction terms.

In [6]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['lbfgs', 'saga'],  # Test different solvers
    'max_iter': [2000]  # Increase iteration limit to address convergence issues
}

# Perform Grid Search with Cross-Validation
log_reg_tuned = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
log_reg_tuned.fit(X_train_final, y_train_final)

# Get best model and evaluate on validation set
best_model = log_reg_tuned.best_estimator_
y_pred_best = best_model.predict(X_val)

# Evaluate best model
accuracy_best = accuracy_score(y_val, y_pred_best)
classification_rep_best = classification_report(y_val, y_pred_best)

best_model, accuracy_best, classification_rep_best


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(LogisticRegression(C=10, max_iter=2000),
 0.7822679402400196,
 '              precision    recall  f1-score   support\n\n           0       0.76      0.83      0.80      2077\n           1       0.81      0.73      0.77      2006\n\n    accuracy                           0.78      4083\n   macro avg       0.78      0.78      0.78      4083\nweighted avg       0.78      0.78      0.78      4083\n')

## 5) Model Prediction and Evaluation

In [None]:
# Generate prediction and report the accuracy of the model

Please note that your code for Questions 2, 4, and 5 will be executed sequentially, and it should generate a metric value close to the one displayed next to your name on the Kaggle leaderboard. While minor variations due to randomness in data science are expected, your implementation must be consistent and reproducible to receive full credit.

## 6) Complete Code Submission

* Provide a link to your Kaggle notebook or a GitHub repository containing your code.
* Ensure the notebook is properly commented and reproducible.

## 7) Reflection and Challenges

* Discuss any challenges faced during this process and how they were addressed.
* Mention any improvements you would make for the next steps