In [11]:
#! pip install lightgbm

In [31]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix
import operator
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [32]:
target = 'redemption_status'

features = ['age_range', 'c_coverage_brand', 'c_coverage_brandt', 'c_coverage_category', 'c_coverage_item', 'c_freq_brand', 'c_freq_brandt', 'c_freq_category', 'c_items_freq_brand', 'c_items_freq_brandt', 'c_items_freq_category', 'c_items_rare_brand', 'c_items_rare_brandt', 'c_items_rare_category', 'c_rare_brand', 'c_rare_brandt', 'c_rare_category', 'c_unique_brand', 'c_unique_brandt', 'c_unique_category', 'c_unique_items', 'campaign_id', 'campaign_type', 'coupon_id', 'customer_id', 'duration', 'family_size', 'income_bracket', 'marital_status', 'no_of_children']

categorical_columns = ['age_range', 'c_freq_brand', 'c_freq_brandt', 'c_freq_category', 'c_rare_brand', 'c_rare_brandt', 'c_rare_category', 'campaign_id', 'campaign_type', 'coupon_id', 'customer_id', 'family_size', 'income_bracket', 'marital_status', 'no_of_children', 'rented', 'overall_freq_brand', 'overall_rare_brand', 'overall_freq_brandt', 'overall_rare_brandt', 'overall_freq_category', 'overall_rare_category']

In [None]:
def preprocess(trainset, testset, categorical_columns, features, target):
    # Combine for consistent preprocessing
    dataset = pd.concat([trainset, testset], ignore_index=True).copy()

    # Fill missing values
    dataset = dataset.fillna(0)

    # Keep only needed columns
    available_features = [col for col in features if col in dataset.columns]
    available_categoricals = [col for col in categorical_columns if col in dataset.columns]

    # Convert to category dtype
    for column in available_categoricals:
        dataset[column] = dataset[column].astype('category')

    # Select only final features
    dataset = dataset[available_features]

    # Split back
    train_len = len(trainset)
    train_features = dataset.iloc[:train_len].reset_index(drop=True)
    test_features = dataset.iloc[train_len:].reset_index(drop=True)

    # Add back target
    trainset = pd.concat([trainset[[target]].reset_index(drop=True), train_features], axis=1)
    testset = test_features

    return trainset, testset



In [34]:
trainset = pd.read_csv(r'C:\Users\sahil\OneDrive\Pictures\Documents\OneDrive\Desktop\Project\Data\train1.csv', index_col='id', parse_dates=['start_date','end_date'])
trainset  = pd.DataFrame(trainset)
trainset

Unnamed: 0_level_0,redemption_status,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,...,overall_coverage_item,overall_coverage_brand,overall_coverage_brandt,overall_coverage_category,overall_podiscount,overall_pcdiscount,overall_ptdiscount,overall_podiscount_pq,overall_pcdiscount_pq,overall_ptdiscount_pq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,13,27,1053,0,2013-05-19,2013-07-05,47,46-55,0.0,...,0.002808,0.015195,1.0,0.421053,-0.219529,-0.001901,-0.221430,-0.226799,-0.002547,-0.229346
2,0,13,116,48,0,2013-05-19,2013-07-05,47,36-45,1.0,...,0.003294,0.018452,1.0,0.631579,-0.134105,-0.015566,-0.149671,-0.134672,-0.014163,-0.148835
6,0,9,635,205,1,2013-03-11,2013-04-12,32,46-55,1.0,...,0.007196,0.033647,1.0,0.578947,-0.172274,-0.021414,-0.193688,-0.149134,-0.019910,-0.169044
7,0,13,644,1050,0,2013-05-19,2013-07-05,47,,,...,0.002916,0.013205,1.0,0.421053,-0.204061,-0.009207,-0.213268,-0.171436,-0.002500,-0.173936
9,0,8,1017,1489,0,2013-02-16,2013-04-05,48,46-55,1.0,...,0.004415,0.019175,1.0,0.421053,-0.205019,-0.004710,-0.209729,-0.202754,-0.006626,-0.209380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128587,0,8,71,1523,0,2013-02-16,2013-04-05,48,70+,1.0,...,0.004807,0.032381,1.0,0.578947,-0.085659,0.000000,-0.085659,-0.080501,0.000000,-0.080501
128589,0,30,547,937,0,2012-11-19,2013-01-04,46,70+,1.0,...,0.003011,0.017366,1.0,0.473684,-0.217481,-0.004803,-0.222285,-0.202415,-0.005312,-0.207727
128590,0,8,754,1004,0,2013-02-16,2013-04-05,48,,,...,0.016148,0.057344,1.0,0.736842,-0.158425,-0.001542,-0.159967,-0.156793,-0.000941,-0.157734
128592,0,13,134,71,0,2013-05-19,2013-07-05,47,36-45,1.0,...,0.007534,0.026954,1.0,0.526316,-0.354192,-0.022114,-0.376306,-0.348428,-0.023312,-0.371739


In [35]:
testset = pd.read_csv(r'C:\Users\sahil\OneDrive\Pictures\Documents\OneDrive\Desktop\Project\Data\test1.csv', index_col='id', parse_dates=['start_date','end_date'])
testset  = pd.DataFrame(testset)
testset.head()

Unnamed: 0_level_0,campaign_id,coupon_id,customer_id,campaign_type,start_date,end_date,duration,age_range,marital_status,rented,...,overall_coverage_item,overall_coverage_brand,overall_coverage_brandt,overall_coverage_category,overall_podiscount,overall_pcdiscount,overall_ptdiscount,overall_podiscount_pq,overall_pcdiscount_pq,overall_ptdiscount_pq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,22,869,967,0,2013-09-16,2013-10-18,32,36-45,0.0,0.0,...,0.008884,0.038531,1.0,0.684211,-0.116013,-0.015037,-0.131051,-0.112131,-0.015335,-0.127466
4,20,389,1566,1,2013-09-07,2013-11-16,70,26-35,1.0,0.0,...,0.016391,0.053003,1.0,0.789474,-0.123949,-0.005637,-0.129586,-0.12924,-0.005853,-0.135093
5,22,981,510,0,2013-09-16,2013-10-18,32,26-35,0.0,0.0,...,0.013758,0.037988,1.0,0.578947,-0.32241,-0.019489,-0.3419,-0.332639,-0.0195,-0.352139
8,25,1069,361,1,2013-10-21,2013-11-22,32,18-25,0.0,0.0,...,0.004361,0.021346,1.0,0.421053,-0.18594,0.0,-0.18594,-0.179631,0.0,-0.179631
10,17,498,811,1,2013-07-29,2013-08-30,32,,,,...,0.009721,0.037808,1.0,0.578947,-0.196402,-0.00091,-0.197312,-0.184152,-0.000647,-0.184799


In [36]:
trainset, testset= preprocess(trainset, testset,categorical_columns,features,target)

print("Trainset size: {}".format(trainset.shape))
print("Testset size: {}".format(testset.shape))

Trainset size: (78369, 31)
Testset size: (50226, 30)


In [37]:
class Ensemble():
    def __init__(self, models):
        self.__models = models
    
    def predict(self, data):
        pred_sum = np.zeros(len(data))
        for model in self.__models:
            pred_sum = pred_sum + model.predict(data, num_iteration=model.best_iteration)
        return pred_sum / len(self.__models)

In [38]:
from sklearn.metrics import roc_auc_score

def run_lgbm(X_train, y_train, verbose):
    # Train-validation split
    X_tr, X_val, y_tr, y_val = train_test_split( trainset,testset, test_size=0.2, random_state=42)

    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting": "gbdt",
        "verbosity": -1,
        "seed": 41,
        "learning_rate": 0.08,
        "bagging_fraction": 0.9,
        "bagging_freq": 50,
        "feature_fraction": 0.4
    }

    lgtrain = lgb.Dataset(X_tr, label=y_tr)
    lgval = lgb.Dataset(X_val, label=y_val, reference=lgtrain)

    model = lgb.train(
        params,
        lgtrain,
        num_boost_round=3000,
        valid_sets=[lgval],
        valid_names=["valid_0"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=verbose if verbose else 0)
        ]
    )

    # Manual AUC score calculation on validation
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    auc_score = roc_auc_score(y_val, y_pred)
    print("num_leaves {}: AUC = {:.5f}".format(num_leaves, auc_score))

    return model


In [39]:
ensemble_model, train_auc = run_lgbm(X_tr,y_tr, verbose=100)
print("Training AUC Score:", train_auc)

NameError: name 'X_tr' is not defined

In [None]:
# Predict probabilities (or classes if required)
test_preds = ensemble_model.predict(testset)

print("test predictions", test_preds)

test predictions [1.56689549e-03 1.12429333e-05 3.86604076e-02 ... 6.17453768e-04
 1.74065828e-03 5.04899280e-05]


In [None]:
# Convert to DataFrame
submission = pd.DataFrame({
    'id': testset.index,  # index was set to 'id' while reading
    'redemption_status': test_preds  # use test_preds directly
})




In [None]:
# Normalize predictions between 0 and 1
submission['redemption_status'] = submission['redemption_status'] /

# Optional: Round to 0 or 1 if binary prediction needed
submission['predicted_class'] = (submission['redemption_status'] >= 0.5).astype(int)


In [None]:
# Save to CSV
submission.to_csv('test_result2.csv', index=False)
print("Submission saved as 'test_result2.csv'")

Submission saved as 'test_result1.csv'


In [None]:
submission.tail()

Unnamed: 0,id,redemption_status,predicted_class
50221,50221,2.110926e-07,0
50222,50222,1.870277e-07,0
50223,50223,6.174538e-06,0
50224,50224,1.740658e-05,0
50225,50225,5.048993e-07,0
