In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
customer_data = pd.read_csv('Data\customer_demographics.csv')
customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [3]:
for i in customer_data.columns:
    print(i, customer_data[i].isnull().sum())

customer_id 0
age_range 0
marital_status 329
rented 0
family_size 0
no_of_children 538
income_bracket 0


In [4]:
customer_data["no_of_children"].value_counts()

no_of_children
1     107
3+     60
2      55
Name: count, dtype: int64

In [5]:
def get_marital_status(row):
    na_row = row.isna()    #checks for missing values (NaNs) in a row of a DataFrame
    if not na_row['marital_status']:
        return row['marital_status']
    return 'Married' if row['family_size'] - row['no_of_children'] > 1 else 'Single'

customer_data['family_size'] = customer_data['family_size'].str.replace('+','').astype('int')
customer_data['no_of_children'] = customer_data['no_of_children'].fillna('0').str.replace('+','').astype('int')
customer_data['marital_status'] = customer_data.apply(get_marital_status, axis=1)
customer_data['marital_status'] = customer_data['marital_status'].replace({'Single': 0, 'Married': 1})

customer_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,1,0,2,0,4
1,6,46-55,1,0,2,0,5
2,7,26-35,1,0,3,1,3
3,8,26-35,1,0,4,2,6
4,10,46-55,0,0,1,0,5


In [6]:
train_data = pd.read_csv(r"C:\Users\sahil\OneDrive\Pictures\Documents\OneDrive\Desktop\Project\train1.csv")
train_data.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


In [7]:
merged_df = pd.merge(customer_data,train_data,on="customer_id",how="inner")

In [8]:
merged_df

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,id,campaign_id,coupon_id,redemption_status
0,1,70+,1,0,2,0,4,4674,29,597,0
1,1,70+,1,0,2,0,4,5085,8,424,0
2,1,70+,1,0,2,0,4,5380,8,103,0
3,1,70+,1,0,2,0,4,6741,29,896,0
4,1,70+,1,0,2,0,4,9849,12,705,0
...,...,...,...,...,...,...,...,...,...,...,...
43656,1581,26-35,1,0,3,1,1,89281,8,256,0
43657,1581,26-35,1,0,3,1,1,99526,8,447,0
43658,1581,26-35,1,0,3,1,1,114949,8,93,0
43659,1581,26-35,1,0,3,1,1,117173,8,1033,0


In [9]:
merged_df.drop(['id','campaign_id','coupon_id'],axis=1,inplace=True)

In [10]:
merged_df

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,redemption_status
0,1,70+,1,0,2,0,4,0
1,1,70+,1,0,2,0,4,0
2,1,70+,1,0,2,0,4,0
3,1,70+,1,0,2,0,4,0
4,1,70+,1,0,2,0,4,0
...,...,...,...,...,...,...,...,...
43656,1581,26-35,1,0,3,1,1,0
43657,1581,26-35,1,0,3,1,1,0
43658,1581,26-35,1,0,3,1,1,0
43659,1581,26-35,1,0,3,1,1,0


In [11]:
#redemption_count =merged_df.groupby('customer_id')['redemption_status'].sum().reset_index(name="redemption_count")

In [12]:
#merged_df =merged_df.merge(redemption_count,on='customer_id',how='left')
merged_df

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,redemption_status
0,1,70+,1,0,2,0,4,0
1,1,70+,1,0,2,0,4,0
2,1,70+,1,0,2,0,4,0
3,1,70+,1,0,2,0,4,0
4,1,70+,1,0,2,0,4,0
...,...,...,...,...,...,...,...,...
43656,1581,26-35,1,0,3,1,1,0
43657,1581,26-35,1,0,3,1,1,0
43658,1581,26-35,1,0,3,1,1,0
43659,1581,26-35,1,0,3,1,1,0


In [13]:
merged_df['age_range'].unique()




array(['70+', '46-55', '26-35', '36-45', '56-70', '18-25'], dtype=object)

In [14]:
age_order = ['70+', '46-55', '26-35', '36-45', '56-70', '18-25']
merged_df['age_encoded'] = merged_df['age_range'].map({age: i for i, age in enumerate(age_order)})
merged_df

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,redemption_status,age_encoded
0,1,70+,1,0,2,0,4,0,0
1,1,70+,1,0,2,0,4,0,0
2,1,70+,1,0,2,0,4,0,0
3,1,70+,1,0,2,0,4,0,0
4,1,70+,1,0,2,0,4,0,0
...,...,...,...,...,...,...,...,...,...
43656,1581,26-35,1,0,3,1,1,0,2
43657,1581,26-35,1,0,3,1,1,0,2
43658,1581,26-35,1,0,3,1,1,0,2
43659,1581,26-35,1,0,3,1,1,0,2


In [15]:
merged_df.drop(['age_range'],axis=1,inplace= True)

In [16]:
merged_df['redemption_status'].value_counts()

redemption_status
0    43093
1      568
Name: count, dtype: int64

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [18]:
X = merged_df.drop('redemption_status', axis=1)
y = merged_df['redemption_status']


In [19]:

from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix



# Step 2: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Step 3: Initialize and train the Random Forest with class weighting
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# Step 4: Predict and evaluate
y_pred = clf.predict(X_test)

# Step 5: Output metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.82      0.90     12929
           1       0.05      0.64      0.08       170

    accuracy                           0.82     13099
   macro avg       0.52      0.73      0.49     13099
weighted avg       0.98      0.82      0.89     13099

Confusion Matrix:
[[10662  2267]
 [   62   108]]


In [20]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5,10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced']  # to handle class imbalance
}

# Step 4: Initialize the model
rf = RandomForestClassifier(random_state=42)

# Step 5: Grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, scoring='f1', n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

# Step 6: Evaluate best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best Parameters Found:")
print(grid_search.best_params_)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters Found:
{'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.83      0.90     12929
           1       0.05      0.62      0.08       170

    accuracy                           0.83     13099
   macro avg       0.52      0.73      0.49     13099
weighted avg       0.98      0.83      0.89     13099

