In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))

In [3]:
# Pull the target column out of the labelled frame; everything else is a candidate feature
y_train = train_df['Segmentation']
X_train = train_df.drop(columns='Segmentation')

# Work on a copy so later column edits never touch the raw test frame
X_test = test_df.copy()

In [4]:
# Columns routed to the numeric and categorical branches of the preprocessor
numerical_features = ['Age', 'Work_Experience', 'Spending_Score', 'Family_Size']
categorical_features = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Var_1']

# Inspect the raw values of each numerical column before any conversion
print("Numerical columns inspection:")
for col in numerical_features:
    print(f"{col}:")
    print(X_train[col].unique())

# Spending_Score arrives as ordered text labels. The inspection output shows only
# 'Low', 'Average' and 'High', so rank exactly those labels with even spacing.
SPENDING_SCORE_ORDER = {'Low': 1, 'Average': 2, 'High': 3}


def encode_spending_score(scores):
    """Return `scores` with its text labels replaced by their ordinal rank.

    Parameters
    ----------
    scores : pandas.Series
        The Spending_Score column, either still as text labels or already encoded.

    Returns
    -------
    pandas.Series
        Ranks taken from SPENDING_SCORE_ORDER; missing values stay missing so the
        downstream imputer can fill them.

    Raises
    ------
    ValueError
        If the column contains a label that SPENDING_SCORE_ORDER does not define.
    """
    # Already numeric means this cell has been run before; mapping again would
    # turn every value into NaN, so hand the column back unchanged.
    if pd.api.types.is_numeric_dtype(scores):
        return scores

    # Fail here with a readable message rather than later inside the median
    # imputer, which cannot handle leftover strings.
    unexpected = set(scores.dropna().unique()) - set(SPENDING_SCORE_ORDER)
    if unexpected:
        raise ValueError(f"Unexpected Spending_Score labels: {sorted(map(str, unexpected))}")

    return scores.map(SPENDING_SCORE_ORDER)


X_train['Spending_Score'] = encode_spending_score(X_train['Spending_Score'])
X_test['Spending_Score'] = encode_spending_score(X_test['Spending_Score'])

# Numeric branch: fill gaps with the column median, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

Numerical columns inspection:
Age:
[22 38 67 40 56 32 33 61 55 26 19 70 58 41 31 79 49 18 36 35 45 42 83 27
 28 47 29 57 76 25 72 48 74 59 39 51 30 63 52 60 68 86 50 43 80 37 46 69
 78 71 82 23 20 85 21 53 62 75 65 89 66 73 77 87 84 81 88]
Work_Experience:
[ 1. nan  0.  4.  9. 12.  3. 13.  5.  8. 14.  7.  2.  6. 10. 11.]
Spending_Score:
['Low' 'Average' 'High']
Family_Size:
[ 4.  3.  1.  2.  6. nan  5.  8.  7.  9.]


In [5]:
# Full pipeline: shared preprocessing followed by a default random forest
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hold out 20% of the labelled rows for validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion only
model.fit(X_train_split, y_train_split)

# Score the held-out portion
y_val_pred = model.predict(X_val_split)
print("Validation Accuracy: ", accuracy_score(y_val_split, y_val_pred))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred))

# Predict a segment for every unlabelled test row
y_test_pred = model.predict(X_test)

# Pair each prediction with its customer ID and write the table to disk
test_predictions = (
    test_df[['ID']]
    .rename(columns={'ID': 'Customer_ID'})
    .assign(Predicted_Segment=y_test_pred)
)
test_predictions.to_csv('test_predictions.csv', index=False)

Validation Accuracy:  0.48079306071871125
Classification Report:
               precision    recall  f1-score   support

           A       0.36      0.36      0.36       391
           B       0.37      0.35      0.36       369
           C       0.50      0.51      0.50       380
           D       0.64      0.66      0.65       474

    accuracy                           0.48      1614
   macro avg       0.47      0.47      0.47      1614
weighted avg       0.48      0.48      0.48      1614



In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Random-forest hyperparameters to sweep; the `classifier__` prefix addresses the
# 'classifier' step of the pipeline
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over the whole pipeline, scored by accuracy, on all cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_split, y_train_split)

# Pipeline refitted with the winning parameter combination
best_model = grid_search.best_estimator_

# Score the winner on the held-out validation rows
y_val_pred = best_model.predict(X_val_split)
print("Validation Accuracy: ", accuracy_score(y_val_split, y_val_pred))
print("Classification Report:\n", classification_report(y_val_split, y_val_pred))

# Predict the test rows with the tuned pipeline and save them keyed by customer ID
y_test_pred = best_model.predict(X_test)
test_predictions = (
    test_df[['ID']]
    .rename(columns={'ID': 'Customer_ID'})
    .assign(Predicted_Segment=y_test_pred)
)
test_predictions.to_csv('test_prediction.csv', index=False)


Validation Accuracy:  0.5297397769516728
Classification Report:
               precision    recall  f1-score   support

           A       0.44      0.43      0.43       391
           B       0.42      0.33      0.37       369
           C       0.52      0.57      0.54       380
           D       0.66      0.74      0.70       474

    accuracy                           0.53      1614
   macro avg       0.51      0.52      0.51      1614
weighted avg       0.52      0.53      0.52      1614



In [7]:
from sklearn.model_selection import GridSearchCV

# Same sweep as before but with larger forests (100-300 trees)
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over the whole pipeline, scored by accuracy, on all cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_split, y_train_split)

# Report the winning combination and its mean cross-validated accuracy
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)

Best parameters found:  {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 300}
Best accuracy:  0.5409027314923232


In [10]:
# Random forest configured with the values printed by the grid search
best_rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42,
)

# Wrap the tuned forest behind the shared preprocessing step
best_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_rf),
])

# Fit on the training portion and predict the held-out validation rows
best_model.fit(X_train_split, y_train_split)
y_val_pred_best = best_model.predict(X_val_split)

# Report validation performance of the tuned forest
tuned_accuracy = accuracy_score(y_val_split, y_val_pred_best)
tuned_report = classification_report(y_val_split, y_val_pred_best)
print("Tuned RandomForestClassifier Validation Accuracy: ", tuned_accuracy)
print("Tuned RandomForestClassifier Classification Report:\n", tuned_report)

Tuned RandomForestClassifier Validation Accuracy:  0.5322180916976456
Tuned RandomForestClassifier Classification Report:
               precision    recall  f1-score   support

           A       0.45      0.44      0.45       391
           B       0.43      0.33      0.37       369
           C       0.52      0.57      0.54       380
           D       0.65      0.74      0.69       474

    accuracy                           0.53      1614
   macro avg       0.51      0.52      0.51      1614
weighted avg       0.52      0.53      0.52      1614



In [None]:
!pip install xgboost

In [13]:
from sklearn.base import clone
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb


# Integer-encode the segment labels; the XGBoost model below is trained on these
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_split)
y_val_encoded = label_encoder.transform(y_val_split)

# Defining the models to compare.
# The tuned forest is cloned so fitting it here does not refit the very object
# that `best_model` already holds.
models = {
    'TunedRandomForest': clone(best_rf),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}

# Train and evaluate each model
for name, clf in models.items():
    # Each pipeline gets its own unfitted copy of the preprocessor; sharing one
    # instance would refit it in place and silently change earlier pipelines.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', clf)
    ])
    uses_encoded_labels = name == 'XGBoost'
    pipeline.fit(X_train_split, y_train_encoded if uses_encoded_labels else y_train_split)
    y_val_pred = pipeline.predict(X_val_split)

    # Decode predictions back to the original labels if using XGBoost
    if uses_encoded_labels:
        y_val_pred = label_encoder.inverse_transform(y_val_pred)

    print(f"{name} Validation Accuracy: ", accuracy_score(y_val_split, y_val_pred))
    print(f"{name} Classification Report:\n", classification_report(y_val_split, y_val_pred))


TunedRandomForest Validation Accuracy:  0.5322180916976456
TunedRandomForest Classification Report:
               precision    recall  f1-score   support

           A       0.45      0.44      0.45       391
           B       0.43      0.33      0.37       369
           C       0.52      0.57      0.54       380
           D       0.65      0.74      0.69       474

    accuracy                           0.53      1614
   macro avg       0.51      0.52      0.51      1614
weighted avg       0.52      0.53      0.52      1614

GradientBoosting Validation Accuracy:  0.5285006195786865
GradientBoosting Classification Report:
               precision    recall  f1-score   support

           A       0.41      0.41      0.41       391
           B       0.43      0.34      0.38       369
           C       0.55      0.57      0.56       380
           D       0.66      0.73      0.69       474

    accuracy                           0.53      1614
   macro avg       0.51      0.52      

In [17]:
from sklearn.preprocessing import PolynomialFeatures
# Step 1: Identifying categorical and numerical columns
categorical_features = ['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Var_1']  # Update with your categorical columns
# select_dtypes picks up every int64/float64 column, which would include a row
# identifier if X_train carries one (the test frame exposes an 'ID' column).
# An identifier is not a feature, so keep it out of the expansion below.
ID_COLUMNS = ['ID']
numerical_features = [
    col
    for col in X_train.select_dtypes(include=['int64', 'float64']).columns
    if col not in ID_COLUMNS
]

# Step 2: Preprocessing Pipeline
# NOTE: this rebinds `preprocessor`. Pipelines built earlier keep the object they
# were given, but earlier cells re-run after this one will pick up this version.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_features),  # Impute missing values for numerical features
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)  # One-hot encode categorical features
    ])

# Step 3: Transforming the data
X_train_preprocessed = preprocessor.fit_transform(X_train)

# Step 4: Applying Polynomial Features (degree-2 terms, no constant column)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X_train_preprocessed)
