In [53]:
import numpy as np
import pandas as pd

np.random.seed(42)
n = 5000

# -------- Numerical features --------
age = np.random.randint(18, 70, n)

annual_income = np.random.lognormal(mean=10.5, sigma=0.8, size=n)  # heavy tail

website_visits = np.random.poisson(lam=20, size=n)
time_on_site = website_visits * np.random.uniform(1.5, 4.0, n)

discount_rate = np.clip(np.random.normal(0.15, 0.1, n), 0, 0.6)

past_purchases = np.random.poisson(lam=5, size=n)

# Inject extreme outliers
outlier_idx = np.random.choice(n, size=50, replace=False)
past_purchases[outlier_idx] *= 20
annual_income[outlier_idx] *= 5

# -------- Categorical features --------
region = np.random.choice(
    ["North", "South", "East", "West"],
    size=n,
    p=[0.4, 0.2, 0.25, 0.15]
)

device_type = np.random.choice(
    ["mobile", "desktop", "tablet"],
    size=n,
    p=[0.6, 0.3, 0.1]
)

membership_level = np.random.choice(
    ["basic", "silver", "gold", "platinum"],
    size=n,
    p=[0.5, 0.25, 0.2, 0.05]
)

# -------- Target (Nonlinear + noisy) --------
sales = (
    0.0004 * annual_income +
    12 * np.log1p(past_purchases) +
    2.5 * website_visits +
    8 * np.sqrt(time_on_site) -
    40 * discount_rate +
    np.where(membership_level == "gold", 300, 0) +
    np.where(membership_level == "platinum", 700, 0) +
    np.random.normal(0, 50, n)   # ↓ noise from 300 → 120
)

sales = np.clip(sales, 50, None)

# -------- Create DataFrame --------
df = pd.DataFrame({
    "age": age,
    "annual_income": annual_income,
    "website_visits": website_visits,
    "time_on_site": time_on_site,
    "discount_rate": discount_rate,
    "past_purchases": past_purchases,
    "region": region,
    "device_type": device_type,
    "membership_level": membership_level,
    "monthly_sales": sales
})

# -------- Inject missing values --------
for col in ["annual_income", "time_on_site", "discount_rate"]:
    idx = np.random.choice(n, size=200, replace=False)
    df.loc[idx, col] = np.nan

df.head()


Unnamed: 0,age,annual_income,website_visits,time_on_site,discount_rate,past_purchases,region,device_type,membership_level,monthly_sales
0,56,33262.449496,21,,,5,South,mobile,basic,50.0
1,69,54067.481666,15,28.596987,0.097277,6,North,mobile,gold,418.373893
2,46,26680.636004,26,49.584089,0.17078,6,South,mobile,basic,238.536461
3,32,50836.09851,16,59.634598,0.232317,8,North,desktop,basic,233.064429
4,60,18187.451246,19,32.111618,0.136919,3,North,mobile,gold,396.021915


In [54]:
df.to_csv('customer sales prediction dataset.csv')

In [55]:
df.describe()

Unnamed: 0,age,annual_income,website_visits,time_on_site,discount_rate,past_purchases,monthly_sales
count,5000.0,4800.0,5000.0,4800.0,4800.0,5000.0,5000.0
mean,43.5846,51483.03,20.0112,55.248058,0.152648,5.8116,240.568686
std,14.919094,56555.02,4.45042,19.373937,0.094064,9.330397,190.077331
min,18.0,2068.026,6.0,12.820025,0.0,0.0,50.0
25%,31.0,20969.21,17.0,39.951087,0.081084,3.0,118.313648
50%,43.0,36010.56,20.0,53.321879,0.150222,5.0,168.039596
75%,56.0,62585.09,23.0,68.158697,0.216175,6.0,352.04505
max,69.0,1349756.0,36.0,135.496132,0.518747,200.0,1064.084985


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               5000 non-null   int32  
 1   annual_income     4800 non-null   float64
 2   website_visits    5000 non-null   int32  
 3   time_on_site      4800 non-null   float64
 4   discount_rate     4800 non-null   float64
 5   past_purchases    5000 non-null   int32  
 6   region            5000 non-null   object 
 7   device_type       5000 non-null   object 
 8   membership_level  5000 non-null   object 
 9   monthly_sales     5000 non-null   float64
dtypes: float64(4), int32(3), object(3)
memory usage: 332.2+ KB


In [57]:
df.isnull().sum()

age                   0
annual_income       200
website_visits        0
time_on_site        200
discount_rate       200
past_purchases        0
region                0
device_type           0
membership_level      0
monthly_sales         0
dtype: int64

In [58]:
df = df.dropna()

In [59]:
df

Unnamed: 0,age,annual_income,website_visits,time_on_site,discount_rate,past_purchases,region,device_type,membership_level,monthly_sales
1,69,54067.481666,15,28.596987,0.097277,6,North,mobile,gold,418.373893
2,46,26680.636004,26,49.584089,0.170780,6,South,mobile,basic,238.536461
3,32,50836.098510,16,59.634598,0.232317,8,North,desktop,basic,233.064429
4,60,18187.451246,19,32.111618,0.136919,3,North,mobile,gold,396.021915
5,25,32443.573799,17,59.564106,0.093340,4,North,mobile,basic,85.225357
...,...,...,...,...,...,...,...,...,...,...
4995,24,17942.873043,23,51.097288,0.197511,3,South,mobile,basic,104.730567
4996,66,499378.436005,15,30.503480,0.169493,4,South,tablet,basic,375.010154
4997,26,55939.126830,21,59.650981,0.198077,6,East,desktop,silver,161.211704
4998,53,57182.933590,22,69.902692,0.192180,3,North,mobile,basic,165.352485


In [60]:
df.isnull().sum()

age                 0
annual_income       0
website_visits      0
time_on_site        0
discount_rate       0
past_purchases      0
region              0
device_type         0
membership_level    0
monthly_sales       0
dtype: int64

In [61]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor


In [62]:
X = df.drop("monthly_sales", axis=1)
y = df["monthly_sales"]
y.head()

1    418.373893
2    238.536461
3    233.064429
4    396.021915
5     85.225357
Name: monthly_sales, dtype: float64

In [63]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [64]:
Q1 = y_train.quantile(0.25)
Q3 = y_train.quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

mask = (y_train >= lower) & (y_train <= upper)

X_train = X_train.loc[mask]
y_train = y_train.loc[mask]
X_train.shape

(3378, 9)

In [79]:
num_cols = [
    "age", "annual_income", "website_visits",
    "time_on_site", "discount_rate", "past_purchases"
]

cat_cols = [
    "region", "device_type", "membership_level"
]

num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

model_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(
        n_estimators=5000,          # keep high for stability
        max_depth=40,               # limit tree depth (VERY important)
        min_samples_split=20,       # prevent splits on tiny noisy patterns
        min_samples_leaf=10,        # smooth predictions
        max_features="sqrt",        # decorrelate trees
        bootstrap=True,             # noise averaging
        oob_score=True,             # built-in validation
        random_state=42,
        n_jobs=-1
    ))
])


In [80]:
model_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,5000
,criterion,'squared_error'
,max_depth,40
,min_samples_split,20
,min_samples_leaf,10
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [81]:

y_pred = model_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE :", mae)
print("R²  :", r2)


MAE : 66.88338636879561
R²  : 0.41751089584060574


In [82]:
errors = y_test - y_pred
print("Max error:", abs(errors).max())
print("Mean error:", abs(errors).mean())


Max error: 740.2275427065653
Mean error: 66.88338636879561


In [83]:
import joblib

joblib.dump(model_pipeline, "customer_sales_prediction.joblib")

['customer_sales_prediction.joblib']