In [1]:

import pandas as pd

In [2]:
# a
df=pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')

In [3]:
# Display the first few rows of the dataset
df.head(), df.info(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   customer_id         10000 non-null  int64  
 1   join_date           10000 non-null  object 
 2   last_purchase_date  10000 non-null  object 
 3   age                 8991 non-null   float64
 4   gender              9467 non-null   object 
 5   location            10000 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 468.9+ KB


(   customer_id   join_date last_purchase_date   age  gender           location
 0            1  2023-11-20         2024-03-17  56.0  Female  North Shannonbury
 1            2  2021-09-08         2023-10-25   NaN    Male          Hillville
 2            3  2021-06-01         2022-11-27   NaN     NaN   North Latoyatown
 3            4  2022-01-01         2022-09-01  29.0    Male          Grossstad
 4            5  2022-01-24         2023-06-02   NaN    Male   East Matthewfort,
 None,
        customer_id          age
 count  10000.00000  8991.000000
 mean    5000.50000    43.467467
 std     2886.89568    15.094380
 min        1.00000    18.000000
 25%     2500.75000    30.000000
 50%     5000.50000    44.000000
 75%     7500.25000    57.000000
 max    10000.00000    69.000000)

In [4]:
from datetime import datetime

In [5]:
# Convert date columns to datetime format
df['join_date'] = pd.to_datetime(df['join_date'])
df['last_purchase_date'] = pd.to_datetime(df['last_purchase_date'])

In [6]:
# Fill missing values in 'age' with the median age
median_age = df['age'].median()
df['age'].fillna(median_age, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(median_age, inplace=True)


In [7]:
# Fill missing values in 'gender' with a placeholder 'Unknown'
df['gender'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gender'].fillna('Unknown', inplace=True)


In [8]:
# Calculate tenure and recency
current_date = datetime.now()
df['tenure'] = (df['last_purchase_date'] - df['join_date']).dt.days
df['recency'] = (current_date - df['last_purchase_date']).dt.days

In [9]:
# Display the first few rows of the cleaned dataset
df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,tenure,recency
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,118,118
1,2,2021-09-08,2023-10-25,44.0,Male,Hillville,777,262
2,3,2021-06-01,2022-11-27,44.0,Unknown,North Latoyatown,544,594
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,243,681
4,5,2022-01-24,2023-06-02,44.0,Male,East Matthewfort,494,407


In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [11]:
# OneHotEncode 'Gender' and 'Location'
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(df[['gender', 'location']])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['gender', 'location']))

In [12]:
# Normalize/scale 'Age', 'Tenure', and 'Recency'
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['age', 'tenure', 'recency']])
scaled_df = pd.DataFrame(scaled_features, columns=['age_scaled', 'tenure_scaled', 'recency_scaled'])

In [13]:
# Combine all features into a final DataFrame
final_df = pd.concat([df[['customer_id']], scaled_df, encoded_df], axis=1)

In [14]:
# Mock a CLV target variable for demonstration purposes
np.random.seed(42)
df['CLV'] = np.random.rand(len(df)) * 1000  # Random CLV values

In [15]:
#Prepare the final dataset with engineered features
final_df = pd.concat([df[['customer_id', 'CLV']], scaled_df, encoded_df], axis=1)

In [16]:
# Split the data into training and testing sets
X = final_df.drop(columns=['customer_id', 'CLV'])
y = final_df['CLV']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
pip install xgboost




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\rsand\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [19]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost Regressor': xgb.XGBRegressor(random_state=42)
}

In [20]:
# Train and evaluate models
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

print(results)