# Step 1: Install Libraries and Import Dependencies


To start, install the necessary Python libraries:



In [None]:
# Notebook shell escape: installs the third-party packages for this walkthrough.
# NOTE(review): statsmodels, transformers and dask are not imported in the cells
# shown here — confirm they are still required before keeping them in the list.
!pip install pandas numpy seaborn matplotlib scikit-learn statsmodels keras pmdarima catboost lightgbm xgboost transformers dask[dataframe] imbalanced-learn


# Import all the required libraries:

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from pmdarima import auto_arima
from keras.models import Sequential
from keras.layers import LSTM, Dense
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")


# Step 2: Load the Datasets
Specify the paths and load your datasets.



In [None]:
# Locations of the two source CSV files (machine-specific absolute paths)
large_data_path = r"C:\Users\sande\Music\BI\Data_Set\ecommerce_customer_data_large.csv"
custom_ratios_path = r"C:\Users\sande\Music\BI\Data_Set\ecommerce_customer_data_custom_ratios.csv"

# Read both files with the default CSV parser, in the same order as the paths above
df_large, df_custom = (
    pd.read_csv(csv_path) for csv_path in (large_data_path, custom_ratios_path)
)


# Step 3: Concatenate the Datasets
Stack the two datasets row-wise into a single DataFrame and rebuild the row index.



In [None]:
# Stack both frames row-wise; ignore_index discards the per-file row labels
# and numbers the combined rows 0..n-1 in a single step.
df_combined = pd.concat([df_large, df_custom], ignore_index=True)


# Step 4: Exploratory Data Analysis (EDA)
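Explore the combined dataset to understand its structure and contents: report missing values and summary statistics, impute the gaps in `Returns` and `Age`, and visualize the purchase-amount distribution and the feature correlations: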



In [None]:
# Report how many values are missing in each column (before any imputation)
print("Missing values in combined dataset:\n", df_combined.isnull().sum())

# Summary statistics of numerical columns
print(df_combined.describe())

# Impute missing values in 'Returns' (with 0) and 'Age' (with the column mean).
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on a column selection: that chained form acts on an
# intermediate object, is deprecated, and does not modify df_combined when
# pandas Copy-on-Write is active, so the gaps would silently remain.
df_combined['Returns'] = df_combined['Returns'].fillna(0)
df_combined['Age'] = df_combined['Age'].fillna(df_combined['Age'].mean())

# Visualize the distribution of purchase amounts
sns.histplot(df_combined['Total Purchase Amount'], bins=20)
plt.title("Total Purchase Amount Distribution")
plt.show()

# Check the correlation between numerical features
numeric_df = df_combined.select_dtypes(include=[np.number])  # Only numeric columns
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


# Step 5: Data Preprocessing
Drop irrelevant columns, handle dates, and prepare data for modeling.



In [None]:
# Remove identifier columns; errors='ignore' skips any that are not present
columns_to_drop = ['Customer Name', 'Customer ID']
df_combined = df_combined.drop(columns=columns_to_drop, errors='ignore')

# Parse 'Purchase Date' and promote it to the index when the column exists
if 'Purchase Date' in df_combined.columns:
    df_combined['Purchase Date'] = pd.to_datetime(df_combined['Purchase Date'])
    df_combined = df_combined.set_index('Purchase Date')

# Show which columns remain at this point
print("Columns after preprocessing:", df_combined.columns.tolist())

# One-hot encode whichever of the categorical columns are present,
# dropping the first level of each
categorical_columns = ['Product Category', 'Payment Method', 'Gender']
available_columns = set(df_combined.columns)
existing_categorical_columns = [
    col for col in categorical_columns if col in available_columns
]
df_combined = pd.get_dummies(
    df_combined,
    columns=existing_categorical_columns,
    drop_first=True,
)


# Step 6: Split Features and Target
Define the target variable, split the data into training and test sets, and standardize the numeric features.



In [None]:
# Separate the 'Churn' target from the predictor columns
y = df_combined['Churn']
X = df_combined.drop(columns='Churn')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the numeric columns: the scaler is fitted on the training rows
# only and then applied to both splits.
numeric_cols = ['Product Price', 'Quantity', 'Total Purchase Amount', 'Age']
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


# Step 7: Handle Class Imbalance using SMOTE
Balance the training set using SMOTE to handle imbalanced classes; the test set is left untouched.



In [None]:
# Oversample with SMOTE (seeded) using only the scaled training split
smote = SMOTE(random_state=42)
resampled_training = smote.fit_resample(X_train_scaled, y_train)
X_train_balanced, y_train_balanced = resampled_training


# Step 8: Train and Evaluate Models
Train multiple models and evaluate their performance.



In [None]:
# Candidate classifiers, keyed by the display name used when reporting results.
# Every estimator is seeded with the same value (42) used for the train/test
# split and for SMOTE, so that repeated runs of the notebook give the same
# model comparison.
models = {
    "LogisticRegression": LogisticRegression(
        class_weight='balanced', max_iter=300, solver='liblinear', random_state=42
    ),
    "RandomForest": RandomForestClassifier(class_weight='balanced', random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(verbose=False, random_seed=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42),
}

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    """Print and return hold-out metrics for a fitted classifier.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels matching ``X_test``.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are weighted averages over the classes.
    """
    y_pred = model.predict(X_test)

    # Use the same zero_division policy for every averaged metric and for the
    # report, so a class with no predicted or no true samples scores 0
    # consistently instead of depending on each function's default.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"\nModel Evaluation:\nAccuracy: {accuracy:.4f}\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1 Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

    # Returned so callers can collect and compare models; existing callers that
    # ignore the return value are unaffected.
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Fit each candidate on the balanced training data, then score it on the
# scaled (unbalanced) test split
for model_name in models:
    estimator = models[model_name]
    estimator.fit(X_train_balanced, y_train_balanced)
    print(f"{model_name} trained successfully.")
    evaluate_model(estimator, X_test_scaled, y_test)


# Step 9: Time Series Forecasting (ARIMA & LSTM)
You can forecast future values using ARIMA and LSTM models:




# ARIMA


In [None]:
# Time series aggregation by month: total purchase amount per calendar month.
# An explicit MonthEnd offset is used instead of the 'M' string alias, which
# newer pandas releases deprecate in favour of 'ME'; the offset object is
# accepted by old and new versions alike.
df_time_series = (
    df_combined['Total Purchase Amount']
    .resample(pd.offsets.MonthEnd())
    .sum()
    .dropna()
)

# ARIMA Model
# m=12 tells auto_arima that one season spans 12 monthly observations. With the
# default m=1, pmdarima treats the series as non-seasonal and seasonal=True has
# no effect.
model_arima = auto_arima(
    df_time_series,
    seasonal=True,
    m=12,
    stepwise=True,
    suppress_warnings=True,
)
forecast_arima = model_arima.predict(n_periods=12)  # Forecast for the next 12 months
print("ARIMA Forecast:", forecast_arima)


# LSTM

In [None]:
# Prepare data for LSTM
def prepare_lstm_data(data, time_step=30):
    """Slice a sequence into sliding input windows and next-step targets.

    Args:
        data: pandas Series/DataFrame, NumPy array or plain sequence of
            observations ordered in time.
        time_step: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X[k]`` holds observations
        ``k .. k + time_step - 1`` and ``y[k]`` is observation
        ``k + time_step``. When ``data`` has ``time_step`` or fewer
        observations, ``X`` is empty with shape ``(0, time_step)`` and ``y`` is
        empty, so downstream reshapes still see a 2-D array.

    Raises:
        ValueError: If ``time_step`` is smaller than 1.
    """
    if time_step < 1:
        raise ValueError("time_step must be a positive integer")

    # pandas objects expose their data through .values; anything else is
    # converted directly.
    values = np.asarray(getattr(data, "values", data))

    # Row k of the index matrix is [k, k+1, ..., k+time_step-1]; fancy indexing
    # with it builds every window in one step and keeps the 2-D shape even when
    # there are zero windows.
    n_samples = max(len(values) - time_step, 0)
    window_index = np.arange(n_samples)[:, None] + np.arange(time_step)[None, :]
    X = values[window_index]
    y = values[time_step:].copy()
    return X, y

# Build the windowed samples from the monthly series and append a trailing
# axis of size 1 so each sample is (time steps, 1)
X_lstm, y_lstm = prepare_lstm_data(df_time_series)
X_lstm = X_lstm.reshape((X_lstm.shape[0], X_lstm.shape[1], 1))

# Two stacked 50-unit LSTM layers followed by a single-output dense layer
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_lstm.shape[1], 1)),
    LSTM(50),
    Dense(1),
])
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Fit on the windowed samples
lstm_model.fit(X_lstm, y_lstm, epochs=50, batch_size=32)

# Predict the value following the last 30 observations of the series
last_window = df_time_series.values[-30:]
lstm_input = last_window.reshape((1, 30, 1))
lstm_prediction = lstm_model.predict(lstm_input)
print("LSTM Forecast:", lstm_prediction)


# Step 10: Compare ARIMA and LSTM Forecasts
Visualize and compare the forecasts from ARIMA and LSTM:



In [None]:
# Plotting ARIMA and LSTM forecasts
plt.figure(figsize=(14, 7))
plt.plot(df_time_series, label='Historical Data', color='blue')

# Place the forecasts on the month ends that follow the last historical point,
# rather than on a hard-coded start date that goes stale when the data changes.
month_end = pd.offsets.MonthEnd()
forecast_start = df_time_series.index[-1] + month_end

arima_values = np.asarray(forecast_arima)
arima_dates = pd.date_range(start=forecast_start, periods=len(arima_values), freq=month_end)
plt.plot(arima_dates, arima_values, label='ARIMA Forecast', color='orange')

lstm_values = lstm_prediction.flatten()
lstm_dates = pd.date_range(start=forecast_start, periods=len(lstm_values), freq=month_end)
# A one-point series has no line segment to draw, so a marker is needed for
# the LSTM forecast to be visible on the chart.
plt.plot(lstm_dates, lstm_values, label='LSTM Forecast', color='green', marker='o')

plt.title('Forecast Comparison')
plt.xlabel('Date')
plt.ylabel('Total Purchase Amount')
plt.legend()
plt.show()
