# DATA PREPROCESSING

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
importances = None
from sklearn.linear_model import Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split




# Read the e-commerce dataset from the working directory into `data`,
# the frame every preprocessing and plotting cell below operates on.
data=pd.read_csv('ECommerceDataset.csv')


In [None]:
#Checking for missing values 

In [None]:
# Per-column count of missing entries (displayed as the cell result).
data.isna().sum()

In [None]:
#checking shape and size of dataset 

In [None]:
# (rows, columns) tuple and total number of cells of the loaded frame.
shape, size = data.shape, data.size
for dimension_info in (shape, size):
    print(dimension_info)


In [None]:
#Calculating the missing percentage of data for this data set

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
row_count = len(data)
missing_per_column = data.isnull().sum()
missingvalpercentage = (missing_per_column / row_count) * 100
print(missingvalpercentage)

In [None]:
#To preserve the integrity of the data, we impute the missing values instead of removing the affected rows

In [None]:
# Fill the gaps of each listed column with that column's own median,
# so no rows have to be dropped.
columns_to_impute = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]
for column_name in columns_to_impute:
    column_median = data[column_name].median()
    data[column_name] = data[column_name].fillna(column_median)


In [None]:
#Checking whether the imputation has removed all missing values

In [None]:
# Every imputed column should now report zero missing entries.
remaining_missing = data.isna().sum()
print(remaining_missing)


In [None]:
# Descriptive summary statistics of the data (displayed as the cell result).
data.describe()

In [None]:
#lets have basic information about data 

In [None]:
# Print a concise structural summary of the frame.
data.info()

In [None]:
#Having a look at the first rows of the data

In [None]:
# Show the first rows of the frame (displayed as the cell result).
data.head()

In [None]:
# List the column labels; the encoding step below refers to them by name.
print(data.columns)


### feature encoding

In [None]:
#we will do one hot encoding for nominal features of dataset

In [None]:
# Nominal (unordered) columns are expanded into indicator columns;
# the result lives in `dataencoded` and `data` is left untouched.
nominal_columns = [
    'PreferredLoginDevice',
    'PreferredPaymentMode',
    'Gender',
    'PreferedOrderCat',
    'MaritalStatus',
]
dataencoded = pd.get_dummies(data, columns=nominal_columns)
dataencoded.head()

###### Feature scaling

In [None]:
# Standardise the numeric predictors into a separate frame (`data_scaled`).
#
# The target ('Churn') must keep its 0/1 coding, and a row identifier is not a
# measurement, so both are left out of the scaling. `data` itself is NOT
# overwritten: the exploratory plots below label their axes in original units
# (churn 0/1, hours, counts), which would be wrong on z-scores.
non_feature_columns = {'Churn', 'CustomerID'}  # 'CustomerID' is skipped only if the file has it
numerical_columns = [
    column
    for column in data.select_dtypes(include=['float64', 'int64']).columns
    if column not in non_feature_columns
]

scaler = StandardScaler()
data_scaled = data.copy()
data_scaled[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Preview of the standardised frame.
print(tabulate(data_scaled.head(), headers='keys', tablefmt='pretty'))

# Exploratory Data Analysis

In [None]:
# Count of customers per churn class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=data,
    x='Churn',
    hue='Churn',
    palette='viridis',
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set_title('Churn Distribution')
ax.set_xlabel('Churn')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of the year-over-year order amount hike.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data, y='OrderAmountHikeFromlastYear', ax=ax)
ax.set_title('Distribution of Order Amount Hike from Last Year')
ax.set_ylabel('Order Amount Hike (%)')
plt.show()


In [None]:


# Histogram (20 bins) of the hours customers spend on the app.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['HourSpendOnApp'], bins=20, color='skyblue', edgecolor='black')

# Title and axis labels
ax.set_title('Distribution of Hour Spend on App')
ax.set_xlabel('Hours Spent on App')
ax.set_ylabel('Number of Customers')

plt.show()


In [None]:
# Histogram with a KDE overlay of the number of registered devices.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=data, x='NumberOfDeviceRegistered', kde=True, ax=ax)

# Title and axis labels
ax.set_title('Distribution of Number of Devices Registered')
ax.set_xlabel('Number of Devices Registered')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Satisfaction score spread, one box per churn class.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='Churn', y='SatisfactionScore', ax=ax)

# Title and axis labels
ax.set_title('Satisfaction Score by Churn Status')
ax.set_xlabel('Churn (0 = Retained, 1 = Churned)')
ax.set_ylabel('Satisfaction Score')

plt.show()

In [None]:
# Mean hours on the app, one bar per churn class.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=data, x='Churn', y='HourSpendOnApp', estimator='mean', ax=ax)

# Title and axis labels
ax.set_title('Average Hour Spend on App by Churn Status')
ax.set_xlabel('Churn (0=retained, 1=churned)')
ax.set_ylabel('Average Hour Spend on App')

plt.show()

In [None]:
# Mean number of registered devices, one bar per churn class.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=data, x='Churn', y='NumberOfDeviceRegistered', estimator='mean', ax=ax)

# Title and axis labels
ax.set_title('Average Number of Devices Registered by Churn Status')
ax.set_xlabel('Churn (0=retained, 1=churned)')
ax.set_ylabel('Average Number of Devices Registered')

plt.show()

In [None]:
# Histogram (20 bins) with a KDE overlay of addresses per customer.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=data, x='NumberOfAddress', bins=20, kde=True, ax=ax)

# Title and axis labels
ax.set_title('Distribution of Number of Addresses')
ax.set_xlabel('Number of Addresses')
ax.set_ylabel('Frequency')

plt.show()

In [None]:
# Cashback amount per churn class as bars, drawn without error bars.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=data, x='Churn', y='CashbackAmount', errorbar=None, ax=ax)

# Title and axis labels
ax.set_title('Average Cashback Amount by Churn Status')
ax.set_xlabel('Churn (0 = Retained, 1 = Churned)')
ax.set_ylabel('Average Cashback Amount')

plt.show()

In [None]:
# Satisfaction score spread, one box per city tier.
sns.set(style="whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, x='CityTier', y='SatisfactionScore', ax=ax)

# Title and axis labels
ax.set_title('Distribution of Satisfaction Score by City Tier')
ax.set_xlabel('City Tier')
ax.set_ylabel('Satisfaction Score')

plt.show()

# Feature engineering

In [None]:


# Toy frame used only to demonstrate the engineered features. It has its own
# name so the real dataset held in `data` is not overwritten by example rows.
sample_data = pd.DataFrame({
    'Tenure': [1, 7, 13, 25, 37, 50, 68, 72],  # Example data
    'HourSpendOnApp': [1.5, 2.0, 1.2, 2.5, 3.0, 2.0, 1.8, 2.2],  # Example data
    'OrderCount': [5, 8, 3, 6, 7, 4, 9, 5]  # Example data
})

# Left-closed tenure bins (the labels imply the unit is months).
# With right=False the upper edge of the last bin is excluded, so the final
# edge is 73 rather than 72: a tenure of exactly 72 months is then labelled
# '5-6 years' instead of silently becoming NaN.
bins = [0, 6, 12, 24, 36, 48, 60, 73]
labels = ['0-6 months', '6-12 months', '1-2 years', '2-3 years', '3-4 years', '4-5 years', '5-6 years']

# Tenure bucket for every row
sample_data['TenureCategory'] = pd.cut(sample_data['Tenure'], bins=bins, labels=labels, right=False)

# Interaction term: time on the app multiplied by the number of orders
sample_data['Interaction'] = sample_data['HourSpendOnApp'] * sample_data['OrderCount']

# Hours per order; the +1 in the denominator avoids division by zero
# for customers without any order
sample_data['AvgHoursPerOrder'] = sample_data['HourSpendOnApp'] / (sample_data['OrderCount'] + 1)

# Display the first rows together with the new columns
print(sample_data[['Tenure', 'TenureCategory', 'HourSpendOnApp', 'OrderCount', 'Interaction', 'AvgHoursPerOrder']].head())


# Feature selection


In [None]:
# Reload the dataset so this cell works on the values as stored in the file.
data = pd.read_csv('ECommerceDataset.csv')

# Check the column names and data types
print("Column Names and Data Types:")
print(data.dtypes)

# The correlation analysis needs a numeric target; stop early when it is absent.
if 'Churn' not in data.columns:
    raise ValueError("Column 'Churn' is missing from the dataframe.")
if data['Churn'].dtype not in ['float64', 'int64']:
    # Unparseable entries become NaN rather than raising.
    data['Churn'] = pd.to_numeric(data['Churn'], errors='coerce')

# Left-closed tenure bins (the labels imply the unit is months).
# With right=False the upper edge of the last bin is excluded, so the final
# edge is 73 rather than 72: a tenure of exactly 72 months is then labelled
# '5-6 years' instead of silently becoming NaN.
bins = [0, 6, 12, 24, 36, 48, 60, 73]
labels = ['0-6 months', '6-12 months', '1-2 years', '2-3 years', '3-4 years', '4-5 years', '5-6 years']

# Create tenure category column if not already present
if 'TenureCategory' not in data.columns:
    data['TenureCategory'] = pd.cut(data['Tenure'], bins=bins, labels=labels, right=False)

# Create interaction term if not already present
if 'Interaction' not in data.columns:
    data['Interaction'] = data['HourSpendOnApp'] * data['OrderCount']

# Create average hours per order column if not already present
# (the +1 avoids division by zero when there are no orders)
if 'AvgHoursPerOrder' not in data.columns:
    data['AvgHoursPerOrder'] = data['HourSpendOnApp'] / (data['OrderCount'] + 1)

# Only float64/int64 columns take part in the correlation matrix
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# 'Churn' can drop out of the numeric selection if its dtype is neither
# float64 nor int64 even after the conversion above.
if 'Churn' not in numeric_data.columns:
    print("The 'Churn' column is not present in the numeric columns for correlation.")
else:
    # Compute the correlation matrix
    correlation_matrix = numeric_data.corr()

    # Heatmap of all pairwise correlations
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()

    # Correlation of every numeric feature with the target, strongest positive first
    print("Correlation of features with 'Churn':")
    print(correlation_matrix['Churn'].sort_values(ascending=False))
    



# Feature Scaling


In [None]:
# Columns that get standardised, including the two engineered ones.
numeric_features = [
    'Tenure',
    'HourSpendOnApp',
    'OrderCount',
    'WarehouseToHome',
    'SatisfactionScore',
    'NumberOfDeviceRegistered',
    'NumberOfAddress',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'DaySinceLastOrder',
    'CashbackAmount',
    'Interaction',
    'AvgHoursPerOrder',
]

# Fit the scaler on those columns and write the transformed values back in place.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numeric_features])
data[numeric_features] = standardized_values

# Quick look at the transformed columns
print(data[numeric_features].head())

In [None]:
# Missing-entry count for each of the scaled numeric columns.
missing_by_feature = data[numeric_features].isnull().sum()
print("Missing values in each column:")
print(missing_by_feature)


In [None]:
# Replace remaining gaps in the numeric columns with each column's median.
numeric_block = data[numeric_features]
column_medians = numeric_block.median()
data[numeric_features] = numeric_block.fillna(column_medians)

# Confirm nothing is missing any more
missing_after = data[numeric_features].isnull().sum()
print("Missing values after imputation:")
print(missing_after)

In [None]:
# Preview the first rows of the numeric feature columns after scaling and imputation.
print(data[numeric_features].head())

In [None]:
# Rank features two ways: Random Forest impurity importances and Lasso coefficients.
data = pd.read_csv('ECommerceDataset.csv')

# Target first, so a missing 'Churn' column fails loudly with a KeyError.
y = data['Churn']
X = data.drop(columns=['Churn'])
# A row identifier is not a predictor; left in, it can pick up spurious
# importance. Dropped when the file has such a column, ignored otherwise.
X = X.drop(columns=['CustomerID'], errors='ignore')

# Identify categorical features
categorical_features = ['PreferredLoginDevice', 'PreferredPaymentMode', 'Gender', 'PreferedOrderCat', 'MaritalStatus']

# One-hot encode the categorical columns and pass every other column through
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # Keep the numeric features unchanged
)

# Transform the data; the output column names are reused by both tables below
X_transformed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names_out()

# Fill missing values with the column median
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X_transformed)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_imputed, y)
rf_importances = rf_model.feature_importances_

# Random Forest importances, largest first
rf_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_importances
}).sort_values(by='Importance', ascending=False)

# Plot Random Forest Feature Importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=rf_importance_df)
plt.title('Feature Importance from Random Forest')
plt.show()

# Train Lasso model
# NOTE(review): the inputs are not standardised here, so coefficient sizes
# depend on each feature's units and are not directly comparable - confirm
# whether scaling before this fit is wanted.
lasso = Lasso(alpha=0.1)
lasso.fit(X_imputed, y)
lasso_coefficients = lasso.coef_

# Lasso coefficients, largest first
lasso_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': lasso_coefficients
}).sort_values(by='Coefficient', ascending=False)

# Plot Lasso Feature Coefficients
plt.figure(figsize=(12, 8))
sns.barplot(x='Coefficient', y='Feature', data=lasso_importance_df)
plt.title('Feature Importance from Lasso Regression')
plt.show()


In [None]:
# Build the final feature set, split it, and train the churn classifier.
# Imputation and scaling are fitted on the training rows only, so no
# statistics of the held-out rows leak into the model.

# Load the dataset
data = pd.read_csv('ECommerceDataset.csv')

# Check the column names and data types
print("Column Names and Data Types:")
print(data.dtypes)

# The model needs a numeric target; stop early when it is absent.
if 'Churn' not in data.columns:
    raise ValueError("Column 'Churn' is missing from the dataframe.")
if data['Churn'].dtype not in ['float64', 'int64']:
    # Unparseable entries become NaN rather than raising.
    data['Churn'] = pd.to_numeric(data['Churn'], errors='coerce')

# Left-closed tenure bins (the labels imply the unit is months).
# With right=False the upper edge of the last bin is excluded, so the final
# edge is 73 rather than 72: a tenure of exactly 72 months is then labelled
# '5-6 years' instead of silently becoming NaN.
bins = [0, 6, 12, 24, 36, 48, 60, 73]
labels = ['0-6 months', '6-12 months', '1-2 years', '2-3 years', '3-4 years', '4-5 years', '5-6 years']

# Create tenure category column if not already present
if 'TenureCategory' not in data.columns:
    data['TenureCategory'] = pd.cut(data['Tenure'], bins=bins, labels=labels, right=False)

# Create interaction term if not already present
if 'Interaction' not in data.columns:
    data['Interaction'] = data['HourSpendOnApp'] * data['OrderCount']

# Create average hours per order column if not already present
# (the +1 avoids division by zero when there are no orders)
if 'AvgHoursPerOrder' not in data.columns:
    data['AvgHoursPerOrder'] = data['HourSpendOnApp'] / (data['OrderCount'] + 1)

# Verify column names
print("Available columns in data:")
print(data.columns)

# Verify creation of necessary columns
missing_features = [
    engineered
    for engineered in ("Interaction", "AvgHoursPerOrder")
    if engineered not in data.columns
]
if missing_features:
    print(f"Missing columns: {', '.join(missing_features)}")

# Define selected features
selected_features = [
    'Tenure', 'HourSpendOnApp', 'OrderCount',
    'WarehouseToHome', 'SatisfactionScore', 'NumberOfDeviceRegistered',
    'NumberOfAddress', 'OrderAmountHikeFromlastYear', 'CouponUsed',
    'DaySinceLastOrder', 'Interaction', 'AvgHoursPerOrder'
]

# Keep only the selected features that actually exist in the frame
available_features = [feature for feature in selected_features if feature in data.columns]
print("Features available for final DataFrame:")
print(available_features)

# Prepare the final DataFrame
final_data = data[available_features + ['Churn']]

# Report missing values before handling them
print("Missing values in final data:")
print(final_data.isnull().sum())

# Rows without a label cannot be used for supervised training. Averaging a
# class label (as a mean imputer would) creates values that are not a class.
final_data = final_data.dropna(subset=['Churn'])

# Split the data into features and target
X = final_data.drop('Churn', axis=1)
y = final_data['Churn']

# Split BEFORE fitting any preprocessing
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Mean imputation followed by standardisation; both steps learn their
# statistics from the training rows and are only applied to the test rows.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
feature_pipeline = Pipeline(steps=[('imputer', imputer), ('scaler', scaler)])
X_train = feature_pipeline.fit_transform(X_train_raw)
X_test = feature_pipeline.transform(X_test_raw)

# Verify data after imputation
print("Missing values after imputation:")
print(
    pd.concat([
        pd.DataFrame(X_train, columns=available_features),
        pd.DataFrame(X_test, columns=available_features),
    ]).isnull().sum()
)

# Initialize and train the RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Print feature importances, largest first
importances = clf.feature_importances_
feature_importance = pd.DataFrame({'Feature': available_features, 'Importance': importances})
print("Feature Importances:")
print(feature_importance.sort_values(by='Importance', ascending=False))

In [None]:
!pip install shap lime 

In [None]:
import numpy as np
import pandas as pd
import shap

# Explain the trained classifier `clf` on the held-out rows `X_test`.
explainer = shap.TreeExplainer(clf)

# Compute SHAP values for the test set
raw_shap_values = explainer.shap_values(X_test)

# Depending on the installed shap version the result is either a list with one
# (n_samples, n_features) array per class, or a single array that carries the
# classes on a third axis. Normalise both to the list-per-class form so the
# checks below do not mistake the number of samples for the number of classes.
if isinstance(raw_shap_values, list):
    shap_values = [np.asarray(values) for values in raw_shap_values]
else:
    raw_shap_values = np.asarray(raw_shap_values)
    if raw_shap_values.ndim == 3:
        shap_values = [raw_shap_values[:, :, k] for k in range(raw_shap_values.shape[2])]
    else:
        # A single 2-D array: only one set of attributions was returned.
        shap_values = [raw_shap_values]

# Print detailed shapes to debug
print("Number of classes:", len(shap_values))
for i, values in enumerate(shap_values):
    print(f"SHAP values shape for class {i}: {values.shape}")

print("Test data shape:", X_test.shape)

is_multiclass = len(shap_values) > 2
# Index of the class to explain when not multi-class: 1 for a binary model,
# 0 if only a single attribution array came back.
positive_class = len(shap_values) - 1

# Ensure the dimensions match
if is_multiclass:
    # For multi-class, check each class
    for i, values in enumerate(shap_values):
        print(f"Class {i} SHAP values shape: {values.shape}")
        if values.shape != X_test.shape:
            print(f"Shape mismatch for class {i}:")
            print(f"SHAP values shape: {values.shape}")
            print(f"Test data shape: {X_test.shape}")
        else:
            print(f"Class {i} SHAP values shape matches.")
else:
    if shap_values[positive_class].shape != X_test.shape:
        print(f"Shape mismatch for class {positive_class}:")
        print(f"SHAP values shape: {shap_values[positive_class].shape}")
        print(f"Test data shape: {X_test.shape}")
    else:
        print(f"SHAP values shape matches for class {positive_class}.")

# X_test holds exactly the model's input features, in the order of
# `available_features`, so those are the right column labels.
X_test_df = pd.DataFrame(X_test, columns=available_features)

# Print sample rows of X_test
print("Sample rows of X_test:")
print(X_test_df.head())

# Plot SHAP values; passing the labelled frame puts feature names on the axis
if not is_multiclass:
    shap.summary_plot(shap_values[positive_class], X_test_df)
else:
    # Plot for each class if multi-class classification
    for i, values in enumerate(shap_values):
        print(f"Plotting SHAP values for class {i}")
        shap.summary_plot(values, X_test_df)

