In [None]:
# Import libraries and dataset

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# Pandas display configuration, applied as (option, value) pairs
for option_name, option_value in (
    ('display.max_columns', None),          # no cap on the number of columns shown
    ('display.expand_frame_repr', False),   # do not wrap wide frames over several lines
):
    pd.set_option(option_name, option_value)

# Load the customer churn records into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
csv_path = '/Users/joezhou/Downloads/Mentum Assignment Data/Customer-churn-records.csv'
df = pd.read_csv(csv_path, sep=",")



In [None]:
# Part 2 
# Now that you have access to the dataset, it’s time to start understanding the statistical properties of the data. These activities will all be carried out in Ed Lessons.
# Calculate basic statistical measures such as mean, median, mode, and percentiles for each attribute. 
# Recognise common distributions in the data, such as Gaussian, binomial, Poisson, etc. 
# Plan what visualisations you will include in your report and describe how you will explore and visualise the data.


In [None]:
# Part 2 
# Calculate basic statistical measures such as mean, median, mode, and percentiles for each attribute. 

# Column dtypes and non-null counts (pandas prints this report itself)
df.info()

# Summary statistics of the numeric columns, one row per variable, formatted
# with thousands separators and one decimal place.
# `DataFrame.applymap` is deprecated in recent pandas releases, so the
# formatter is applied element-wise through `Series.map` on each column,
# which behaves the same on older and newer versions.
df.describe().round(1).apply(
    lambda stat_column: stat_column.map(lambda x: f'{x:,.1f}')
).transpose()


In [None]:
# Part 2 

# Identify the mode(s) of every variable and how often each mode occurs

# Maps column name -> list of (mode value, frequency) pairs
mode_freq = {}

for column in df.columns:
    # Tally each distinct value once per column. Doing this once (rather than
    # once per mode) matters for columns where every value is unique: there
    # every value is a mode, and a per-mode tally would be quadratic.
    value_frequencies = df[column].value_counts()

    # A column can have several modes when values tie for the highest count
    mode_freq[column] = [
        (mode, value_frequencies.get(mode, 0))
        for mode in df[column].mode()
    ]

# Print the results
for column, modes in mode_freq.items():
    print(f'Mode(s) of {column}:')
    for mode, frequency in modes:
        print(f'  Value: {mode}, Frequency: {frequency}')


In [None]:
# Part 2 

# Distribution graphs: one panel per variable.
# Some variables are numerical whilst others are categorical, so they are
# listed separately and drawn with different plot types.
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                     'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

# Variables drawn as count plots (one bar per distinct value)
categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Exited', 'Complain', 'Card Type']

# Every variable to plot, numerical first
all_columns = numerical_columns + categorical_columns
total_columns = len(all_columns)

# Grid layout: fixed column count, as many rows as needed to fit every panel
n_cols = 3
n_rows = -(-total_columns // n_cols)  # ceiling division

sns.set(style="whitegrid")
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5))
axes = axes.flatten()  # 1-D view so panels can be paired with variables

# Histogram (with KDE) for numerical variables, count plot for the rest
for ax, col in zip(axes, all_columns):
    is_numerical = col in numerical_columns
    if is_numerical:
        sns.histplot(df[col], kde=True, bins=30, ax=ax)
    else:
        sns.countplot(x=col, data=df, ax=ax)

    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency' if is_numerical else 'Count')

# Drop the panels left over when the grid has more cells than variables
for spare_ax in axes[total_columns:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()


In [None]:
# Part 3
# Generating relevant charts to visualise the distributions of different attributes will be your starting point for this part. 
# Generate histograms, box plots and other relevant charts to visualise data distributions
# Generate a correlation matrix to understand the relationships between different attributes. 
# Identify and address any data quality issues in the dataset. 

In [None]:
# Part 3 

# Variables to compare between churned and retained customers.
# Numerical variables get a box plot per 'Exited' group; categorical
# variables get a count plot coloured by 'Exited'.
numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                     'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

categorical_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember',
                       'Complain', 'Card Type']

all_columns = numerical_columns + categorical_columns
total_columns = len(all_columns)

# Grid layout: four panels per row, rows rounded up to fit every variable
n_cols = 4
n_rows = -(-total_columns // n_cols)  # ceiling division

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5))
axes = axes.flatten()  # 1-D view so panels can be paired with variables

for ax, col in zip(axes, all_columns):
    if col in numerical_columns:
        sns.boxplot(x='Exited', y=col, data=df, ax=ax)
        panel_title, x_label, y_label = f'Box Plot of {col}', 'Exited', col
    else:
        sns.countplot(x=col, hue='Exited', data=df, ax=ax)
        panel_title, x_label, y_label = f'Count Plot of {col}', col, 'Count'

    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Drop the panels left over when the grid has more cells than variables
for spare_ax in axes[total_columns:]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()

In [None]:
# Part 3

# One-hot encode the categorical variables so their relationship to churn can
# be included in the correlation analysis. Every level keeps its own indicator
# column (drop_first=False).
one_hot_source_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'Complain', 'Card Type']
df_encoded = pd.get_dummies(df, columns=one_hot_source_columns, drop_first=False)

# Recast every boolean column of the encoded frame as integers
bool_columns = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({column_name: int for column_name in bool_columns})

# df_encoded.head()
# df_encoded.info()



In [None]:
# Part 3: create heatmap and correlation chart 

# Shortlisted columns for the correlation analysis, assembled in three groups

# Outcome variable used for prediction
outcome_columns = ['Exited']

# Numerical variables taken directly from the source file
source_numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'EstimatedSalary', 'Satisfaction Score', 'Point Earned']

# Indicator variables produced by one-hot encoding the categories
one_hot_columns = ['Geography_NSW', 'Geography_QLD', 'Geography_VIC', 'Gender_Female', 'Gender_Male',
                   'HasCrCard_0', 'HasCrCard_1', 'IsActiveMember_0', 'IsActiveMember_1',
                   'Complain_0', 'Complain_1',
                   'Card Type_DIAMOND', 'Card Type_GOLD', 'Card Type_PLATINUM', 'Card Type_SILVER']

shortlist_columns = outcome_columns + source_numeric_columns + one_hot_columns

# Pairwise correlations between all shortlisted columns
correlation_matrix = df_encoded[shortlist_columns].corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 10))
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    annot_kws={"size": 8},
    cmap='coolwarm',
    cbar_kws={'label': 'Correlation Coefficient'},
    linewidths=2,
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Correlation Heatmap of Numerical Variables')
plt.show()




# Collect every pair of variables whose absolute correlation is at least 0.8.
# Only the lower triangle is scanned, so each pair is listed once and the
# diagonal (a variable with itself) is never considered.
corr_labels = correlation_matrix.columns
high_corr_pairs = [
    (corr_labels[row], corr_labels[col], correlation_matrix.iloc[row, col])
    for row in range(len(corr_labels))
    for col in range(row)
    if abs(correlation_matrix.iloc[row, col]) >= 0.8
]

# Report each pair with its correlation and the squared correlation
for var1, var2, corr in high_corr_pairs:
    print(f"Variables: {var1} and {var2} have a correlation of {corr:.2f} and R-squared of {corr ** 2:.2f}")



In [None]:
# data quality checks, for part 3
# identify, missing values, duplicates, data types, value range and unique values in categorical columns


# Rows that repeat an earlier row across all columns
fully_duplicated = df.duplicated(subset=df.columns)
print(df[fully_duplicated])

# Rows whose primary key (CustomerId) already appeared in an earlier row
repeated_customer_id = df.duplicated(subset='CustomerId')
print(df[repeated_customer_id])

# Note: wrapping these checks in a function, as below, is common practice in a professional context.

def check_data_quality(df):
    """Print a data-quality report for ``df``.

    The report covers, in order: missing values per column, the number of
    fully duplicated rows, the object-typed columns, the min/max and
    IQR-based outlier count of every numeric column, and the number of
    distinct values in every object-typed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        The whole report is written to standard output.
    """
    # Missing values: list only the columns that have at least one
    print("Missing Values:")
    missing_values = df.isnull().sum()
    missing_values = missing_values[missing_values > 0]
    if missing_values.empty:
        print("No missing values found.")
    else:
        print(missing_values)

    print("\n")

    # Duplicates: rows identical to an earlier row across all columns
    print("Duplicate Rows:")
    duplicates = df.duplicated().sum()
    if duplicates > 0:
        print(f"Number of duplicate rows: {duplicates}")
    else:
        print("No duplicate rows found.")

    print("\n")

    # Data types: object columns are listed so they can be reviewed by hand
    print("Data Types:")
    object_columns = df.dtypes[df.dtypes == 'object'].index
    print(f"Object types: {list(object_columns)}")

    # Value ranges and outliers for numeric columns.
    # 'number' selects every numeric dtype (int32, float32, ...), so columns
    # are not skipped just because they are not exactly int64/float64.
    print("\nValue Ranges and Outliers:")
    for column in df.select_dtypes(include='number').columns:
        values = df[column]
        print(f"{column}: Min = {values.min()}, Max = {values.max()}")

        # Outliers via the 1.5 * IQR fences around the quartiles.
        # Caveat: when the IQR is 0 (the two quartiles coincide), every value
        # that differs from the quartiles falls outside the fences and is
        # counted as an outlier.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        outlier_mask = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
        outlier_count = int(outlier_mask.sum())
        if outlier_count:
            print(f"Outliers detected in {column}: {outlier_count} rows")
        else:
            print(f"No outliers detected in {column}")

    print("\n")

    # Distinct values per object-typed column (unique() counts NaN as a value)
    print("Unique Values in Categorical Columns:")
    for column in df.select_dtypes(include=['object']).columns:
        unique_values = df[column].unique()
        print(f"{column}: {len(unique_values)} unique values")

# Print the data-quality report for the loaded dataset
check_data_quality(df)




In [None]:
# Part 3: sample data and bootstrapping

import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

#==================================================================================
# Part 3.1 determining sample size

# Given parameters: confidence level, margin of error, population size
confidence_level, margin_of_error, population_size = 0.85, 0.25, 10000

# Two-sided critical value of the standard normal for the confidence level
Z = stats.norm.ppf(0.5 + confidence_level / 2)

# Proportion assumed for the estimate; 0.5 maximises p * (1 - p)
p = 0.5

# Sample size for a proportion, before the finite population correction
uncorrected_size = (Z**2 * p * (1 - p)) / (margin_of_error**2)

# Finite population correction, then round up to a whole number of rows
corrected_size = uncorrected_size / (1 + (uncorrected_size - 1) / population_size)
sample_size = int(np.ceil(corrected_size))
print(f"Required sample size with population correction: {sample_size}")
#==================================================================================
# Part 3.2 Bootstrapping

# Number of resamples; each resample holds `sample_size` rows drawn with replacement.
# NOTE(review): 9 resamples is very few for a bootstrap (the percentile
# interval and the 30-bin histograms downstream are built from these values)
# — confirm this is the intended number rather than the sample size.
num_bootstrap_samples = 9

# Seeded generator so the same resamples are drawn on every run
bootstrap_rng = np.random.default_rng(42)

# Array of row labels with shape (num_bootstrap_samples, sample_size)
bootstrap_samples = bootstrap_rng.choice(
    df.index.to_numpy(), size=(num_bootstrap_samples, sample_size), replace=True
)

# Mean of 'Exited' within each resample
boot_means = [
    df.loc[sample_indices, 'Exited'].mean()
    for sample_indices in bootstrap_samples
]

# Spread of the resample means, computed once after all resamples are collected
bootstrap_std_of_means = np.std(boot_means)

# Calculate the population mean and standard deviation for comparison
population_mean = df['Exited'].mean()
population_std = df['Exited'].std()

# Display results
boot_mean = np.mean(boot_means)
print(f"Bootstrapped mean: {boot_mean},Bootstrap Standard Deviation of Means: {bootstrap_std_of_means:.2f}")
print(f"Population mean: {population_mean},Population Standard Deviation: {population_std:.2f}")

# Reading the output:
# The bootstrapped mean is expected to sit close to the population mean,
# because the resample means are centred on it.
# The standard deviation of the resample means measures how much a *mean*
# varies between resamples (a standard error), so it is expected to be
# considerably smaller than the population standard deviation, which measures
# how much *individual rows* vary (roughly population_std / sqrt(sample_size)).

#==================================================================================
# Part 3.3 Percentile confidence interval from the bootstrapped means

# Percentiles that cut off equal tails on either side of the interval
lower_percentile = (1-confidence_level)/2*100
upper_percentile = (1+confidence_level)/2*100
conf_interval = np.percentile(boot_means, [lower_percentile, upper_percentile])

ci_percent = int(confidence_level*100)
print(f"{ci_percent}% Confidence interval for 'Exited': {conf_interval}")

#==================================================================================
# Part 3.4 Generate charts for presentation
# Every artist carries its own `label=` so the legend entries cannot be
# mismatched with the plotted elements.

# 1. Histogram of Bootstrapped Means vs. Population Mean
plt.figure(figsize=(14, 7))
plt.hist(boot_means, bins=30, color='skyblue', edgecolor='black', alpha=0.7,
         label='Bootstrapped Means')
plt.axvline(population_mean, color='red', linestyle='dashed', linewidth=2,
            label='Population Mean')
plt.title('Bootstrapped Means vs. Population Mean')
plt.xlabel('Mean of Exited')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# 2. Histogram of Bootstrapped Standard Deviations vs. Population Standard Deviation
# The histogram needs one standard deviation per resample. The single
# standard deviation of the resample means is a scalar and would draw one bar,
# so the per-resample values are computed here from the resample row labels.
boot_stds = [
    df.loc[sample_indices, 'Exited'].std()
    for sample_indices in bootstrap_samples
]
plt.figure(figsize=(14, 7))
plt.hist(boot_stds, bins=30, color='lightgreen', edgecolor='black', alpha=0.7,
         label='Bootstrapped Std Devs')
plt.axvline(population_std, color='orange', linestyle='dashed', linewidth=2,
            label='Population Std Dev')
plt.title('Bootstrapped Standard Deviations vs. Population Standard Deviation')
plt.xlabel('Standard Deviation of Exited')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# 3. Confidence Interval Visualization
plt.figure(figsize=(14, 7))
plt.hist(boot_means, bins=30, color='lightcoral', edgecolor='black', alpha=0.7)
plt.axvline(conf_interval[0], color='blue', linestyle='dashed', linewidth=2,
            label='Lower CI')
plt.axvline(conf_interval[1], color='blue', linestyle='dashed', linewidth=2,
            label='Upper CI')
plt.axvline(population_mean, color='red', linestyle='dashed', linewidth=2,
            label='Population Mean')
plt.title(f'{int(confidence_level*100)}% Confidence Interval for Bootstrapped Means')
plt.xlabel('Mean of Exited')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
# Part 4
# For this final part, formulate hypotheses related to customer churn. For example, ‘customers with a lower balance are more likely to churn.’ 
# Apply statistical significance tests to evaluate these hypotheses, interpret the results of your statistical tests and draw valid conclusions. This will help you gain an understanding of the principles of experimental design.
# Create a plan for a controlled experiment to test one of your hypotheses. Include the experiment design in your PowerPoint report.
# Write a conclusion for your report and include it in the PowerPoint.


In [None]:
# part 4
# Hypothesis testing

# Significance tests of each variable against churn ('Exited')
import scipy.stats as stats


def _report_welch_ttest(data, label, column):
    """Print a Welch t-test of `column` between exited and non-exited rows.

    `data` is the customer DataFrame, `label` is the name printed in front of
    the result, and `column` is the numeric column compared between the rows
    with Exited == 1 and the rows with Exited == 0. Variances are not assumed
    equal (equal_var=False).
    """
    exited = data.loc[data['Exited'] == 1, column]
    not_exited = data.loc[data['Exited'] == 0, column]
    t_stat, p_value = stats.ttest_ind(exited, not_exited, equal_var=False)
    # Significant-digit formatting keeps very small p-values visible instead
    # of rounding them to 0.0
    print(f"{label}:", f"t-statistic: {t_stat:.2f}, p-value: {p_value:.3g}")


def _report_chi_square(data, label, column):
    """Print a chi-square test of independence between `column` and 'Exited'.

    `data` is the customer DataFrame, `label` is the name printed in front of
    the result, and `column` is the categorical column cross-tabulated
    against 'Exited'.
    """
    contingency_table = pd.crosstab(data[column], data['Exited'])
    chi2, p_value, _dof, _expected = stats.chi2_contingency(contingency_table)
    # Significant-digit formatting keeps very small p-values visible instead
    # of rounding them to 0.0
    print(f"{label}:", f"Chi2: {chi2:.2f}, p-value: {p_value:.3g}")


# (printed label, column, test): t-test for numeric variables, chi-square for
# categorical ones; the order is the order of the printed report
hypothesis_tests = (
    ("Age", 'Age', _report_welch_ttest),
    ("Tenure", 'Tenure', _report_welch_ttest),
    ("Geography", 'Geography', _report_chi_square),
    ("Complain", 'Complain', _report_chi_square),
    ("Satisfaction Score", 'Satisfaction Score', _report_welch_ttest),
    ("Credit Score", 'CreditScore', _report_welch_ttest),
    ("Balance", 'Balance', _report_welch_ttest),
    ("Active Member", 'IsActiveMember', _report_chi_square),
)

for test_label, test_column, run_test in hypothesis_tests:
    run_test(df, test_label, test_column)

