In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Raw demographics load. The file is referenced by the same working-directory
# relative name that clean_df_final_demo() reads, instead of an absolute
# /Users/... path that only exists on one machine.
# NOTE(review): df_1 is reassigned with the cleaned frame a few cells below, so
# this raw load is only useful for a quick look at the unprocessed data.
df_1 = pd.read_csv("df_final_demo.txt")

In [None]:
def clean_df_final_demo(path="df_final_demo.txt"):
    """Load the client demographics file and return a cleaned copy.

    Cleaning steps, in order:

    1. read the CSV at ``path``;
    2. drop every row that contains at least one missing value;
    3. rename ``gendr`` -> ``gender`` and ``bal`` -> ``balance``;
    4. drop rows whose gender code is ``'X'``;
    5. replace the remaining gender codes with readable labels
       (``M`` -> ``Male``, ``F`` -> ``Female``, ``U`` -> ``Unknown``); any code
       outside that mapping becomes NaN, as ``Series.map`` does for unmapped keys.

    Key fields used by the analysis: client_id, clnt_age, gender, num_accts,
    balance, clnt_tenure_yr.

    Parameters
    ----------
    path : str or path-like, default "df_final_demo.txt"
        Location of the demographics CSV. The default keeps the original
        behaviour of reading the file from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The cleaned demographics frame (original row index labels are kept).
    """
    corrected_columns = {"gendr": "gender", "bal": "balance"}
    gender_mapping = {
        "M": "Male",
        "F": "Female",
        "U": "Unknown",
    }

    demo = (
        pd.read_csv(path)
        .dropna()
        .rename(columns=corrected_columns)
    )

    # .copy() makes the filtered frame independent, so the column assignment
    # below cannot hit pandas' chained-assignment (SettingWithCopy) path.
    demo = demo[demo["gender"] != "X"].copy()
    demo["gender"] = demo["gender"].map(gender_mapping)
    return demo

# Rebind df_1 to the cleaned demographics frame returned by clean_df_final_demo().
df_1 = clean_df_final_demo()

In [None]:
def clean_df_experiment_clients(path="df_final_experiment_clients.txt"):
    """Load the experiment-assignment file and return a cleaned copy.

    Rows containing any missing value are dropped and the ``Variation`` column
    is renamed to lower-case ``variation``.

    Parameters
    ----------
    path : str or path-like, default "df_final_experiment_clients.txt"
        Location of the experiment-clients CSV. The default keeps the original
        behaviour of reading the file from the current working directory.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame (original row index labels are kept).
    """
    return (
        pd.read_csv(path)
        .dropna()  # drop rows with NaN values
        .rename(columns={"Variation": "variation"})
    )

# Cleaned experiment-assignment frame returned by clean_df_experiment_clients().
df_3 = clean_df_experiment_clients()

In [None]:
# Join demographics with the experiment assignments on client_id; the inner
# join keeps only clients that appear in both frames.
df_merged = df_1.merge(df_3, on='client_id', how='inner')

# Only the last expression of a cell is displayed, so the dtypes are printed
# explicitly (as a bare mid-cell expression they were silently discarded).
print(df_merged.dtypes)

# Number of distinct clients in the cleaned demographics frame.
df_1["client_id"].nunique()

In [None]:
# -------------------------------------
# Demographics analysis
# -------------------------------------
# #####################################
# age analysis: frequency table of client ages plus a 20-bin histogram with a
# KDE overlay
# #####################################
print(df_1['clnt_age'].value_counts())

fig, ax = plt.subplots()
sns.histplot(df_1['clnt_age'], bins=20, kde=True, ax=ax)
ax.set_xlabel('Client age')
ax.set_ylabel('Number of clients')

# Save only after both axis labels are set; saving before the y label was
# applied left it out of the exported PNG.
fig.savefig("client_age.png", dpi=300)
plt.show()

## Gender analysis

In [None]:
# overview
# #####################################
# Mean and median of age, tenure and balance per gender label. The table is
# stored in primary_clients; this cell does not display it.
primary_clients = df_1.groupby('gender').agg({
    'clnt_age': ['mean', 'median'],
    'clnt_tenure_yr': ['mean', 'median'],
    'balance': ['mean', 'median']
})

# Three colours sampled from seaborn's "coolwarm" palette.
palette = sns.color_palette("coolwarm", n_colors=3)

# Fixed colour per gender label (labels assumed to be "Male", "Female",
# "Unknown"; anything else falls back to grey below).
gender_colors = {
    'Male': palette[0],     # first colour of the sampled palette
    'Female': palette[2],   # last colour of the sampled palette
    'Unknown': palette[1]   # middle colour
}

# Count once and derive both the wedges and their colours from the same Series.
# The colours used to be ordered by df_merged's value_counts() index while the
# wedges came from df_1, so a different category order in the two frames would
# paint wedges with the wrong gender's colour.
gender_counts = df_1['gender'].value_counts()
wedge_colors = [gender_colors.get(gender, 'grey') for gender in gender_counts.index]

# Plot the gender distribution pie chart
fig, ax = plt.subplots()
gender_counts.plot(
    kind='pie',
    autopct=lambda pct: f'{pct:.1f}%',  # percentage with one decimal
    colors=wedge_colors,
    startangle=90,
    ax=ax
)

ax.set_title('Gender Distribution')
ax.set_ylabel('')  # suppress the default ylabel pandas adds to pie charts
fig.tight_layout()  # avoid clipped labels
fig.savefig("gender_distribution_coolwarm.png", dpi=300)
plt.show()


### Tenure Distribution

In [None]:
# Tenure overview: print the rounded overall means of tenure and balance, then
# draw the distribution of tenure in years.
# #####################################
for label, column in (("tenure_mean:", 'clnt_tenure_yr'), ("balance_mean:", 'balance')):
    print(label, df_1[column].mean().round(2))

# #####################################
# 15-bin histogram with a KDE overlay, drawn on the current axes.
tenure_ax = sns.histplot(df_1['clnt_tenure_yr'], bins=15, kde=True)
tenure_ax.set_title('Tenure distribution')
tenure_ax.set_xlabel('Years of tenure')
tenure_ax.set_ylabel('Frequency')
plt.savefig("client_tenure.png", format='png', dpi=300)

plt.show()

### Age groups sorted by tenure and balance

In [None]:
# Define age bins and categorize ages.
# right=False makes every bin closed on the left and open on the right:
# [0, 30), [30, 40), [40, 50), [50, inf). That is what the labels say; with the
# default right-closed bins a 30-year-old was labelled "Under 30", a 40-year-old
# "30-39" and a 50-year-old "40-49". The last edge is infinite so that
# "50 and above" has no upper cap.
bins = [0, 30, 40, 50, float("inf")]  # Age interval edges
labels = ['Under 30', '30-39', '40-49', '50 and above']
df_1['age_group'] = pd.cut(df_1['clnt_age'], bins=bins, labels=labels, right=False)

# Group by gender and age group, aggregating both tenure and balance.
# age_group is categorical; observed=False is spelled out so every
# gender x age-group combination stays in the table regardless of the pandas
# version's default (empty combinations get NaN means).
client_age_tenure_balance = df_1.groupby(['gender', 'age_group'], observed=False).agg({
    'clnt_tenure_yr': 'mean',
    'balance': 'mean'
}).reset_index().round(2)

# Row with the maximum average tenure (idxmax skips NaN rows)
highest_tenure_group = client_age_tenure_balance.loc[client_age_tenure_balance['clnt_tenure_yr'].idxmax()]

# Row with the maximum average balance
highest_balance_group = client_age_tenure_balance.loc[client_age_tenure_balance['balance'].idxmax()]

# Report both groups: the original message announced "tenure and balance" but
# only the tenure row was printed and highest_balance_group was never shown.
print("Group with highest average tenure:")
print(highest_tenure_group)
print("Group with highest average balance:")
print(highest_balance_group)

# Display the aggregated DataFrame
client_age_tenure_balance

### Potential persona

In [None]:
# Overall client averages used as the reference point for the persona.
# They are computed here because no cell above defines average_tenure or
# average_balance, so this cell raised NameError on a fresh kernel. Rounded to
# two decimals, as the tenure cell prints them.
average_tenure = df_1['clnt_tenure_yr'].mean().round(2)
average_balance = df_1['balance'].mean().round(2)

# Distance of every gender / age-group row from the overall averages.
# NOTE(review): tenure and balance differences are summed unscaled; if balances
# are orders of magnitude larger than tenures the balance term decides the
# match on its own - confirm this is intended or normalise both terms.
distance_from_avg = (
    (client_age_tenure_balance['clnt_tenure_yr'] - average_tenure).abs()
    + (client_age_tenure_balance['balance'] - average_balance).abs()
)

# Find the closest match to the overall average persona in terms of gender and
# age group. idxmin() returns an index *label*, so the row is fetched with the
# label-based .loc (consistent with the idxmax lookups) instead of the
# position-based .iloc.
closest_to_avg = client_age_tenure_balance.loc[distance_from_avg.idxmin()]

# Create the "average persona" based on these averages
average_persona = {
    'average_tenure': average_tenure,
    'average_balance': average_balance,
    'gender': closest_to_avg['gender'],
    'age_group': closest_to_avg['age_group']
}

print("Potential Persona:")
print(f"Gender: {average_persona['gender']}")
print(f"Age Group: {average_persona['age_group']}")
print(f"Average Tenure: {average_persona['average_tenure']} years")
print(f"Average Balance: ${average_persona['average_balance']:,.2f}")

### Line charts: average client tenure and balance by age group and gender

In [None]:
# Two stacked panels sharing the age-group x-axis: average tenure on top,
# average balance below, one line per gender.
fig, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)
tenure_ax, balance_ax = axes

# Line Plot for Tenure
sns.lineplot(
    data=client_age_tenure_balance,
    x='age_group',
    y='clnt_tenure_yr',
    hue='gender',
    marker='o',
    ax=tenure_ax,
    palette='coolwarm'
)
tenure_ax.set_title("Average Tenure by Age Group and Gender")
tenure_ax.set_ylabel("Average Tenure (Years)")
tenure_ax.set_xlabel("")  # x label is shown once, on the bottom panel
tenure_ax.legend(title="Gender")

# Line Plot for Balance
sns.lineplot(
    data=client_age_tenure_balance,
    x='age_group',
    y='balance',
    hue='gender',
    marker='o',
    linestyle='--',
    ax=balance_ax,
    palette='coolwarm'
)
balance_ax.set_title("Average Balance by Age Group and Gender")
balance_ax.set_ylabel("Average Balance ($)")
balance_ax.set_xlabel("Age Group")
balance_ax.legend(title="Gender")

fig.tight_layout()

# Saved under its own file name: "average_accounts.png" is also the target of
# the number-of-accounts plot further down, which overwrote this figure.
fig.savefig("average_tenure_balance.png")

plt.show()


In [None]:
# Average number of accounts, calls (last 6 months) and logons (last 6 months)
# for every gender / age-group combination, rounded to two decimals.
# Relies on the age_group column having been added to df_1 beforehand.
engagement_columns = ['num_accts', 'calls_6_mnth', 'logons_6_mnth']
engagement_means = df_1.groupby(['gender', 'age_group'])[engagement_columns].mean()
logs_calls_accounts = engagement_means.reset_index().round(2)

# Display the aggregated DataFrame
logs_calls_accounts

In [None]:
def _plot_age_group_metric(data, metric, title, ylabel, filename, linestyle=None):
    """Draw, save and show one "metric by age group, one line per gender" chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Aggregated frame with 'age_group', 'gender' and the ``metric`` column.
    metric : str
        Name of the column plotted on the y-axis.
    title : str
        Figure title.
    ylabel : str
        Y-axis label.
    filename : str
        Path the figure is saved to.
    linestyle : str or None, default None
        Matplotlib line style; None leaves seaborn's default untouched.
    """
    plt.figure(figsize=(10, 6))

    # Only forward linestyle when one was requested, so the default-styled plot
    # is produced with exactly the same call as before.
    style_kwargs = {} if linestyle is None else {'linestyle': linestyle}
    sns.lineplot(
        data=data,
        x='age_group',
        y=metric,
        hue='gender',
        marker='o',
        palette='coolwarm',
        **style_kwargs
    )
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel("Age Group")
    plt.legend(title="Gender")
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()


# One chart per engagement metric: (column, title, y label, output file, line style).
# The three charts were previously three copy-pasted blocks differing only in
# these values.
for metric, title, ylabel, filename, linestyle in (
    ('num_accts', "Average Number of Accounts", "Average Number of Accounts", "average_accounts.png", None),
    ('calls_6_mnth', "Average Calls in Last 6 Months", "Average Calls", "average_calls.png", '--'),
    ('logons_6_mnth', "Average Logons in Last 6 Months", "Average Logons", "average_logs.png", ':'),
):
    _plot_age_group_metric(
        logs_calls_accounts,
        metric=metric,
        title=title,
        ylabel=ylabel,
        filename=filename,
        linestyle=linestyle,
    )

