In [None]:
import pandas as pd 
import numpy as np

# Cap both dimensions of rendered DataFrames at 50 so previews stay readable.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, 50)

# Path of the raw customer sign-up CSV that the loading cell reads.
fp = "/mnt/data/customer_signups.csv"

In [None]:
# Load the raw CSV into `df`; later cells derive `df_clean` from it.
df = pd.read_csv(filepath_or_buffer=fp)

# Quick look at the raw data: dimensions, first ten rows, column dtypes /
# non-null counts, and summary statistics for every column (numeric or not).
print("Shape:", df.shape)
display(df.head(n=10))
df.info()
df.describe(include="all")

In [None]:
# Start from the rows that have both key fields, and parse the signup date.
# `.assign` returns a new frame, so the raw `df` is never modified.
# Dates are read day-first; anything unparseable is coerced to NaT.
df_clean = df.dropna(subset=["customer_id", "signup_date"]).assign(
    signup_date=lambda frame: pd.to_datetime(
        frame["signup_date"], errors="coerce", dayfirst=True
    )
)

# Drop the rows whose date failed to parse and report how many were lost.
before = len(df_clean)
df_clean = df_clean.dropna(subset=["signup_date"])
after = len(df_clean)
print(f"Rows removed due to bad dates: {before - after}")

# Non-numeric age entries become NaN instead of raising.
df_clean = df_clean.assign(age=pd.to_numeric(df_clean["age"], errors="coerce"))


def tidy_text(s):
    """Strip surrounding whitespace from a single cell value.

    Parameters
    ----------
    s : object
        One scalar cell value. Non-string values are converted with ``str()``.

    Returns
    -------
    str or float
        The stripped text, or ``np.nan`` when the value is missing or is
        empty / whitespace-only once stripped.
    """
    if pd.isna(s):
        return np.nan
    text = str(s).strip()
    # A blank cell is missing data in disguise: returning "" would not be
    # caught by a later ``fillna`` and would surface as an unlabeled category.
    return text if text else np.nan

# Trim whitespace in every free-text column; the three columns that are later
# matched against lower-case lookup keys are also lower-cased here.
text_columns = ["source", "region", "plan_selected", "marketing_opt_in", "gender"]
lowercased_columns = {"plan_selected", "gender", "marketing_opt_in"}

for col in text_columns:
    tidied = df_clean[col].map(tidy_text)
    df_clean[col] = tidied.str.lower() if col in lowercased_columns else tidied


# Lookup tables from lower-cased raw spellings to canonical labels.
plan_map = {
    "basic": "Basic",
    "pro": "Pro",
    "premium": "Premium",
    "premiu m": "Premium",
    "prem ium": "Premium",
    "prem": "Premium",
    "unknownplan": "Unknown",
    "unknown": "Unknown",
}
gender_map = {
    "male": "Male",
    "female": "Female",
    "non-binary": "Non-Binary",
    "nonbinary": "Non-Binary",
    "nb": "Non-Binary",
    "other": "Other",
    "unknown": "Unknown",
    "prefer not to say": "Unknown",
    "123": "Unknown",
}
opt_map = {
    "yes": "Yes",
    "no": "No",
    "y": "Yes",
    "n": "No",
    "true": "Yes",
    "false": "No",
    "none": "Unknown",
    "nil": "Unknown",
    "unknown": "Unknown",
}

# Apply each lookup table to its column. Values with no entry in the table
# fall back to their own title-cased text rather than becoming NaN.
label_maps = {
    "plan_selected": plan_map,
    "gender": gender_map,
    "marketing_opt_in": opt_map,
}
for column_name, label_map in label_maps.items():
    raw_values = df_clean[column_name]
    df_clean[column_name] = raw_values.map(label_map).fillna(raw_values.str.title())

# The remaining free-text columns have no lookup table; just title-case them.
for column_name in ("source", "region"):
    df_clean[column_name] = df_clean[column_name].str.title()


# Keep the first row seen for each customer_id and record how many repeats
# were discarded (reported again in the data-quality cell).
before = len(df_clean)
is_repeat_customer = df_clean.duplicated(subset="customer_id", keep="first")
df_clean = df_clean[~is_repeat_customer]
duplicates_removed = before - len(df_clean)
print("Duplicates removed:", duplicates_removed)


# Label whatever is still missing in the categorical columns as "Unknown".
categorical_columns = ["source", "region", "plan_selected", "marketing_opt_in", "gender"]
df_clean = df_clean.fillna(value=dict.fromkeys(categorical_columns, "Unknown"))


# Null out implausible ages BEFORE computing the median, so outliers cannot
# bias the imputed value and every gap is filled with the same number.
# Negative ages are treated as invalid alongside the existing > 100 rule.
implausible_age = (df_clean["age"] < 0) | (df_clean["age"] > 100)
df_clean.loc[implausible_age, "age"] = np.nan

# Single imputation pass: originally-missing and invalidated ages both receive
# the median of the remaining plausible ages.
age_median = df_clean["age"].median()
df_clean["age"] = df_clean["age"].fillna(age_median)


In [None]:
# Per-column count and percentage of values still missing after cleaning.
null_counts = df_clean.isna().sum()
missing = pd.DataFrame(
    {
        "missing_count": null_counts,
        "missing_pct": (null_counts / len(df_clean) * 100).round(2),
    }
)

print("Rows:", len(df_clean))
display(missing.sort_values("missing_count", ascending=False))
print("Duplicates removed earlier:", duplicates_removed)

def uniques(col, frame=None):
    """Return the sorted distinct non-null values of one column.

    Parameters
    ----------
    col : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``df_clean`` so existing ``uniques(col)`` calls behave as before.

    Returns
    -------
    list
        Distinct values of ``frame[col]`` with nulls dropped, in sorted order.
    """
    # Resolve the default at call time so the current df_clean is always used.
    source = df_clean if frame is None else frame
    return sorted(source[col].dropna().unique().tolist())

# Print the distinct labels left in each standardized column as a sanity check.
for heading, column_name in (
    ("Plan values:", "plan_selected"),
    ("Gender values:", "gender"),
    ("Marketing opt-in values:", "marketing_opt_in"),
):
    print(heading, uniques(column_name))




In [None]:
# Bucket rows by weekly bins of signup_date and count customer_id per bin.
weekly_bins = pd.Grouper(key="signup_date", freq="W")
weekly_counts = df_clean.groupby(weekly_bins)["customer_id"].count()
signups_per_week = weekly_counts.rename("signups")

display(signups_per_week)


# Row counts per category for each of the three breakdown dimensions.
signups_by_source, signups_by_region, signups_by_plan = (
    df_clean[dimension].value_counts().rename("count")
    for dimension in ("source", "region", "plan_selected")
)

for breakdown in (signups_by_source, signups_by_region, signups_by_plan):
    display(breakdown)


# Cross-tabulate gender (rows) against marketing opt-in (columns).
# Combinations that never occur are shown as 0 rather than NaN.
pair_counts = df_clean.groupby(["gender", "marketing_opt_in"])["customer_id"].count()
optin_by_gender = pair_counts.unstack(fill_value=0).sort_index()

display(optin_by_gender)



# Headline statistics for the cleaned age column, rounded to two decimals.
ages = df_clean["age"]
age_stats = {
    "min": ages.min(),
    "max": ages.max(),
    "mean": ages.mean(),
    "median": ages.median(),
    "null_count": int(ages.isna().sum()),
}
age_summary = pd.Series(age_stats).round(2)

display(age_summary)

