In [33]:
import pandas as pd

# Load the raw customer export. "Phone Number" is pinned to str so that an
# all-digit column is never inferred as integers (which would drop leading
# zeros and make the `.str` accessor used by the cleaning step unavailable).
df = pd.read_csv("Python_CustomerData_Raw.csv", dtype={"Phone Number": str})

# info() prints its report as a side effect, so it can run mid-cell.
# head() only renders when it is the cell's final expression; placed anywhere
# else its result is silently discarded.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CustomerID     5 non-null      int64  
 1   CustomerName   5 non-null      object 
 2   Email Address  5 non-null      object 
 3   Phone Number   5 non-null      object 
 4   Address        5 non-null      object 
 5   Age            4 non-null      float64
dtypes: float64(1), int64(1), object(4)
memory usage: 372.0+ bytes


In [36]:
def is_valid_email(value: object) -> bool:
    """Return True when `value` looks like a structurally valid e-mail address.

    This is a lightweight structural check, not full RFC 5322 validation.
    An address passes only if all of the following hold:

    * it is a string containing exactly one "@";
    * it contains no whitespace;
    * the part before "@" is non-empty;
    * the part after "@" has at least two dot-separated labels, none of them
      empty (so no leading, trailing, or doubled dots in the domain).

    Non-string inputs (for example NaN for a missing cell) return False.
    """
    if not isinstance(value, str) or value.count("@") != 1:
        return False
    if any(ch.isspace() for ch in value):
        return False
    local_part, _, domain = value.partition("@")
    labels = domain.split(".")
    # all(labels) rejects empty labels such as those in "a@.com" or "a@b."
    return bool(local_part) and len(labels) >= 2 and all(labels)


# Keep only the digits of each phone number (drops "+", spaces, dots, dashes
# and parentheses) so differently formatted numbers become comparable.
df["CleanedPhone"] = df["Phone Number"].str.replace(r"\D", "", regex=True)

# Flag every address as "Valid" / "Invalid" using the structural check above.
df["EmailValid"] = df["Email Address"].apply(
    lambda address: "Valid" if is_valid_email(address) else "Invalid"
)

# Preview updated data
df[["Phone Number", "CleanedPhone", "Email Address", "EmailValid"]].head()


Unnamed: 0,Phone Number,CleanedPhone,Email Address,EmailValid
0,(987)-654-3210,9876543210,john.doe@gmail.com,Valid
1,123.456.7890,1234567890,jane_smith@@gmail.com,Invalid
2,+91 98765 43210,919876543210,emily@@banks.com,Invalid
3,9999999999,9999999999,rahul.sharma@,Invalid
4,8888-888-888,8888888888,tina.watson@gmail,Invalid


In [37]:
# A cleaned phone number counts as valid only when its text form is exactly
# 10 characters long; anything else (including missing values, whose text
# form is shorter) is flagged as invalid.
digit_counts = df["CleanedPhone"].map(lambda digits: len(str(digits)))
df["PhoneValid"] = (digit_counts == 10).map({True: "Valid", False: "Invalid"})

# Show the validity flags next to the cleaned values and Age.
preview_columns = ["PhoneValid", "CleanedPhone", "Email Address", "EmailValid", "Age"]
df[preview_columns].head()

Unnamed: 0,PhoneValid,CleanedPhone,Email Address,EmailValid,Age
0,Valid,9876543210,john.doe@gmail.com,Valid,28.0
1,Valid,1234567890,jane_smith@@gmail.com,Invalid,
2,Invalid,919876543210,emily@@banks.com,Invalid,35.0
3,Valid,9999999999,rahul.sharma@,Invalid,22.0
4,Valid,8888888888,tina.watson@gmail,Invalid,30.0


In [38]:
def percent_of(count: int, total: int) -> float:
    """Return `count` as a percentage of `total`, or 0.0 when `total` is 0.

    The zero guard keeps the summary printable for an empty DataFrame instead
    of raising ZeroDivisionError.
    """
    return (count / total) * 100 if total else 0.0


total_customers = len(df)
# Count the rows flagged "Valid" directly; a column with no "Valid" rows
# simply sums to 0.
valid_emails = int((df["EmailValid"] == "Valid").sum())
valid_phones = int((df["PhoneValid"] == "Valid").sum())

print("Total Customers:", total_customers)
print("Valid Emails:", valid_emails, f"({percent_of(valid_emails, total_customers):.2f}%)")
print("Valid Phones:", valid_phones, f"({percent_of(valid_phones, total_customers):.2f}%)")


Total Customers: 5
Valid Emails: 1 (20.00%)
Valid Phones: 4 (80.00%)


In [39]:
# Report how many Age values are missing before imputation.
print("Missing Age count (before):", df["Age"].isnull().sum())

# Fill missing ages with the column mean rounded to a whole number.
# When every Age is missing the mean is NaN and there is nothing sensible to
# impute, so the column is left untouched rather than calling round() on NaN.
mean_age = df["Age"].mean()
if pd.notna(mean_age):
    df["Age"] = df["Age"].fillna(round(mean_age))

# Report how many Age values are still missing after imputation.
print("Missing Age count (after):", df["Age"].isnull().sum())

df[["PhoneValid", "CleanedPhone", "Email Address", "EmailValid","Age"]].head()


Missing Age count (before): 1
Missing Age count (after): 0


Unnamed: 0,PhoneValid,CleanedPhone,Email Address,EmailValid,Age
0,Valid,9876543210,john.doe@gmail.com,Valid,28.0
1,Valid,1234567890,jane_smith@@gmail.com,Invalid,29.0
2,Invalid,919876543210,emily@@banks.com,Invalid,35.0
3,Valid,9999999999,rahul.sharma@,Invalid,22.0
4,Valid,8888888888,tina.watson@gmail,Invalid,30.0


In [32]:
# Write the current state of `df` to disk; index=False keeps the positional
# row index out of the file.
cleaned_csv_path = "Python_CustomerData_Cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)