In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("C:\\Users\\Remya\\Downloads\\customer_raw_data_1500.csv")
df

Unnamed: 0,Customer_ID,Age,Gender,City,Income,Customer_Since,Spending_Score
0,CUST1000,68,Unknown,new york,"$93,792",2022-02-16,79.0
1,CUST1001,43,MALE,Los Angeles,"$138,681",23/09/2022,74.0
2,CUST1002,73,MALE,Huston,"$30,858",30/07/2022,32.0
3,CUST1003,25,FEMALE,new york,"$138,103",2022-05-02,36.0
4,CUST1004,72,FeMale,Unknown,"$103,229",2022-08-30,83.0
...,...,...,...,...,...,...,...
1495,CUST2495,78,female,New York,"$42,001",2023-01-25,54.0
1496,CUST2496,33,FeMale,Houston,"$63,038","Dec 29, 2023",77.0
1497,CUST2497,45,Other,Huston,"$75,659",2022-09-23,75.0
1498,CUST2498,84,Male,Unknown,"$66,664",2023-09-03,39.0


### Identify and handle missing values

In [3]:
print("Missing values per column:\n", df.isnull().sum())

Missing values per column:
 Customer_ID         0
Age                 0
Gender              0
City                0
Income            130
Customer_Since      0
Spending_Score     18
dtype: int64


In [4]:

missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)
df = df[~df['Age'].isnull()]
df['Income'] = df['Income'].fillna(df['Income'].median())



Missing values per column:
 Customer_ID         0
Age                 0
Gender              0
City                0
Income            130
Customer_Since      0
Spending_Score     18
dtype: int64


TypeError: Cannot convert ['$93,792' '$138,681' '$30,858' ... '$75,659' '$66,664' '$56,435'] to numeric

In [None]:
Age is critical for segmentation, can’t be guessed  drop it if missing.

Income varies a lot, so we use median to reduce skew impact.



In [None]:

df = df.dropna(subset=['Age', 'Gender'])
df['Income'] = df['Income'].fillna(df['Income'].median())
df['Spending_Score'] = df['Spending_Score'].fillna(df['Spending_Score'].median())


In [None]:

df['Customer_Since'] = pd.to_datetime(df['Customer_Since'], errors='coerce')
df['Income'] = df['Income'].replace('[/0,]', '', regex=True).astype(float)


In [None]:
 Justification:

Mixed date formats fixed with errors='coerce' → anything invalid becomes NaT.

Income strings cleaned and converted for analysis and plotting.



In [5]:

df['Gender'] = df['Gender'].str.strip().str.capitalize()  
df['City'] = df['City'].str.strip().str.title()
df = df[(df['Age'] > 0) & (df['Age'] < 100)]
df = df[~df['City'].str.lower().isin(['unknown'])]
df = df[~df['Gender'].str.lower().isin(['unknown'])]


In [6]:

duplicates_count = df.duplicated().sum()
df = df.drop_duplicates()

duplicates_count


np.int64(0)

## Exploratory Data Analysis


#### Summarize the datasets

In [8]:
 
df['Income'] = df['Income'].replace({'[$,]': ''}, regex=True).astype(float)

income_mean = df['Income'].mean()
income_median = df['Income'].median()
income_mode = df['Income'].mode()[0]

print(f"Mean Income: {income_mean}")
print(f"Median Income: {income_median}")
print(f"Mode Income: {income_mode}")


Mean Income: 84170.69293756397
Median Income: 82386.0
Mode Income: 27748.0


### Insight Generation

In [9]:

df['Age_Group'] = pd.cut(df['Age'], bins=[18, 25, 35, 50, 65, 100], 
                         labels=['18-25', '26-35', '36-50', '51-65', '65+'])

avg_spend_by_age = df.groupby('Age_Group')['Spending_Score'].mean()

income_by_gender = df.groupby('Gender')['Income'].mean()

(avg_spend_by_age, income_by_gender)


  avg_spend_by_age = df.groupby('Age_Group')['Spending_Score'].mean()


(Age_Group
 18-25    52.209524
 26-35    45.604317
 36-50    47.990783
 51-65    52.091787
 65+      50.613636
 Name: Spending_Score, dtype: float64,
 Gender
 Female    83606.875803
 Male      84161.823881
 Other     85692.257143
 Name: Income, dtype: float64)

## Data issues after cleaning

###### major issues

#### 1.A few Customer_Since dates were still NaT → original data issue.

#### 2.Cities like “Huston” might be typos for “Houston” → needs fuzzy matching fix.

#### 3.Some Income values still look super high → maybe consider capping outliers.

In [10]:
df.to_csv("//mnt//data//customer_data_cleaned.csv", index=False)


OSError: Cannot save file into a non-existent directory: '\\mnt\\data'