In [1]:
import pandas as pd
import numpy as np

In [14]:
# Read the mall-customer dataset from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="Mall_Customers.csv")
df.head(n=5)

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [3]:
# Count the null entries in every column and print the per-column totals.
missing_per_column = df.isna().sum()
print(missing_per_column)

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


In [4]:
# Report how many rows are exact repeats of an earlier row.
duplicate_row_count = df.duplicated().sum()
print("Duplicate rows:", duplicate_row_count)

Duplicate rows: 0


In [5]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates(keep="first")

In [6]:
# Normalise the Gender labels: lowercase them, then trim surrounding whitespace.
gender_normalized = df['Gender'].str.lower().str.strip()

# Expand one-letter abbreviations to the full word (the displayed rows already
# use full words, so this is a safeguard rather than a required step).
df['Gender'] = gender_normalized.replace({'m': 'male', 'f': 'female'})
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,male,19,15,39
1,2,male,21,15,81
2,3,female,20,16,6
3,4,female,23,16,77
4,5,female,31,17,40
...,...,...,...,...,...
195,196,female,35,120,79
196,197,female,45,126,28
197,198,male,32,126,74
198,199,male,32,137,18


In [7]:
# Standardise the headers: trim whitespace, lowercase, and turn spaces into
# underscores.
normalized_headers = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns = normalized_headers
df

Unnamed: 0,customerid,gender,age,annual_income_(k$),spending_score_(1-100)
0,1,male,19,15,39
1,2,male,21,15,81
2,3,female,20,16,6
3,4,female,23,16,77
4,5,female,31,17,40
...,...,...,...,...,...
195,196,female,35,120,79
196,197,female,45,126,28
197,198,male,32,126,74
198,199,male,32,137,18


In [8]:
# Display the frame's current column labels.
df.columns

Index(['customerid', 'gender', 'age', 'annual_income_(k$)',
       'spending_score_(1-100)'],
      dtype='object')

In [9]:
# Standalone date-parsing demo.
#
# The sample lives in its own frame (`join_dates`) instead of being assigned to
# `df`, so the mall-customer data loaded earlier is not overwritten and the
# cells further down still operate on the real dataset when the notebook is
# run top to bottom.
join_dates = pd.DataFrame({
    'CustomerID': [1, 2, 3],
    'Join_Date': ['2026/02/12', '12-02-2026', 'Feb 12, 2026'],
})

# The three strings use three different layouts, so ask pandas to parse each
# element individually (format='mixed', available in pandas >= 2.0) rather than
# inferring a single format from the first value. dayfirst=True makes the
# ambiguous '12-02-2026' read as 12 February; every row parses to 2026-02-12.
join_dates['Join_Date'] = pd.to_datetime(
    join_dates['Join_Date'], format='mixed', dayfirst=True
)

# Render the parsed dates back to text in one uniform DD-MM-YYYY layout.
join_dates['Join_Date'] = join_dates['Join_Date'].dt.strftime('%d-%m-%Y')
join_dates

Unnamed: 0,CustomerID,Join_Date
0,1,12-02-2026
1,2,12-02-2026
2,3,12-02-2026


In [15]:
# Print the current column labels as a plain Python list.
print(list(df.columns))

['CustomerID', 'Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']


In [16]:
# Tidy the headers step by step: trim whitespace, lowercase, swap spaces for
# underscores, then drop both parenthesis characters (matched literally).
tidy_headers = df.columns.str.strip().str.lower()
tidy_headers = tidy_headers.str.replace(' ', '_')
for bracket in ('(', ')'):
    tidy_headers = tidy_headers.str.replace(bracket, '', regex=False)
df.columns = tidy_headers
print(df.columns.tolist())

['customerid', 'gender', 'age', 'annual_income_k$', 'spending_score_1-100']


In [17]:
# Store gender as a pandas categorical, then list every column's dtype.
gender_as_category = df['gender'].astype('category')
df['gender'] = gender_as_category
print(df.dtypes)

customerid                 int64
gender                  category
age                        int64
annual_income_k$           int64
spending_score_1-100       int64
dtype: object


In [18]:
# Write the cleaned frame to disk, leaving the row index out of the file.
df.to_csv(path_or_buf="Mall_Customers_Cleaned.csv", index=False)