In [1]:
import pandas as pd
import numpy as np

In [9]:
customer_feedback = pd.read_csv('customer_feedback.csv')
customer_feedback.head()

Unnamed: 0,CustomerID,Name,Email,Location,Satisfaction_Score,Feedback,Date
0,1102,Bob Lee,sarah@domain.com,Montreal,3.0,Great service,2023-06-01
1,1435,Alice Johnson,bob.lee@gmail.com,Toronto,2.0,,2023-03-02
2,1860,Alice Johnson,bob.lee@gmail.com,Vancouver,,,2023-03-02
3,1270,Alice Johnson,bob.lee@gmail.com,Vancouver,4.0,,2023-12-20
4,1106,Alice Johnson,bob.lee@gmail.com,Montreal,4.0,Okay,2023-01-28


In [16]:
print('customer_feedback raw shape:', customer_feedback.shape)
print('\nInfo:')
print(customer_feedback.info())

print('\nMissing values per column:')
print(customer_feedback.isna().sum())

print(customer_feedback['CustomerID'].value_counts(dropna=False))
print('\nCustomerID:')
print(customer_feedback['CustomerID'].value_counts(dropna=False).head())

print(customer_feedback['Name'].value_counts(dropna=False))
print('\nName:')
print(customer_feedback['Name'].value_counts(dropna=False).head())

print(customer_feedback['Email'].value_counts(dropna=False))
print('\nEmail:')
print(customer_feedback['Email'].value_counts(dropna=False).head())

print(customer_feedback['Location'].value_counts(dropna=False))
print('\nLocation:')
print(customer_feedback['Location'].value_counts(dropna=False).head())

print(customer_feedback['Satisfaction_Score'].value_counts(dropna=False))
print('\nSatisfaction_Score:')
print(customer_feedback['Satisfaction_Score'].value_counts(dropna=False).head())

print(customer_feedback['Feedback'].value_counts(dropna=False))
print('\nFeedback:')
print(customer_feedback['Feedback'].value_counts(dropna=False).head())

print(customer_feedback['Date'].value_counts(dropna=False))
print('\nDate:')
print(customer_feedback['Date'].value_counts(dropna=False).head())

print('\nNumber of duplicate rows:', customer_feedback.duplicated().sum())

customer_feedback raw shape: (525, 7)

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525 entries, 0 to 524
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerID          525 non-null    int64  
 1   Name                422 non-null    object 
 2   Email               433 non-null    object 
 3   Location            442 non-null    object 
 4   Satisfaction_Score  442 non-null    float64
 5   Feedback            298 non-null    object 
 6   Date                525 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 28.8+ KB
None

Missing values per column:
CustomerID              0
Name                  103
Email                  92
Location               83
Satisfaction_Score     83
Feedback              227
Date                    0
dtype: int64
CustomerID
1098    5
1871    5
1957    5
1001    4
1815    4
       ..
1392    1
1206    1
1553    1
1460    1
1569    

Issues observed in customer_feedback.csv
- Irregular data
- it doesnt have feedback, date,location, E mail, Name, Customer ID
- More number of duplicate rows

3. Clean Retail Feedback Data

In [26]:
import pandas as pd
import numpy as np

# ----------------------------
# COPY RAW DATA
# ----------------------------
customer_feedback = customer_feedback.copy()

# Remove duplicate rows
customer_feedback = customer_feedback.drop_duplicates()

# ----------------------------
# CLEAN TEXT COLUMNS
# ----------------------------
text_cols = ['store_city', 'feedback_source', 'feedback_text']

for col in text_cols:
    if col in customer_feedback.columns:
        customer_feedback[col] = (
            customer_feedback[col]
            .astype('string')
            .str.strip()
            .str.lower()
            .replace(['nan', 'none', ''], pd.NA)
        )

# ----------------------------
# STANDARDIZE CITY NAMES
# ----------------------------
if 'store_city' in customer_feedback.columns:
    customer_feedback['store_city'] = customer_feedback['store_city'].replace({
        'nyc': 'new york',
        'la': 'los angeles',
        'san fran': 'san francisco',
        'chi town': 'chicago',
        'new york': 'new york',
        'los angeles': 'los angeles',
        'san francisco': 'san francisco',
        'chicago': 'chicago'
    })

# ----------------------------
# STANDARDIZE FEEDBACK SOURCE
# ----------------------------
if 'feedback_source' in customer_feedback.columns:
    customer_feedback['feedback_source'] = customer_feedback['feedback_source'].replace({
        'in store': 'in-store',
        'in-store': 'in-store',
        'online': 'online',
        'email': 'email'
    })

# ----------------------------
# CLEAN RATING
# ----------------------------
if 'rating' in customer_feedback.columns:
    customer_feedback['rating'] = pd.to_numeric(
        customer_feedback['rating'], errors='coerce'
    )
    customer_feedback.loc[
        ~customer_feedback['rating'].between(1, 5), 'rating'
    ] = np.nan

# ----------------------------
# CLEAN PURCHASE AMOUNT
# ----------------------------
if 'purchase_amount' in customer_feedback.columns:
    customer_feedback['purchase_amount'] = (
        customer_feedback['purchase_amount']
        .astype('string')
        .str.lower()
        .replace('n/a', pd.NA)
        .str.replace(r'[\$,]', '', regex=True)
    )

    customer_feedback['purchase_amount'] = pd.to_numeric(
        customer_feedback['purchase_amount'], errors='coerce'
    )

# ----------------------------
# CLEAN AGE
# ----------------------------
if 'age' in customer_feedback.columns:
    customer_feedback['age'] = pd.to_numeric(
        customer_feedback['age'], errors='coerce'
    )

# ----------------------------
# CONVERT VISIT DATE
# ----------------------------
if 'visit_date' in customer_feedback.columns:
    customer_feedback['visit_date'] = pd.to_datetime(
        customer_feedback['visit_date'], errors='coerce'
    )

# ----------------------------
# IMPUTE MISSING NUMERIC VALUES
# ----------------------------
for col in ['age', 'rating', 'purchase_amount']:
    if col in customer_feedback.columns:
        customer_feedback[col] = customer_feedback[col].fillna(
            customer_feedback[col].median()
        )

# ----------------------------
# FILL REMAINING TEXT NaNs
# ----------------------------
for col in text_cols:
    if col in customer_feedback.columns:
        customer_feedback[col] = customer_feedback[col].fillna('unknown')

# ----------------------------
# PREVIEW RESULT
# ----------------------------
customer_feedback.head()

Unnamed: 0,CustomerID,Name,Email,Location,Satisfaction_Score,Feedback,Date
0,1102,Bob Lee,sarah@domain.com,Montreal,3.0,Great service,2023-06-01
1,1435,Alice Johnson,bob.lee@gmail.com,Toronto,2.0,,2023-03-02
2,1860,Alice Johnson,bob.lee@gmail.com,Vancouver,,,2023-03-02
3,1270,Alice Johnson,bob.lee@gmail.com,Vancouver,4.0,,2023-12-20
4,1106,Alice Johnson,bob.lee@gmail.com,Montreal,4.0,Okay,2023-01-28


Customer Feedback

In [47]:
customer_feedback_clean = customer_feedback.copy()

customer_feedback_clean = customer_feedback.drop_duplicates()

Location ={
         'Montreal': 'Montreal',
         'MONTREAL': 'MONTREAL',
         'Toronto': 'Toronto',
         'toronto': 'toronto',
         'Vancouver':'Vancouver'
}
customer_feedback_clean['Location'] = customer_feedback_clean['Location'].map(Location)

name = {
       'Alice Johnson': 'Alice Johnson',
       'John Smith': 'John Smith',
       'Sarah Khan': 'Sarah Khan',
       'Bob Lee': 'Bob Lee'
}
#customer_feedback_clean['Name'] = customer_feedback_clean['Name'].map(Name)

customer_feedback_clean['Satisfaction_Score'] = customer_feedback_clean['Satisfaction_Score'].fillna(customer_feedback_clean['Satisfaction_Score'].median())

for col in ['Feedback', 'CustomerID', 'Location', 'Name', 'Email']:
    customer_feedback_clean[col] = customer_feedback_clean[col].fillna('Unknown')

    
customer_feedback_clean.head()

Unnamed: 0,CustomerID,Name,Email,Location,Satisfaction_Score,Feedback,Date
0,1102,Bob Lee,sarah@domain.com,Montreal,3.0,Great service,2023-06-01
1,1435,Alice Johnson,bob.lee@gmail.com,Toronto,2.0,Unknown,2023-03-02
2,1860,Alice Johnson,bob.lee@gmail.com,Vancouver,3.0,Unknown,2023-03-02
3,1270,Alice Johnson,bob.lee@gmail.com,Vancouver,4.0,Unknown,2023-12-20
4,1106,Alice Johnson,bob.lee@gmail.com,Montreal,4.0,Okay,2023-01-28


In [52]:
print(customer_feedback_sleep_clean.info())
print('\nMissing values per column after cleaning:')
print(customer_feedback_sleep_clean.isna().sum())

NameError: name 'customer_feedback_sleep_clean' is not defined

In [53]:
customer_feedback_sleep_clean.to_csv('customer_feedback_cleaned.csv', index=False)

NameError: name 'customer_feedback_sleep_clean' is not defined