1. Importing Libraries

In [1]:
# Data Analysis and Visualization Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration
import warnings

# Render every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Suppress all warnings so cell output stays readable. Remove this line when
# warnings should be visible (e.g. while debugging or demonstrating them).
warnings.filterwarnings(action='ignore')
# Use seaborn's dark grid background for all subsequent plots.
sns.set_style(style="darkgrid")

2. Reading Data

In [2]:
# Load the dataset from the CSV file at the relative path below.
df = pd.read_csv('../data/customer_sentiment.csv')

# Quick sanity check: report the frame's dimensions and its column names.
n_rows, n_cols = df.shape
print(f"Shape: {n_rows} rows × {n_cols} columns")
print(f"\nColumns: {df.columns.tolist()}")

Shape: 25000 rows × 13 columns

Columns: ['customer_id', 'gender', 'age_group', 'region', 'product_category', 'purchase_channel', 'platform', 'customer_rating', 'review_text', 'sentiment', 'response_time_hours', 'issue_resolved', 'complaint_registered']


3. EDA - Part 1: Basic Exploration and Summary Statistics

In [3]:
def _print_banner(title, prefix=""):
    """Print ``title`` framed above and below by an 80-character ``=`` rule.

    ``prefix`` is prepended to the top rule; pass ``"\\n"`` to put a blank
    line between the banner and whatever was printed before it.
    """
    rule = "=" * 80
    print(prefix + rule)
    print(title)
    print(rule)


# Preview the first five rows of the dataset.
_print_banner("FIRST 5 ROWS OF THE DATASET")
display(df.head())

# Column names, non-null counts, dtypes and memory usage.
_print_banner("DATASET INFORMATION", prefix="\n")
df.info()

# Summary statistics; describe() covers the numerical columns by default.
_print_banner("BASIC STATISTICS", prefix="\n")
display(df.describe())

# Per-column null counts; list only the columns that actually have nulls.
_print_banner("MISSING VALUES CHECK", prefix="\n")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values!")

FIRST 5 ROWS OF THE DATASET


Unnamed: 0,customer_id,gender,age_group,region,product_category,purchase_channel,platform,customer_rating,review_text,sentiment,response_time_hours,issue_resolved,complaint_registered
0,1,male,60+,north,automobile,online,flipkart,1,very disappointed with the quality.,negative,46,yes,yes
1,2,other,46-60,central,books,online,swiggy instamart,5,fast delivery and great packaging.,positive,5,yes,no
2,3,female,36-45,east,sports,online,facebook marketplace,1,very disappointed with the quality.,negative,38,yes,yes
3,4,female,18-25,central,groceries,online,zepto,2,product stopped working after few days.,negative,16,yes,yes
4,5,female,18-25,east,electronics,online,croma,3,neutral about the quality.,neutral,15,yes,no



DATASET INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   customer_id           25000 non-null  int64 
 1   gender                25000 non-null  object
 2   age_group             25000 non-null  object
 3   region                25000 non-null  object
 4   product_category      25000 non-null  object
 5   purchase_channel      25000 non-null  object
 6   platform              25000 non-null  object
 7   customer_rating       25000 non-null  int64 
 8   review_text           25000 non-null  object
 9   sentiment             25000 non-null  object
 10  response_time_hours   25000 non-null  int64 
 11  issue_resolved        25000 non-null  object
 12  complaint_registered  25000 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.5+ MB

BASIC STATISTICS


Unnamed: 0,customer_id,customer_rating,response_time_hours
count,25000.0,25000.0,25000.0
mean,12500.5,3.00248,36.02348
std,7217.022701,1.404647,20.597941
min,1.0,1.0,1.0
25%,6250.75,2.0,18.0
50%,12500.5,3.0,36.0
75%,18750.25,4.0,54.0
max,25000.0,5.0,71.0



MISSING VALUES CHECK
No missing values!


4. Basic Data Quality Checks

In [4]:
print("=" * 80)
print("DATA QUALITY CHECKS")
print("=" * 80)

# 1. Count rows that are exact duplicates of an earlier row (all columns equal).
print("\n1. DUPLICATE ROWS:")
duplicates = df.duplicated().sum()
print(f"   Total duplicates: {duplicates}")
if duplicates > 0:
    print(f"  Warning: {duplicates} duplicate rows found!")
else:
    print("   No duplicates found")

# 2. List the distinct values of each categorical column so that unexpected
#    labels or single-valued columns are easy to spot.
print("\n2. UNIQUE VALUES IN CATEGORICAL COLUMNS:")
categorical_cols = ['gender', 'age_group', 'region', 'product_category',
                    'purchase_channel', 'platform', 'sentiment',
                    'issue_resolved', 'complaint_registered']

for col in categorical_cols:
    print(f"\n   {col}: {df[col].nunique()} unique values")
    print(f"   Values: {df[col].unique()}")

# 3. Frequency of each customer rating, ordered by rating value.
print("\n3. CUSTOMER RATING:")
print(df['customer_rating'].value_counts().sort_index())

# 4. Sentiment class balance, as absolute counts and as percentages.
print("\n4. SENTIMENT DISTRIBUTION:")
print(df['sentiment'].value_counts())
print("\n   Percentages:")
print(df['sentiment'].value_counts(normalize=True) * 100)

DATA QUALITY CHECKS

1. DUPLICATE ROWS:
   Total duplicates: 0
   No duplicates found

2. UNIQUE VALUES IN CATEGORICAL COLUMNS:

   gender: 3 unique values
   Values: ['male' 'other' 'female']

   age_group: 5 unique values
   Values: ['60+' '46-60' '36-45' '18-25' '26-35']

   region: 5 unique values
   Values: ['north' 'central' 'east' 'south' 'west']

   product_category: 9 unique values
   Values: ['automobile' 'books' 'sports' 'groceries' 'electronics' 'travel'
 'fashion' 'home & kitchen' 'beauty']

   purchase_channel: 1 unique values
   Values: ['online']

   platform: 20 unique values
   Values: ['flipkart' 'swiggy instamart' 'facebook marketplace' 'zepto' 'croma'
 'amazon' 'shopclues' 'tata cliq' 'snapdeal' 'paytm mall' 'ajio' 'myntra'
 'nykaa' 'reliance digital' 'meesho' 'bigbasket' 'lenskart' 'jiomart'
 'others' 'boat']

   sentiment: 3 unique values
   Values: ['negative' 'positive' 'neutral']

   issue_resolved: 2 unique values
   Values: ['yes' 'no']

   complaint_registe