# PART A: CLEANING Customer_comment.csv

In [None]:
import pandas as pd
import numpy as np

# Step 1: Load and Inspect Data

First, we load the Customer_comment.csv dataset. We then use .info() and .isnull().sum() to get an initial understanding of the data types and non-null counts, and to identify columns with missing values. This helps us form a strategy for cleaning.

In [None]:
# Load the raw customer-comment export into a DataFrame.
# A forward slash is used as the path separator: it works on Windows, macOS
# and Linux alike, whereas a backslash inside a normal (non-raw) string
# literal forms the invalid escape sequence '\C' and only resolves on Windows.
df_comments = pd.read_csv('Raw_Dataset/Customer_comment.csv')
print("Customer_comment.csv loaded successfully!")
print(f"Original shape: {df_comments.shape}")
# info() prints column dtypes and non-null counts directly to stdout.
df_comments.info()
# Per-column count of missing values, used to plan the imputation below.
print(df_comments.isnull().sum())

# Step 2: Handle Duplicate Rows

We check for and remove duplicate rows using df.drop_duplicates(). This is a crucial step to prevent data skew and ensure that our analysis and any potential models are not biased by redundant entries.

In [None]:
# Remember the row count before de-duplication so the number of dropped
# rows can be reported afterwards.
initial_rows = df_comments.shape[0]
# keep='first' (the pandas default) retains the first occurrence of each
# fully duplicated row; the frame is modified in place.
df_comments.drop_duplicates(keep='first', inplace=True)
print(f"Removed {initial_rows - len(df_comments)} duplicate rows.")

# Step 3: Impute Missing Values

Our inspection revealed missing values in several columns. We will apply the following imputation strategies:

* **Text Columns (verbatim_text, etc.):** We fill NaN values with the string 'No Comment'. This is better than dropping rows, as it preserves the record and treats the absence of a comment as a distinct piece of information.
* **Categorical Columns (arrival_delay_group, etc.):** We fill NaN values with the mode (the most frequent value) of each column. This is a simple, standard imputation technique that keeps every record; note that it slightly increases the share of the most frequent category rather than preserving the original distribution exactly.

In [None]:
# Fill missing free-text fields with an explicit placeholder so the rows are
# kept and "no comment given" remains visible as its own value.
# 'No Comment' is used instead of 'N/A' because pandas.read_csv treats the
# string 'N/A' as a missing value by default, so an 'N/A' placeholder would
# silently turn back into NaN as soon as the saved CSV is reloaded.
text_placeholder = 'No Comment'
text_cols = ['verbatim_text', 'ques_verbatim_text', 'transformed_text', 'sentiments']
for col in text_cols:
    # Only touch columns that actually exist in this dataset.
    if col in df_comments.columns:
        df_comments[col] = df_comments[col].fillna(text_placeholder)
        print(f"Filled missing values in '{col}' with '{text_placeholder}'.")

# Fill missing categorical data with each column's most frequent value.
categorical_cols = [
    'arrival_delay_group', 'departure_delay_group',
    'loyalty_program_level', 'fleet_type_description',
    'entity', 'response_group', 'seat_factor_band',
]
for col in categorical_cols:
    # Skip columns that are absent from the dataset or have nothing to fill.
    if col in df_comments.columns and df_comments[col].isnull().any():
        # mode() ignores NaN, so a column that is entirely missing yields an
        # empty Series; indexing it would raise, so such columns are skipped.
        modes = df_comments[col].mode()
        if modes.empty:
            print(f"Skipped '{col}': no non-missing values to compute a mode from.")
            continue
        # When several values tie, mode() returns them sorted; take the first.
        mode_value = modes.iloc[0]
        df_comments[col] = df_comments[col].fillna(mode_value)
        print(f"Filled missing values in '{col}' with its mode ('{mode_value}').")

# Step 4: Save The Cleaned Dataset

In [None]:
# Persist the cleaned frame next to the notebook. index=False keeps the
# DataFrame's row index out of the CSV so it is not written as a column.
output_path = 'Customer_comment_cleaned.csv'
df_comments.to_csv(path_or_buf=output_path, index=False)
print(f"\nCleaned dataset saved successfully to '{output_path}'!")