In [68]:
import pandas as pd

df = pd.read_csv('/content/employee_data.csv', delimiter=',', quotechar='"')

In [67]:
import pandas as pd
import numpy as np
import re

In [69]:
df = df.drop_duplicates()

In [73]:
import pandas as pd
import numpy as np
import re

# Load the data as a single column (as it appears to be read this way)
# We expect a single column with name like 'emp_id,name,email,joining_date,department,salary'
try:
    df_single_col = pd.read_csv('/content/employee_data.csv', header=None, encoding='utf-8')
    # Assuming the header is the first row and also in the single column format
    header_string = df_single_col.iloc[0, 0]
    column_names = [name.strip() for name in header_string.replace('"', '').split(',')]
    # Data starts from the second row
    data_rows_single_col = df_single_col.iloc[1:, 0]

    # Manually split the single string column into multiple columns
    # We need to handle the quotes and commas within the string
    # A more robust way might involve regex or the csv module again,
    # but let's try a simple split after removing quotes first.
    # This might need refinement depending on the exact format variations.

    # Function to split a single row string into a list of values
    def parse_row_string(row_string):
        # Remove leading/trailing quotes if present
        row_string = row_string.strip().strip('"')
        # Split by comma, this is a basic split and might fail with commas within fields
        # A more complex parser might be needed for embedded commas
        return [item.strip().strip('"') for item in row_string.split(',')]

    parsed_data = [parse_row_string(row_str) for row_str in data_rows_single_col]

    # Create a new DataFrame from the manually parsed data
    # Ensure all rows have the same number of columns as the header
    # If not, this will likely cause errors. We might need padding or error handling.
    # For simplicity, let's assume basic splitting works for now.

    # Check if the number of columns matches
    if not all(len(row) == len(column_names) for row in parsed_data):
         print("Warning: Some rows do not have the expected number of columns based on the header.")
         # Optionally, handle rows that don't match the header column count

    df = pd.DataFrame(parsed_data, columns=column_names)


except Exception as e:
    print(f"Error during manual parsing: {e}")
    # Fallback or further diagnostics could go here
    df = pd.DataFrame() # Create an empty DataFrame to prevent further errors

# Now continue with cleaning steps on the newly created df
if not df.empty:
    print("\nDataFrame created by manual string splitting:")
    display(df.head())

    # Remove duplicate rows
    df_cleaned = df.drop_duplicates().copy()

    print("\nDataFrame after removing duplicates:")
    print("Number of rows:", len(df_cleaned))
    display(df_cleaned.head())

    # Define the email validation function (already defined, but included for completeness in this cell)
    def is_valid_email(email):
        if pd.isna(email):
            return False
        # Updated regex for better email validation
        return re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', str(email)) is not None

    # Apply the email validation and set invalid emails to NaN
    # Need to ensure the 'email' column exists after manual parsing
    if 'email' in df_cleaned.columns:
        df_cleaned.loc[:, 'email'] = df_cleaned['email'].apply(lambda x: x if is_valid_email(x) else np.nan)
        print("\nMissing values after validating and potentially setting invalid emails to NaN:")
        print(df_cleaned.isnull().sum())
        print("\nDataFrame after email validation (first 5 rows):")
        display(df_cleaned.head())
    else:
        print("\n'email' column not found in the DataFrame after manual parsing.")

else:
    print("\nFailed to create DataFrame from manual parsing.")


DataFrame created by manual string splitting:


Unnamed: 0,emp_id,name,email,joining_date,department,salary
0,101,Alice Johnson,alice@company.com,2020-03-01,Human Resources,55000
1,102,Bob Smith,bob.smithcompany.com,2021-06-15,HR,60000
2,103,Charlie Lee,charlie.lee@company.com,2025-01-01,Finance,0
3,104,,,2019-08-01,hr,65000
4,105,David Kim,david.kim@company.com,,IT,72000



DataFrame after removing duplicates:
Number of rows: 6


Unnamed: 0,emp_id,name,email,joining_date,department,salary
0,101,Alice Johnson,alice@company.com,2020-03-01,Human Resources,55000
1,102,Bob Smith,bob.smithcompany.com,2021-06-15,HR,60000
2,103,Charlie Lee,charlie.lee@company.com,2025-01-01,Finance,0
3,104,,,2019-08-01,hr,65000
4,105,David Kim,david.kim@company.com,,IT,72000



Missing values after validating and potentially setting invalid emails to NaN:
emp_id          0
name            0
email           3
joining_date    0
department      0
salary          0
dtype: int64

DataFrame after email validation (first 5 rows):


Unnamed: 0,emp_id,name,email,joining_date,department,salary
0,101,Alice Johnson,alice@company.com,2020-03-01,Human Resources,55000
1,102,Bob Smith,,2021-06-15,HR,60000
2,103,Charlie Lee,charlie.lee@company.com,2025-01-01,Finance,0
3,104,,,2019-08-01,hr,65000
4,105,David Kim,david.kim@company.com,,IT,72000


In [75]:
print("Columns available in df_cleaned DataFrame:")
print(df_cleaned.columns)

Columns available in df_cleaned DataFrame:
Index(['emp_id', 'name', 'email', 'joining_date', 'department', 'salary'], dtype='object')


In [77]:
df['email'] = df['email'].fillna('abc@gmail.com')

In [80]:
df.to_csv('employee_cleaned2.csv', index=False)

In [79]:
print("Data cleaning completed. Cleaned data saved to 'customers_cleaned.csv'.")

Data cleaning completed. Cleaned data saved to 'customers_cleaned.csv'.
