In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Load the Data
df = pd.read_csv('WineQT.csv')

# Step 2: Initial Data Exploration
print("Missing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)
print("\nDuplicate Rows:", df.duplicated().sum())

# Step 3: Handle Missing Values
# Numeric columns imputation (mean)
numeric_cols = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Categorical columns imputation (mode)
categorical_cols = df.select_dtypes(include=[object]).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

# Step 4: Remove Duplicates
df = df.drop_duplicates()

# Step 5: Handle Outliers
# Using IQR method for numeric columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Step 6: Fix Data Types
# Convert 'date_column' to datetime format (replace 'date_column' with actual column name)
if 'date_column' in df.columns:
    df['date_column'] = pd.to_datetime(df['date_column'])

# Convert categorical columns to category type (if applicable)
categorical_cols = df.select_dtypes(include=[object]).columns
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Step 7: Feature Scaling (Optional, only if you are using machine learning)
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Step 8: Final Check
print("\nMissing Values After Cleaning:\n", df.isnull().sum())
print("\nDuplicate Rows After Cleaning:", df.duplicated().sum())

# Step 9: Save Cleaned Data
df.to_csv('cleaned_data.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'cleaned_data.csv'.")


Missing Values:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

Data Types:
 fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
Id                        int64
dtype: object

Duplicate Rows: 0


ValueError: at least one array or dtype is required

In [7]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Step 1: Load the Data
df = pd.read_csv('diabetes.csv')

# Step 2: Initial Data Exploration
print("Missing Values:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)
print("\nDuplicate Rows:", df.duplicated().sum())

# Step 3: Handle Missing Values
# Numeric columns imputation (mean)
numeric_cols = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

# Categorical columns imputation (mode)
categorical_cols = df.select_dtypes(include=[object]).columns

# Filter out categorical columns with no data
categorical_cols = [col for col in categorical_cols if df[col].notnull().any()]

if categorical_cols:  # Only apply imputation if there are valid categorical columns
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

# Step 4: Remove Duplicates
df = df.drop_duplicates()

# Step 5: Handle Outliers
# Using IQR method for numeric columns
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Step 6: Fix Data Types
# Convert 'date_column' to datetime format (replace 'date_column' with actual column name)
if 'date_column' in df.columns:
    df['date_column'] = pd.to_datetime(df['date_column'])

# Convert categorical columns to category type (if applicable)
categorical_cols = df.select_dtypes(include=[object]).columns
for col in categorical_cols:
    df[col] = df[col].astype('category')

# Step 7: Feature Scaling (Optional, only if you are using machine learning)
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Step 8: Final Check
print("\nMissing Values After Cleaning:\n", df.isnull().sum())
print("\nDuplicate Rows After Cleaning:", df.duplicated().sum())

# Step 9: Save Cleaned Data
df.to_csv('cleaned_data.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'cleaned_data.csv'.")


Missing Values:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Data Types:
 Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

Duplicate Rows: 0

Missing Values After Cleaning:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Duplicate Ro