In [None]:
# Question: Advanced Data Cleaning with Multiple Issues
# Objective: Handle multiple issues in one dataset, including missing values, duplicates, and outliers.
# Description: Given a dataset with various data quality issues, employ multiple data cleaning techniques.




In [None]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.



In [None]:
# Question: Feature Engineering by Creating New Features
# Objective: Create a new feature based on existing features to add predictive power.
# Description: Generate additional features from existing data to potentially improve the performance of
# prediction models.




In [None]:
# Question: Handling Complex Outliers with Z-Scores
# Objective: Detect and handle outliers using the Z-score method.
# Description: Use the Z-score method to identify outliers which significantly differ from the rest of the data points.




In [None]:
# Question: Data Imputation with K-Nearest Neighbors (KNN)
# Objective: Impute missing numerical values using the KNN method.
# Description: Use the K-nearest neighbors algorithm to fill in missing values, which considers the values of
# nearest neighbors for imputation.




In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from scipy.stats import zscore
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------------
# 1. Build the Sample Dataset
# ---------------------------------
# The data is generated in memory; no CSV file is read, so this cell runs on a
# fresh kernel without depending on any external file being present.
np.random.seed(0)  # fixed seed -> the same synthetic data on every run
data = {
    # 100 normal draws followed by two missing values (used by the imputation step)
    'A': np.random.normal(50, 10, 100).tolist() + [np.nan, np.nan],
    # exponentially distributed, i.e. right-skewed (used by the log-transform step)
    'B': np.random.exponential(scale=2.0, size=102),
    'C': np.random.normal(30, 5, 102),
    # integers in 1..4 (the upper bound of randint is exclusive)
    'D': np.random.randint(1, 5, 102)
}
df = pd.DataFrame(data)

# Introduce duplicates: append copies of the first five rows
df = pd.concat([df, df.iloc[:5]], ignore_index=True)

print("Initial dataset shape:", df.shape)

# ---------------------------------
# 2. Remove Duplicate Rows
# ---------------------------------
# Flag every row that repeats an earlier, fully identical row and keep the rest
# (the first occurrence of each row survives).
is_repeat = df.duplicated()
df = df[~is_repeat]
print("After removing duplicates:", df.shape)

# ---------------------------------
# 3. KNN Imputation for Missing Values
# ---------------------------------
# Missing entries are estimated from the 3 nearest neighbouring rows.
imputer = KNNImputer(n_neighbors=3)
filled_values = imputer.fit_transform(df)
# Re-wrap the imputer's result in a DataFrame carrying the original column names.
df_imputed = pd.DataFrame(filled_values, columns=df.columns)

# ---------------------------------
# 4. Z-Score Outlier Detection
# ---------------------------------
# A row is discarded when any of its numeric columns has an absolute
# z-score above 3.
numeric_part = df_imputed.select_dtypes(include=[np.number])
z_scores = np.abs(zscore(numeric_part))
outliers = z_scores > 3
row_has_outlier = outliers.any(axis=1)
df_no_outliers = df_imputed[~row_has_outlier]

print("After outlier removal:", df_no_outliers.shape)

# ---------------------------------
# 5. Log Transformation for Skewed Data
# ---------------------------------
# Column 'B' is exponentially distributed; log1p, i.e. log(1 + x), is applied to it.
# The result is stored in a new frame, leaving df_no_outliers unchanged.
log_of_b = np.log1p(df_no_outliers['B'])
df_transformed = df_no_outliers.assign(B_log=log_of_b)

# Uncomment to compare the distribution of 'B' before and after the transform
# sns.histplot(df_no_outliers['B'], kde=True)
# sns.histplot(df_transformed['B_log'], kde=True)
# plt.show()

# ---------------------------------
# 6. Feature Engineering: New Feature
# ---------------------------------
# Derived feature: element-wise ratio of column C to column A
numerator = df_transformed['C']
denominator = df_transformed['A']
df_transformed['C_A_ratio'] = numerator.div(denominator)

# ---------------------------------
# 7. Final Cleaned and Transformed Dataset
# ---------------------------------
print("Final cleaned dataset shape:", df_transformed.shape)
preview = df_transformed.head()  # first five rows only
print(preview)

# Optional: persist the cleaned dataset to disk
# df_transformed.to_csv("cleaned_dataset.csv", index=False)


Initial dataset shape: (107, 4)
After removing duplicates: (102, 4)
After outlier removal: (100, 4)
Final cleaned dataset shape: (100, 6)
           A         B          C    D     B_log  C_A_ratio
0  67.640523  1.102792  24.828786  2.0  0.743266   0.367070
1  54.001572  1.864806  33.407973  4.0  1.052501   0.618648
2  59.787380  0.038760  25.982952  4.0  0.038027   0.434589
3  72.408932  0.717854  26.552251  1.0  0.541076   0.366699
4  68.675580  2.158640  27.722337  2.0  1.150142   0.403671
