In [None]:
### Data Cleaning and Preparation: Case Study Notebook

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

In [None]:
import requests

# URL of the dataset CSV that this notebook downloads
data_url = 'https://www.raphaelcousin.com/modules/module5/course/module5_course_handling_categorical.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and write the response body to `file_name`.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str
        Local path the downloaded bytes are written to (overwritten if present).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block indefinitely on an unresponsive host.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (via `raise_for_status`).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Download the dataset from `data_url` into the working directory
download_file(data_url, 'module5_course_handling_categorical.csv')

In [None]:
# Load the dataset from the file name used by the download step
data = pd.read_csv('module5_course_handling_categorical.csv')

# Display the first few rows of the dataset
data.head()

In [None]:
#### 1. Handling Inconsistencies

**Types of Inconsistencies**
- Inconsistent data formats
- Duplicated entries
- Typographical errors

In [None]:
# Normalise formats and drop exact duplicate rows in a single chain:
#   - 'date' is parsed to datetime (unparseable values become NaT)
#   - 'category' is lower-cased so differently-cased spellings compare equal
data = (
    data
    .assign(
        date=pd.to_datetime(data['date'], errors='coerce'),
        category=data['category'].str.lower(),
    )
    .drop_duplicates()
)

# Preview the cleaned dataframe
data.head()

In [None]:
#### 2. Handling Missing Values

**Visualize Missing Values**

In [None]:
# Visualize where values are missing in `data`.
# NOTE(review): missingno is imported here rather than in the import cell at the top.
import missingno as msno
msno.matrix(data)
plt.show()

In [None]:
**Imputation Techniques**

In [None]:
# NOTE: the three techniques below are alternatives. They are run one after
# another here for illustration; once the mean imputation has filled
# 'column_name', the later steps have nothing left to fill in that column.

# Mean/Median/Mode Imputation
imputer = SimpleImputer(strategy='mean')
data['column_name'] = imputer.fit_transform(data[['column_name']])

# Forward Fill
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling. Rebinding avoids in-place mutation.
data = data.ffill()

# K-Nearest Neighbors Imputation
# KNNImputer is the scikit-learn imputer; KNeighborsClassifier is a classifier
# and has no `fit_transform`, so it cannot be used to fill values.
# NOTE(review): with a single column there are no other features to measure
# neighbour distance on — pass several numeric columns for a meaningful estimate.
imputer = KNNImputer(n_neighbors=3)
data['column_name'] = imputer.fit_transform(data[['column_name']])

In [None]:
**Adding an 'is_missing' Column**

In [None]:
# Boolean flag: True where 'column_name' is missing.
# NOTE(review): the imputation cell above has already filled 'column_name', so
# this flag is presumably all False here — compute it before imputing if it is
# meant to record the original missingness.
data['column_is_missing'] = data['column_name'].isna()

In [None]:
#### 3. Handling Categorical Values
**One-Hot Encoding and Label Encoding**

In [None]:
# Label Encoding
# Compute the integer labels first: get_dummies(columns=[...]) replaces
# 'categorical_column' with indicator columns, so the column would no longer
# exist for the encoder afterwards (KeyError).
# NOTE(review): scikit-learn documents LabelEncoder for target values;
# OrdinalEncoder is the counterpart intended for input features.
label_encoder = LabelEncoder()
categorical_labels = label_encoder.fit_transform(data['categorical_column'])

# One-Hot Encoding
data = pd.get_dummies(data, columns=['categorical_column'])

# Store the label-encoded version alongside the indicator columns
data['categorical_column'] = categorical_labels

In [None]:
#### 4. Handling Duplicates
**Identifying and Removing Duplicates**

In [None]:
# Rows that repeat an earlier row (the first occurrence is not flagged)
duplicate_mask = data.duplicated()
duplicates = data.loc[duplicate_mask]

# Keep only the first occurrence of each row
data = data.drop_duplicates()

# Preview the de-duplicated dataframe
data.head()

In [None]:
#### 5. Handling Outliers
**Detecting and Managing Outliers**

In [None]:
# Detect outliers using Z-score
Z_THRESHOLD = 3  # |z| above this is treated as an outlier
column_mean = data['numeric_column'].mean()
column_std = data['numeric_column'].std()
z_scores = (data['numeric_column'] - column_mean) / column_std
is_outlier = z_scores.abs() > Z_THRESHOLD

# Keep the flagged rows (with their z-score) for inspection
outliers = data[is_outlier].assign(z_score=z_scores[is_outlier])

# Managing outliers by capping: values beyond mean +/- Z_THRESHOLD * std are
# clipped to that boundary. (Replacing them with the mean is not capping and
# pulls extreme observations into the centre of the distribution.)
# The z-scores stay in a local Series, so no temporary column has to be added
# to `data` and dropped again.
data['numeric_column'] = data['numeric_column'].clip(
    lower=column_mean - Z_THRESHOLD * column_std,
    upper=column_mean + Z_THRESHOLD * column_std,
)

In [None]:
#### 6. Feature Engineering
**Creating Interaction Features and Polynomial Features**

In [None]:
# Interaction feature: element-wise product of two existing features
data['interaction_feature'] = data['feature1'].mul(data['feature2'])

# Polynomial feature: square of an existing feature
data['feature_squared'] = data['feature'].pow(2)

In [None]:
#### 7. Scaling and Normalization
**Scaling and Normalizing Features**

In [None]:
# Both scalers are fitted on the same single-column frame
feature_values = data[['feature']]

# Standard Scaling
scaler = StandardScaler().fit(feature_values)
data['scaled_feature'] = scaler.transform(feature_values)

# Min-Max Scaling
scaler = MinMaxScaler().fit(feature_values)
data['normalized_feature'] = scaler.transform(feature_values)

In [None]:

#### 8. Feature Selection and Dimensionality Reduction

**Feature Selection and PCA**

In [None]:
# Feature Selection using ANOVA F-test
# f_classif needs a numeric matrix, so keep only numeric/boolean predictors.
# This leaves out the datetime 'date' and string 'category' columns created
# earlier, which would otherwise make the fit fail.
# NOTE(review): any NaN left in these columns will still be rejected — confirm
# the imputation step covers every numeric column.
features = data.drop(columns=['target']).select_dtypes(include=['number', 'bool'])

# Never ask for more features than are available
selector = SelectKBest(score_func=f_classif, k=min(5, features.shape[1]))
selected_features = selector.fit_transform(features, data['target'])

# Dimensionality Reduction using PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(selected_features)

# Add principal components to the dataframe
data['PC1'] = principal_components[:, 0]
data['PC2'] = principal_components[:, 1]

# Display the dataframe with principal components
data.head()
