In [1]:
# Importance of Data Cleaning

# 1. Missing Values: Missing data points in a dataset can lead to biased results.
#     Task 1: Load a dataset and identify which columns have missing values.
#     Task 2: Replace missing values in a dataset with the column mean or mode.
#     Task 3: Compare model performance with and without handling missing values.
    





In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# 1. Missing Values: Missing data points in a dataset can lead to biased results.

# Task 1: Load a dataset and identify which columns have missing values.
print("\nTask 1: Load a dataset and identify which columns have missing values.")

# Create a sample dataset with missing values
data = {'Feature_A': [10, 12, np.nan, 15, 18, np.nan, 22, 25],
        'Feature_B': [5, np.nan, 7, 9, 11, 13, np.nan, 17],
        'Feature_C': [1, 2, 3, np.nan, 5, 6, 7, np.nan],
        'Target': [20, 25, 30, 35, 40, 45, 50, 55]}
df = pd.DataFrame(data)

print("\nOriginal DataFrame:")
print(df)

# Identify columns with missing values
missing_values_count = df.isnull().sum()
print("\nNumber of missing values per column:")
print(missing_values_count)

columns_with_missing = missing_values_count[missing_values_count > 0].index.tolist()
print("\nColumns with missing values:")
print(columns_with_missing)

# Task 2: Replace missing values in a dataset with the column mean or mode.
print("\n\nTask 2: Replace missing values in a dataset with the column mean or mode.")

df_imputed_mean = df.copy()
df_imputed_mode = df.copy()

# Impute missing numerical values with the mean
numerical_cols = df.select_dtypes(include=np.number).columns
for col in numerical_cols:
    df_imputed_mean[col].fillna(df_imputed_mean[col].mean(), inplace=True)

print("\nDataFrame with missing numerical values imputed with the mean:")
print(df_imputed_mean)

# Impute missing values with the mode (for all columns in this simple example)
# Be cautious: mode is more appropriate for categorical data, but we'll demonstrate
for col in df_imputed_mode.columns:
    if df_imputed_mode[col].isnull().any():
        # Calculate mode (handles multiple modes by taking the first)
        mode_value = df_imputed_mode[col].mode()[0]
        df_imputed_mode[col].fillna(mode_value, inplace=True)

print("\nDataFrame with missing values imputed with the mode:")
print(df_imputed_mode)

# Task 3: Compare model performance with and without handling missing values.
print("\n\nTask 3: Compare model performance with and without handling missing values.")

# Prepare data for modeling

# Option 1: Model with rows containing missing values removed
df_dropna = df.dropna().copy()
X_dropna = df_dropna[['Feature_A', 'Feature_B', 'Feature_C']]
y_dropna = df_dropna['Target']
X_train_dropna, X_test_dropna, y_train_dropna, y_test_dropna = train_test_split(X_dropna, y_dropna, test_size=0.3, random_state=42)

model_dropna = LinearRegression()
if not X_train_dropna.empty:
    model_dropna.fit(X_train_dropna, y_train_dropna)
    if not X_test_dropna.empty:
        y_pred_dropna = model_dropna.predict(X_test_dropna)
        mse_dropna = mean_squared_error(y_test_dropna, y_pred_dropna)
        print(f"\nModel performance (missing values dropped): Mean Squared Error = {mse_dropna:.2f}")
    else:
        print("\nNot enough data to test model after dropping missing values.")
else:
    print("\nNot enough data to train model after dropping missing values.")

# Option 2: Model with missing values imputed (using mean imputation)
X_imputed_mean = df_imputed_mean[['Feature_A', 'Feature_B', 'Feature_C']]
y_imputed_mean = df_imputed_mean['Target']
X_train_imputed_mean, X_test_imputed_mean, y_train_imputed_mean, y_test_imputed_mean = train_test_split(X_imputed_mean, y_imputed_mean, test_size=0.3, random_state=42)

model_imputed_mean = LinearRegression()
model_imputed_mean.fit(X_train_imputed_mean, y_train_imputed_mean)
y_pred_imputed_mean = model_imputed_mean.predict(X_test_imputed_mean)
mse_imputed_mean = mean_squared_error(y_test_imputed_mean, y_pred_imputed_mean)
print(f"Model performance (missing values imputed with mean): Mean Squared Error = {mse_imputed_mean:.2f}")

# Option 3: Model with missing values imputed (using mode imputation)
X_imputed_mode = df_imputed_mode[['Feature_A', 'Feature_B', 'Feature_C']]
y_imputed_mode = df_imputed_mode['Target']
X_train_imputed_mode, X_test_imputed_mode, y_train_imputed_mode, y_test_imputed_mode = train_test_split(X_imputed_mode, y_imputed_mode, test_size=0.3, random_state=42)

model_imputed_mode = LinearRegression()
model_imputed_mode.fit(X_train_imputed_mode, y_train_imputed_mode)
y_pred_imputed_mode = model_imputed_mode.predict(X_test_imputed_mode)
mse_imputed_mode = mean_squared_error(y_test_imputed_mode, y_pred_imputed_mode)
print(f"Model performance (missing values imputed with mode): Mean Squared Error = {mse_imputed_mode:.2f}")

# Option 4: Model using a more sophisticated imputation technique (SimpleImputer with median)
df_imputed_median = df.copy()
imputer = SimpleImputer(strategy='median')
df_imputed_median[['Feature_A', 'Feature_B', 'Feature_C']] = imputer.fit_transform(df_imputed_median[['Feature_A', 'Feature_B', 'Feature_C']])
X_imputed_median = df_imputed_median[['Feature_A', 'Feature_B', 'Feature_C']]
y_imputed_median = df_imputed_median['Target']
X_train_imputed_median, X_test_imputed_median, y_train_imputed_median, y_test_imputed_median = train_test_split(X_imputed_median, y_imputed_median, test_size=0.3, random_state=42)

model_imputed_median = LinearRegression()
model_imputed_median.fit(X_train_imputed_median, y_train_imputed_median)
y_pred_imputed_median = model_imputed_median.predict(X_test_imputed_median)
mse_imputed_median = mean_squared_error(y_test_imputed_median, y_pred_imputed_median)
print(f"Model performance (missing values imputed with median): Mean Squared Error = {mse_imputed_median:.2f}")


Task 1: Load a dataset and identify which columns have missing values.

Original DataFrame:
   Feature_A  Feature_B  Feature_C  Target
0       10.0        5.0        1.0      20
1       12.0        NaN        2.0      25
2        NaN        7.0        3.0      30
3       15.0        9.0        NaN      35
4       18.0       11.0        5.0      40
5        NaN       13.0        6.0      45
6       22.0        NaN        7.0      50
7       25.0       17.0        NaN      55

Number of missing values per column:
Feature_A    2
Feature_B    2
Feature_C    2
Target       0
dtype: int64

Columns with missing values:
['Feature_A', 'Feature_B', 'Feature_C']


Task 2: Replace missing values in a dataset with the column mean or mode.

DataFrame with missing numerical values imputed with the mean:
   Feature_A  Feature_B  Feature_C  Target
0       10.0   5.000000        1.0      20
1       12.0  10.333333        2.0      25
2       17.0   7.000000        3.0      30
3       15.0   9.000000    

In [3]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
    
    

In [4]:
import pandas as pd

# 2. Duplicate Data: Repeated data points can skew analysis and model results.

# Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
print("\nTask 1: Identify and remove duplicate entries from a dataset.")

# Create a sample dataset with duplicate rows
data = {'ID': [1, 2, 3, 4, 2, 5, 1, 6],
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Bob', 'Eve', 'Alice', 'Frank'],
        'Age': [25, 30, 35, 28, 30, 22, 25, 40],
        'City': ['New York', 'London', 'Paris', 'London', 'London', 'Tokyo', 'New York', 'Berlin']}
df = pd.DataFrame(data)

print("\nOriginal DataFrame:")
print(df)

# Identify duplicate rows
duplicate_rows = df[df.duplicated()]
print("\nDuplicate rows:")
print(duplicate_rows)

# Remove duplicate rows, keeping the first occurrence by default
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after removing duplicate rows:")
print(df_no_duplicates)

# Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
print("\n\nTask 2: Document the before-and-after dataset shape.")

print(f"\nShape of the original DataFrame: {df.shape}")
print(f"Shape of the DataFrame after removing duplicates: {df_no_duplicates.shape}")

rows_removed = df.shape[0] - df_no_duplicates.shape[0]
print(f"\nNumber of duplicate rows removed: {rows_removed}")

# Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
print("\n\nTask 3: Explain how duplicate data can affect prediction accuracy.")

explanation = """
Hey [Classmate's Name],

Let's talk about how duplicate data can mess with our machine learning models and their prediction accuracy. Imagine we're trying to train a model to predict someone's age based on their name and city.

If our training data has many identical entries for the same person (same name, same city, same age), the model might get an overinflated sense of how common that specific combination is. It might learn to heavily favor the age associated with those repeated entries, even if that combination isn't actually that frequent in the real world. This can lead to:

1. **Bias:** The model can become biased towards the features and target variable of the duplicated data. It might perform very well on data that looks exactly like the duplicates but poorly on slightly different, yet valid, data points.

2. **Overfitting:** If the duplicates are essentially the same training example repeated multiple times, the model might start to 'memorize' these specific examples rather than learning the underlying patterns in the data. This overfitting means the model performs well on the training data (including the duplicates) but fails to generalize to new, unseen data.

3. **Skewed Evaluation Metrics:** When we evaluate our model, if our test set also contains duplicates that were present in the training data, our performance metrics (like accuracy, precision, recall, or MSE) might appear artificially high. The model is essentially being tested on data it has seen multiple times before.

4. **Inefficient Training:** Training on duplicate data is computationally wasteful. The model isn't learning any new information from the repeated entries, but it's still processing them, increasing training time and resource usage.

In short, duplicate data can mislead our models into learning spurious correlations, overfit to the training set, and give us a false sense of good performance. Removing duplicates ensures that each data point contributes uniquely to the learning process, leading to more robust and accurate predictions on unseen data.

So, it's really important to identify and handle duplicates during our data cleaning process!
"""

print(explanation)


Task 1: Identify and remove duplicate entries from a dataset.

Original DataFrame:
   ID     Name  Age      City
0   1    Alice   25  New York
1   2      Bob   30    London
2   3  Charlie   35     Paris
3   4    David   28    London
4   2      Bob   30    London
5   5      Eve   22     Tokyo
6   1    Alice   25  New York
7   6    Frank   40    Berlin

Duplicate rows:
   ID   Name  Age      City
4   2    Bob   30    London
6   1  Alice   25  New York

DataFrame after removing duplicate rows:
   ID     Name  Age      City
0   1    Alice   25  New York
1   2      Bob   30    London
2   3  Charlie   35     Paris
3   4    David   28    London
5   5      Eve   22     Tokyo
7   6    Frank   40    Berlin


Task 2: Document the before-and-after dataset shape.

Shape of the original DataFrame: (8, 4)
Shape of the DataFrame after removing duplicates: (6, 4)

Number of duplicate rows removed: 2


Task 3: Explain how duplicate data can affect prediction accuracy.

Hey [Classmate's Name],

Let's ta

In [5]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
    
    
    

In [6]:
import pandas as pd
import numpy as np

# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.

# Task 1: Convert a column of string numbers to integers in a dataset.
print("\nTask 1: Convert a column of string numbers to integers.")

# Sample DataFrame with 'Age' as strings
data_strings = {'ID': [1, 2, 3, 4, 5],
                'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
                'Age': ['25', '30', '35', '28', '40']}
df_strings = pd.DataFrame(data_strings)

print("\nOriginal DataFrame (Age as strings):")
print(df_strings)
print(f"Data type of 'Age' column: {df_strings['Age'].dtype}")

# Convert 'Age' column to integers
df_strings['Age'] = pd.to_numeric(df_strings['Age'], errors='raise', downcast='integer')

print("\nDataFrame after converting 'Age' to integers:")
print(df_strings)
print(f"Data type of 'Age' column: {df_strings['Age'].dtype}")

# Task 2: Identify and correct columns with inconsistent data types in a dataset.
print("\n\nTask 2: Identify and correct columns with inconsistent data types.")

# Sample DataFrame with inconsistent data types
data_inconsistent = {'ID': [1, 2, 3, 4, 5],
                     'Price': ['10.50', '20', '15.75', '30.0', 25],
                     'Quantity': ['5', 10, '7', '12.0', 8],
                     'Active': ['True', 'False', 'TRUE', 'false', True]}
df_inconsistent = pd.DataFrame(data_inconsistent)

print("\nOriginal DataFrame (with inconsistent data types):")
print(df_inconsistent)
print("\nData types of each column:")
print(df_inconsistent.dtypes)

# Correct 'Price' to float
df_inconsistent['Price'] = pd.to_numeric(df_inconsistent['Price'], errors='coerce')
print("\n'Price' column after conversion to float:")
print(df_inconsistent['Price'])
print(f"Data type of 'Price' column: {df_inconsistent['Price'].dtype}")

# Correct 'Quantity' to integer (handling potential floats)
df_inconsistent['Quantity'] = pd.to_numeric(df_inconsistent['Quantity'], errors='coerce').astype('Int64')
print("\n'Quantity' column after conversion to integer:")
print(df_inconsistent['Quantity'])
print(f"Data type of 'Quantity' column: {df_inconsistent['Quantity'].dtype}")

# Correct 'Active' to boolean
df_inconsistent['Active'] = df_inconsistent['Active'].astype('bool')
print("\n'Active' column after conversion to boolean:")
print(df_inconsistent['Active'])
print(f"Data type of 'Active' column: {df_inconsistent['Active'].dtype}")

print("\nDataFrame after correcting inconsistent data types:")
print(df_inconsistent)
print("\nCorrected data types of each column:")
print(df_inconsistent.dtypes)

# Task 3: Discuss why correct data types are critical for feature engineering.
print("\n\nTask 3: Discuss why correct data types are critical for feature engineering.")

discussion = """
Correct data types are absolutely fundamental for effective feature engineering. Here's why:

1. **Mathematical Operations:** Many feature engineering techniques involve mathematical operations between columns (e.g., addition, subtraction, multiplication, division). If a column containing numerical data is stored as a string, these operations will either fail or produce incorrect results (e.g., string concatenation instead of addition). Converting these columns to the appropriate numeric type (integer or float) is essential to perform calculations correctly.

2. **Categorical Encoding:** When dealing with categorical features, we often need to encode them into numerical representations that machine learning models can understand (e.g., one-hot encoding, label encoding). These encoding techniques rely on the column being recognized as a categorical type (e.g., 'object' or 'category' in Pandas). If a categorical column is mistakenly identified as a numeric type, encoding will not be applied correctly, leading to errors or suboptimal features. Conversely, trying to encode a numerical column as categorical would also be inappropriate.

3. **Date and Time Features:** Feature engineering with date and time data often involves extracting components like day of the week, month, year, hour, time differences, etc. These operations are only possible when the column is correctly recognized as a datetime object in Pandas. If a date or time column is stored as a string, we first need to convert it to datetime using functions like `pd.to_datetime()` before we can extract these meaningful features.

4. **Boolean Logic:** Features representing True/False conditions are often used in feature engineering. Ensuring these are stored as boolean data types allows for efficient and correct logical operations and can be crucial for creating indicator variables or applying conditional logic.

5. **Type-Specific Functions and Methods:** Pandas and other libraries provide type-specific functions and methods for data manipulation and feature creation. For example, the `.dt` accessor for datetime columns offers a wide range of functionalities for extracting date and time components. Similarly, string-specific methods (`.str`) are available for text data. If the data types are incorrect, these powerful tools cannot be used effectively.

6. **Avoiding Unexpected Behavior and Errors:** Incorrect data types can lead to subtle errors in feature engineering that are hard to debug. For instance, comparing a string '10' with an integer 10 might yield unexpected results. Ensuring consistent and correct data types helps prevent such issues and leads to more reliable and predictable feature engineering processes.

In summary, having the correct data types ensures that we can apply the appropriate feature engineering techniques, perform calculations accurately, leverage type-specific functionalities, and ultimately create meaningful and effective features that can improve the performance of our machine learning models and the accuracy of our data analysis."
"""

print(discussion)


Task 1: Convert a column of string numbers to integers.

Original DataFrame (Age as strings):
   ID     Name Age
0   1    Alice  25
1   2      Bob  30
2   3  Charlie  35
3   4    David  28
4   5      Eve  40
Data type of 'Age' column: object

DataFrame after converting 'Age' to integers:
   ID     Name  Age
0   1    Alice   25
1   2      Bob   30
2   3  Charlie   35
3   4    David   28
4   5      Eve   40
Data type of 'Age' column: int8


Task 2: Identify and correct columns with inconsistent data types.

Original DataFrame (with inconsistent data types):
   ID  Price Quantity Active
0   1  10.50        5   True
1   2     20       10  False
2   3  15.75        7   TRUE
3   4   30.0     12.0  false
4   5     25        8   True

Data types of each column:
ID           int64
Price       object
Quantity    object
Active      object
dtype: object

'Price' column after conversion to float:
0    10.50
1    20.00
2    15.75
3    30.00
4    25.00
Name: Price, dtype: float64
Data type of 'Price

In [7]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
    
    

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.

# Task 1: Visualize a dataset and identify outliers using a boxplot.
print("\nTask 1: Visualize a dataset and identify outliers using a boxplot.")

# Create a sample dataset with outliers
np.random.seed(42)
data_outliers = {'Feature_X': np.concatenate([np.random.normal(loc=20, scale=5, size=100),
                                             np.random.normal(loc=50, scale=10, size=10),
                                             [-10, 60, 65]])}
df_outliers = pd.DataFrame(data_outliers)

print("\nOriginal DataFrame with potential outliers:")
print(df_outliers.head())

# Visualize using a boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(y=df_outliers['Feature_X'])
plt.title('Boxplot of Feature_X (Identifying Outliers)')
plt.ylabel('Feature_X')
plt.show()

# Identify outliers based on IQR (Interquartile Range)
Q1 = df_outliers['Feature_X'].quantile(0.25)
Q3 = df_outliers['Feature_X'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers_iqr = df_outliers[(df_outliers['Feature_X'] < lower_bound) | (df_outliers['Feature_X'] > upper_bound)]
print(f"\nOutliers identified using IQR method:")
print(outliers_iqr)

# Task 2: Remove or adjust outliers and re-analyze the dataset.
print("\n\nTask 2: Remove or adjust outliers and re-analyze the dataset.")

# Option 1: Remove outliers
df_no_outliers = df_outliers[(df_outliers['Feature_X'] >= lower_bound) & (df_outliers['Feature_X'] <= upper_bound)].copy()
print("\nDataFrame after removing outliers (based on IQR):")
print(df_no_outliers.head())

# Re-analyze (e.g., visualize the distribution again)
plt.figure(figsize=(8, 6))
sns.boxplot(y=df_no_outliers['Feature_X'])
plt.title('Boxplot of Feature_X (Outliers Removed)')
plt.ylabel('Feature_X')
plt.show()

plt.figure(figsize=(8, 6))
sns.histplot(df_no_outliers['Feature_X'], kde=True)
plt.title('Distribution of Feature_X (Outliers Removed)')
plt.xlabel('Feature_X')
plt.ylabel('Frequency')
plt.show()

# Option 2: Adjust outliers (Capping/Winsorizing)
df_capped = df_outliers.copy()
df_capped['Feature_X_Capped'] = np.where(df_capped['Feature_X'] < lower_bound, lower_bound,
                                        np.where(df_capped['Feature_X'] > upper_bound, upper_bound,
                                                 df_capped['Feature_X']))

print("\nDataFrame with outliers capped (based on IQR):")
print(df_capped.head())

# Re-analyze the capped data
plt.figure(figsize=(8, 6))
sns.boxplot(y=df_capped['Feature_X_Capped'])
plt.title('Boxplot of Feature_X (Outliers Capped)')
plt.ylabel('Feature_X_Capped')
plt.show()

plt.figure(figsize=(8, 6))
sns.histplot(df_capped['Feature_X_Capped'], kde=True)
plt.title('Distribution of Feature_X (Outliers Capped)')
plt.xlabel('Feature_X_Capped')
plt.ylabel('Frequency')
plt.show()

# Task 3: Research and report on a technique for handling outliers effectively.
print("\n\nTask 3: Research and report on a technique for handling outliers effectively.")

outlier_handling_report = """
**Technique for Handling Outliers Effectively: Robust Scaling**

**Description:**
Robust scaling is a technique used to standardize numerical features by removing the median and scaling the data according to the Interquartile Range (IQR). The IQR is the range between the first quartile (Q1) and the third quartile (Q3). This method is less sensitive to outliers compared to standard scaling (which uses mean and standard deviation) because the median and IQR are robust to extreme values.

**Formula:**
The RobustScaler scales data using the following formula:

$$
X_{scaled} = \frac{X - median(X)}{IQR(X)}
$$

Where:
- $X$ is the original data point.
- $median(X)$ is the median of the feature.
- $IQR(X) = Q3(X) - Q1(X)$ is the interquartile range of the feature.

**Why it's effective for handling outliers:**

1.  **Robust to Outliers:** Unlike mean and standard deviation, the median and IQR are not significantly affected by the presence of outliers. This means that the scaling is driven by the central portion of the data distribution, and outliers do not have a disproportionate influence on the scaling.

2.  **Preserves the Spread:** While it scales the data, robust scaling maintains the relative spread and distribution of the non-outlier data points. It focuses on making the bulk of the data have a similar scale.

3.  **Useful for Algorithms Sensitive to Feature Scale:** Many machine learning algorithms (e.g., distance-based algorithms like KNN, SVM, and neural networks) are sensitive to the scale of input features. Robust scaling can help to ensure that features with outliers do not dominate the distance calculations or model learning process.

**When to use Robust Scaling:**

- When your data contains significant outliers.
- When you want to scale your data but are concerned that outliers might skew the scaling if you use StandardScaler.
- When the underlying distribution of your data is not necessarily Gaussian.

**Implementation in Python (using scikit-learn):**

```python
from sklearn.preprocessing import RobustScaler
import numpy as np

data = np.array([[-1.0], [0.0], [1.0], [2.0], [100.0]])
scaler = RobustScaler()
scaled_data = scaler.fit_transform(data)
print(f"Original data: {data.flatten()}")
print(f"Robustly scaled data: {scaled_data.flatten()}")

SyntaxError: incomplete input (1240372205.py, line 88)