In [1]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



In [2]:
import pandas as pd
import numpy as np

# 1. Load the data
# Let's create a sample CSV file for demonstration
data = {'col1': [1, 2, np.nan, 4, 5],
        'col2': ['a', np.nan, 'c', 'd', 'e'],
        'col3': [10.5, 20.3, 15.2, np.nan, 12.1]}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)

df = pd.read_csv('data.csv')

# 2. Check for missing values
missing_values = df.isnull()
print("DataFrame with boolean indicating missing values:\n", missing_values)

# 3. Summarize missing data
missing_counts = missing_values.sum()
print("\nNumber of missing values per column:\n", missing_counts)

DataFrame with boolean indicating missing values:
     col1   col2   col3
0  False  False  False
1  False   True  False
2   True  False  False
3  False  False   True
4  False  False  False

Number of missing values per column:
 col1    1
col2    1
col3    1
dtype: int64


In [3]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.

In [4]:
import pandas as pd
import numpy as np

# Let's reload the data to have the missing values again
data = {'col1': [1, 2, np.nan, 4, 5],
        'col2': ['a', np.nan, 'c', 'd', 'e'],
        'col3': [10.5, 20.3, 15.2, np.nan, 12.1]}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
df = pd.read_csv('data.csv')

print("Original DataFrame:\n", df)

# 1. Use dropna() method
df_cleaned = df.dropna()

print("\nDataFrame after dropping rows with missing values:\n", df_cleaned)

Original DataFrame:
    col1 col2  col3
0   1.0    a  10.5
1   2.0  NaN  20.3
2   NaN    c  15.2
3   4.0    d   NaN
4   5.0    e  12.1

DataFrame after dropping rows with missing values:
    col1 col2  col3
0   1.0    a  10.5
4   5.0    e  12.1


In [5]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.



In [6]:
import pandas as pd
import numpy as np

# Let's reload the data to have the missing values again
data = {'col1': [1, 2, np.nan, 4, 5],
        'col2': ['a', np.nan, 'c', 'd', 'e'],
        'col3': [10.5, 20.3, 15.2, np.nan, 12.1],
        'col4': [True, False, True, True, False]}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
df = pd.read_csv('data.csv')

print("Original DataFrame:\n", df)

# 1. Use dropna() with axis parameter
df_cleaned_cols = df.dropna(axis=1)

print("\nDataFrame after dropping columns with missing values:\n", df_cleaned_cols)

Original DataFrame:
    col1 col2  col3   col4
0   1.0    a  10.5   True
1   2.0  NaN  20.3  False
2   NaN    c  15.2   True
3   4.0    d   NaN   True
4   5.0    e  12.1  False

DataFrame after dropping columns with missing values:
     col4
0   True
1  False
2   True
3   True
4  False


In [7]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.



In [8]:
import pandas as pd
import numpy as np

# Let's reload the data, focusing on the numerical columns
data = {'col1': [1, 2, np.nan, 4, 5],
        'col2': ['a', 'b', 'c', 'd', 'e'],
        'col3': [10.5, 20.3, 15.2, np.nan, 12.1]}
df = pd.DataFrame(data)
df.to_csv('data.csv', index=False)
df = pd.read_csv('data.csv')

print("Original DataFrame:\n", df)

# 1. Calculate mean and fill NA for 'col1'
mean_col1 = df['col1'].mean()
df['col1'].fillna(mean_col1, inplace=True)
print("\nDataFrame after mean imputation for 'col1':\n", df)

# Let's do the same for 'col3'
mean_col3 = df['col3'].mean()
df['col3'].fillna(mean_col3, inplace=True)
print("\nDataFrame after mean imputation for 'col1' and 'col3':\n", df)

Original DataFrame:
    col1 col2  col3
0   1.0    a  10.5
1   2.0    b  20.3
2   NaN    c  15.2
3   4.0    d   NaN
4   5.0    e  12.1

DataFrame after mean imputation for 'col1':
    col1 col2  col3
0   1.0    a  10.5
1   2.0    b  20.3
2   3.0    c  15.2
3   4.0    d   NaN
4   5.0    e  12.1

DataFrame after mean imputation for 'col1' and 'col3':
    col1 col2    col3
0   1.0    a  10.500
1   2.0    b  20.300
2   3.0    c  15.200
3   4.0    d  14.525
4   5.0    e  12.100


In [9]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.



In [10]:
import pandas as pd
import numpy as np

# Let's create a DataFrame with a categorical column containing missing values
data = {'category': ['A', 'B', np.nan, 'A', 'C', 'B', 'A', np.nan],
        'value': [10, 20, 30, 40, 50, 60, 70, 80]}
df = pd.DataFrame(data)
df.to_csv('categorical_data.csv', index=False)
df = pd.read_csv('categorical_data.csv')

print("Original DataFrame:\n", df)

# 1. Calculate mode and fill NA for the 'category' column
mode_category = df['category'].mode()[0]
df['category'].fillna(mode_category, inplace=True)

print("\nDataFrame after mode imputation for 'category':\n", df)

Original DataFrame:
   category  value
0        A     10
1        B     20
2      NaN     30
3        A     40
4        C     50
5        B     60
6        A     70
7      NaN     80

DataFrame after mode imputation for 'category':
   category  value
0        A     10
1        B     20
2        A     30
3        A     40
4        C     50
5        B     60
6        A     70
7        A     80


In [11]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.



In [12]:
import pandas as pd
import numpy as np

# Let's create a DataFrame with a skewed numerical column containing missing values
data = {'skewed_col': [10, 15, 12, np.nan, 8, 50, 11, np.nan, 13, 16],
        'other_col': ['p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']}
df = pd.DataFrame(data)
df.to_csv('skewed_data.csv', index=False)
df = pd.read_csv('skewed_data.csv')

print("Original DataFrame:\n", df)

# 1. Calculate median and fill NA for the 'skewed_col'
median_skewed = df['skewed_col'].median()
df['skewed_col'].fillna(median_skewed, inplace=True)

print("\nDataFrame after median imputation for 'skewed_col':\n", df)

Original DataFrame:
    skewed_col other_col
0        10.0         p
1        15.0         q
2        12.0         r
3         NaN         s
4         8.0         t
5        50.0         u
6        11.0         v
7         NaN         w
8        13.0         x
9        16.0         y

DataFrame after median imputation for 'skewed_col':
    skewed_col other_col
0        10.0         p
1        15.0         q
2        12.0         r
3        12.5         s
4         8.0         t
5        50.0         u
6        11.0         v
7        12.5         w
8        13.0         x
9        16.0         y


In [13]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install sklearn if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.



In [14]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# 1. Install and import required libraries (you might have already done this)
# pip install scikit-learn

# Let's create a DataFrame with missing values
data = {'feature1': [1, 2, np.nan, 4, 5, np.nan],
        'feature2': [6, np.nan, 8, 9, 10, 11],
        'feature3': [12.5, 13.1, 14.7, np.nan, 16.2, 17.9]}
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# 2. KNN Imputer
imputer = KNNImputer(n_neighbors=2)  # You can adjust the number of neighbors
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print("\nDataFrame after KNN Imputation:\n", df_imputed)

Original DataFrame:
    feature1  feature2  feature3
0       1.0       6.0      12.5
1       2.0       NaN      13.1
2       NaN       8.0      14.7
3       4.0       9.0       NaN
4       5.0      10.0      16.2
5       NaN      11.0      17.9

DataFrame after KNN Imputation:
    feature1  feature2  feature3
0       1.0       6.0     12.50
1       2.0       7.0     13.10
2       3.0       8.0     14.70
3       4.0       9.0     15.45
4       5.0      10.0     16.20
5       4.5      11.0     17.90


In [15]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.



In [16]:
import pandas as pd
import numpy as np

# Let's create a DataFrame with a categorical column containing missing values
data = {'category': ['A', 'B', np.nan, 'A', 'C', 'B', 'A', np.nan, 'B', 'D', 'A', 'B'],
        'value': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120]}
df = pd.DataFrame(data)

# 1. Identify missing values in categorical data
missing_category = df['category'].isnull()
print("Missing values in 'category' column:\n", missing_category)
print("\nNumber of missing values:", missing_category.sum())

# 2. Impute with the next frequent category
category_counts = df['category'].value_counts()
print("\nFrequency of each category:\n", category_counts)

# Identify the most frequent category (mode)
most_frequent_category = category_counts.index[0]
print("\nMost frequent category:", most_frequent_category)

# Identify the next most frequent category (if it exists)
next_frequent_category = category_counts.index[1] if len(category_counts) > 1 else None
print("Next most frequent category:", next_frequent_category)

if next_frequent_category:
    df['category'].fillna(next_frequent_category, inplace=True)
    print("\nDataFrame after imputing with the next frequent category ('{}'):\n".format(next_frequent_category), df)
else:
    print("\nOnly one category exists, so imputing with the most frequent ('{}'):\n".format(most_frequent_category), df)
    df['category'].fillna(most_frequent_category, inplace=True)

Missing values in 'category' column:
 0     False
1     False
2      True
3     False
4     False
5     False
6     False
7      True
8     False
9     False
10    False
11    False
Name: category, dtype: bool

Number of missing values: 2

Frequency of each category:
 category
A    4
B    4
C    1
D    1
Name: count, dtype: int64

Most frequent category: A
Next most frequent category: B

DataFrame after imputing with the next frequent category ('B'):
    category  value
0         A     10
1         B     20
2         B     30
3         A     40
4         C     50
5         B     60
6         A     70
7         B     80
8         B     90
9         D    100
10        A    110
11        B    120


In [17]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.




In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Let's create a DataFrame with some missing values in a numerical column ('target_feature')
data = {'feature1': [1, 2, 3, 4, 5, 6, np.nan, 8, 9, 10],
        'feature2': [11, np.nan, 13, 14, 15, 16, 17, np.nan, 19, 20],
        'target_feature': [21, 22, np.nan, 24, 25, 26, 27, np.nan, 29, 30]}
df = pd.DataFrame(data)

# Identify rows with missing values in the 'target_feature'
missing_target = df[df['target_feature'].isnull()]
non_missing_target = df[df['target_feature'].notnull()]

print("Rows with missing 'target_feature':\n", missing_target)
print("\nRows with non-missing 'target_feature':\n", non_missing_target)

# 1. Partition the data
# Use the non-missing rows as the training set and the missing rows as the test set for prediction
X_train = non_missing_target[['feature1', 'feature2']]
y_train = non_missing_target['target_feature']
X_test = missing_target[['feature1', 'feature2']]

# 2. Train a model (using Linear Regression as an example)
model = LinearRegression()
model.fit(X_train, y_train)

# 3. Impute missing values with predictions
predicted_values = model.predict(X_test)

# Create a copy of the original DataFrame to store the imputed values
df_imputed_predictive = df.copy()

# Fill the missing 'target_feature' values with the predictions
df_imputed_predictive.loc[df_imputed_predictive['target_feature'].isnull(), 'target_feature'] = predicted_values

print("\nDataFrame after predictive imputation for 'target_feature':\n", df_imputed_predictive)

# Evaluate the model (optional, but good practice if you have a way to validate)
# If you had held out a portion of the non-missing data as a validation set, you could evaluate the model's performance.
# For instance:
# X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# model_eval = LinearRegression()
# model_eval.fit(X_train_val, y_train_val)
# predictions_val = model_eval.predict(X_val)
# mse = mean_squared_error(y_val, predictions_val)
# print(f"\nMean Squared Error on the validation set: {mse}")

Rows with missing 'target_feature':
    feature1  feature2  target_feature
2       3.0      13.0             NaN
7       8.0       NaN             NaN

Rows with non-missing 'target_feature':
    feature1  feature2  target_feature
0       1.0      11.0            21.0
1       2.0       NaN            22.0
3       4.0      14.0            24.0
4       5.0      15.0            25.0
5       6.0      16.0            26.0
6       NaN      17.0            27.0
8       9.0      19.0            29.0
9      10.0      20.0            30.0


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use fillna() with method parameter: Apply ffill() and bfill() for forward and backward fill.



In [None]:
import pandas as pd
import numpy as np

# 1. Sort the data
# Let's create a sample time series DataFrame with missing values
data = {'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
                                   '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08']),
        'value': [10, np.nan, 12, np.nan, 15, 16, np.nan, 18]}
df_time_series = pd.DataFrame(data)

# Ensure the DataFrame is sorted by date (it already is in this example, but it's good practice to do explicitly)
df_time_series.sort_values(by='date', inplace=True)
df_time_series.set_index('date', inplace=True) # Set 'date' as index for time series operations

print("Original Time Series DataFrame:\n", df_time_series)

# 2. Use fillna() with method parameter

# Forward Fill (ffill): Propagates the last valid observation forward to the next missing value.
df_ffill = df_time_series.fillna(method='ffill')
print("\nTime Series DataFrame after Forward Fill (ffill):\n", df_ffill)

# Backward Fill (bfill): Propagates the next valid observation backward to the previous missing value.
df_bfill = df_time_series.fillna(method='bfill')
print("\nTime Series DataFrame after Backward Fill (bfill):\n", df_bfill)

# You can also combine them. For example, fill with ffill and then any remaining NaNs with bfill
df_combined_fill = df_time_series.fillna(method='ffill').fillna(method='bfill')
print("\nTime Series DataFrame after Combined Forward and Backward Fill:\n", df_combined_fill)