# How to deal with missing values?

### 1. Remove rows/column:

In [None]:
# Remove rows with any missing values
df_cleaned_rows = df.dropna()

# Remove columns with any missing values
df_cleaned_columns = df.dropna(axis=1)

### 3. Imputation:
Replace missing values with a suitable estimate. For numerical data, you might use the mean, median, or mode. For categorical data, you might use the most frequent category.

In [None]:
# Impute missing values with the mean
df_imputed = df.fillna(df.mean())

### 3. Forward or Backward Fill:

For time series data, you might consider using forward fill (`ffill()`) or backward fill (`bfill()`) to fill missing values based on the previous or next observation.

In [None]:
# Forward fill missing values
df_forward_filled = df.ffill()

# Backward fill missing values
df_backward_filled = df.bfill()

### 4. Interpolation:
Interpolation methods can be used to estimate missing values based on the values present in the dataframe.

In [None]:
# Linear interpolation
df_interpolated = df.interpolate()

### 5. Custom Functions:
For more complex situations, you may implement custom functions to fill missing values based on specific business logic or domain knowledge.

In [None]:
# Custom function to fill missing values based on specific logic
def custom_fill(column):
    # Your custom logic here
    return filled_column

df['column_name'] = custom_fill(df['column_name'])

### 6. Handling Categorical Data:
For categorical data, you can replace missing values with a new category or use the most frequent category.

In [None]:
# Replace missing categorical values with a new category
df['categorical_column'].fillna('Unknown', inplace=True)

# Replace missing categorical values with the most frequent category
df['categorical_column'].fillna(df['categorical_column'].mode()[0], inplace=True)

### 7. Drop Irrelevant Columns:
If a column has a high percentage of missing values and is not relevant for analysis, consider dropping it.

In [None]:
# Drop a column with a high percentage of missing values
df.drop('column_name', axis=1, inplace=True)

### 8. Use Specialized Libraries:
Consider using specialized libraries like scikit-learn or fancyimpute for more advanced imputation techniques.

In [None]:
from sklearn.impute import SimpleImputer

# Impute missing values with the mean using scikit-learn
imputer = SimpleImputer(strategy='mean')
df_imputed_sklearn = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

## Example of each scenario:

In [3]:
import pandas as pd
import numpy as np

# Creating a sample dataframe with missing values
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': ['red', 'blue', np.nan, 'green', 'red'],
    'C': [0.1, np.nan, 0.3, 0.4, 0.5],
    'D': [10, 20, 30, np.nan, 50]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n")

# 1. Identify Missing Values
missing_values = df.isnull().sum()
print("Missing Values:")
print(missing_values)
print("\n")

# 2. Remove Rows or Columns
df_cleaned_rows = df.dropna()
print("DataFrame after removing rows with missing values:")
print(df_cleaned_rows)
print("\n")

df_cleaned_columns = df.dropna(axis=1)
print("DataFrame after removing columns with missing values:")
print(df_cleaned_columns)
print("\n")

# 3. Imputation
df_imputed = df.fillna(df.mean())
print("DataFrame after imputation with mean:")
print(df_imputed)
print("\n")

# 4. Forward or Backward Fill
df_forward_filled = df.ffill()
print("DataFrame after forward fill:")
print(df_forward_filled)
print("\n")

# 5. Interpolation
df_interpolated = df.interpolate()
print("DataFrame after linear interpolation:")
print(df_interpolated)
print("\n")

# 6. Custom Functions (Assuming a custom function that fills missing values with -1)
def custom_fill(column):
    return column.fillna(-1)

df['A'] = custom_fill(df['A'])
print("DataFrame after custom fill:")
print(df)
print("\n")

# 7. Handling Categorical Data
df['B'].fillna('Unknown', inplace=True)
print("DataFrame after filling missing categorical values with 'Unknown':")
print(df)
print("\n")

# 8. Drop Irrelevant Columns
df.drop('C', axis=1, inplace=True)
print("DataFrame after dropping column 'C':")
print(df)
print("\n")

# 9. Use Specialized Libraries (Scikit-learn) - Handling Numeric and Categorical Columns
from sklearn.impute import SimpleImputer

# Identify numeric and categorical columns
numeric_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include='object').columns

# Impute missing values in numeric columns with mean
numeric_imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])

# Impute missing values in categorical columns with most frequent
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# Display the imputed DataFrame
print("DataFrame after imputation using scikit-learn:")
print(df)

Original DataFrame:
     A      B    C     D
0  1.0    red  0.1  10.0
1  2.0   blue  NaN  20.0
2  NaN    NaN  0.3  30.0
3  4.0  green  0.4   NaN
4  5.0    red  0.5  50.0


Missing Values:
A    1
B    1
C    1
D    1
dtype: int64


DataFrame after removing rows with missing values:
     A    B    C     D
0  1.0  red  0.1  10.0
4  5.0  red  0.5  50.0


DataFrame after removing columns with missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


DataFrame after imputation with mean:
     A      B      C     D
0  1.0    red  0.100  10.0
1  2.0   blue  0.325  20.0
2  3.0    NaN  0.300  30.0
3  4.0  green  0.400  27.5
4  5.0    red  0.500  50.0


DataFrame after forward fill:
     A      B    C     D
0  1.0    red  0.1  10.0
1  2.0   blue  0.1  20.0
2  2.0   blue  0.3  30.0
3  4.0  green  0.4  30.0
4  5.0    red  0.5  50.0


DataFrame after linear interpolation:
     A      B    C     D
0  1.0    red  0.1  10.0
1  2.0   blue  0.2  20.0
2  3.0    NaN  0.3  30.0
3  4.0  green  0.4

  df_imputed = df.fillna(df.mean())


# What is object datatype?

In the context of pandas, when a column in a DataFrame has the data type "object," it usually means that the column contains textual data or a mixture of different data types (e.g., strings, mixed types, or Python objects). The "object" data type is a catch-all for columns that don't fit into the more specific data types like int, float, or datetime.

You can leave the "object" datatype as it is if it suits your analysis or if the nature of the data in that column is inherently text-based or mixed. The "object" datatype is a generic and flexible datatype that can accommodate various types of data, including strings and mixed types.

However, it's essential to be aware of potential **limitations when working with "object" datatype columns:**

**Performance:** Operations on columns with "object" datatype might be slower than on columns with more specific datatypes like integers or floats. This is because pandas can optimize operations on homogeneous data types more effectively.

**Functionality:** Some pandas and NumPy operations may not work as expected on "object" datatypes. For example, mathematical operations are generally designed for numerical data types and may not apply directly to "object" columns.

**Memory Usage:** Columns with the "object" datatype can consume more memory than columns with specific datatypes, as each element in an "object" column is essentially a reference to a Python object.

**If the data in your "object" columns consists of text or mixed types, and you don't need to perform extensive numerical or date-based operations on these columns, leaving them as "object" might be perfectly fine**.

In [4]:
# changing datatype
df[['col1', 'col2']].astype("int")

data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': ['red', 'blue', np.nan, 'green', 'red'],
    'C': [0.1, np.nan, 0.3, 0.4, 0.5],
    'D': [10, 20, 30, np.nan, 50]
}

df = pd.DataFrame(data)

In [9]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': ['red', 'blue', np.nan, 'green', 'red'],
    'C': [0.1, np.nan, 0.3, 0.4, 0.5],
    'D': [10, 20, 30, np.nan, 50]
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1.0,red,0.1,10.0
1,2.0,blue,,20.0
2,,,0.3,30.0
3,4.0,green,0.4,
4,5.0,red,0.5,50.0


In [10]:
df.drop('A', axis=0, inplace=True)
df

KeyError: "['A'] not found in axis"