# Handling missing data

In [None]:
import pandas as pd

# Toy frame: each column contains exactly one missing entry (None becomes NaN)
data = {'A': [1, 2, None], 'B': [4, None, 6]}
df = pd.DataFrame(data)

# Strategy 1 - discard every row that has at least one NaN
df_dropped = df.dropna(axis=0, how='any')

# Strategy 2 - impute each NaN with the mean of its own column
column_means = df.mean()
df_filled = df.fillna(value=column_means)


# Encoding (one-hot and label encoding)

In [5]:
# Sample data: a single categorical column with three distinct values
data = {'Color': ['Red', 'Green', 'Blue']}
df = pd.DataFrame(data)

# One-hot encoding: one indicator column per category. drop_first=True omits
# the first category level ('Blue' here, since levels are sorted), so a row
# with every indicator off stands for 'Blue'.
df_encoded = pd.get_dummies(df, columns=['Color'], drop_first=True)

# Label encoding: map each category to an integer code. Codes follow the
# sorted category order (Blue=0, Green=1, Red=2). The result goes into a
# separate frame so `df` itself is left unchanged.
df_labelled = df.assign(Color_Label=df['Color'].astype('category').cat.codes)


# feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Sample data: three rows, two columns
data = [[1, 2], [2, 3], [3, 4]]

# Standardization: fit the scaler on the sample and transform it in one call
scaler_standard = StandardScaler()
data_standardized = scaler_standard.fit_transform(data)

# Min-max normalization: same fit-and-transform step with the other scaler
scaler_minmax = MinMaxScaler()
data_normalized = scaler_minmax.fit_transform(data)


# Outlier removal (IQR method)

In [7]:
# Sample data: four closely spaced values plus one extreme value (100)
data = {'Value': [10, 12, 14, 15, 100]}
df = pd.DataFrame(data)

# First and third quartiles and the interquartile range
values = df['Value']
Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
IQR = Q3 - Q1

# Fences: 1.5 * IQR below Q1 and above Q3
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows that lie inside the fences (both ends inclusive)
df_no_outliers = df[values.between(lower_fence, upper_fence)]


# data transformation (Log transformation)

In [8]:
import numpy as np

# Sample data: strictly positive values spanning several orders of magnitude
data = {'Value': [1, 10, 100]}

# Build the frame and append the natural log of 'Value' as 'Log_Value'
df = pd.DataFrame(data).assign(
    Log_Value=lambda frame: np.log(frame['Value'])
)


# Feature Engineering (train/test split)

In [9]:
from sklearn.model_selection import train_test_split

# Feature matrix (kept 2-D via the list selector) and target series
X, y = df[['Value']], df['Log_Value']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
