# Handling Missing Data

In [None]:
import pandas as pd
import numpy as np

# Build a small demo frame in which each column contains exactly one NaN
data = {'Age': [25, 30, np.nan, 35, 40],
        'Income': [50000, 60000, 70000, np.nan, 90000]}

df = pd.DataFrame(data)
print("Original Data:\n", df)

# Mean imputation: replace each NaN with the mean of its own column
# (Series.mean() skips NaN by default, so the mean uses the observed values).
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: that form mutates an intermediate Series,
# is deprecated as chained assignment, and leaves df unchanged when pandas
# Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Income'] = df['Income'].fillna(df['Income'].mean())

print("\nData After Imputation:\n", df)


# Detecting and Handling Outliers

In [None]:
import numpy as np
import pandas as pd

# Toy column: four evenly spaced values and one much larger value
data = [10, 20, 30, 40, 100]
df = pd.DataFrame(data, columns=['Value'])

# First and third quartiles, and the interquartile range between them
values = df['Value']
Q1 = values.quantile(0.25)
Q3 = values.quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier
margin = 1.5 * IQR
lower_bound = Q1 - margin
upper_bound = Q3 + margin

print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)

# Keep only rows whose value lies within the bounds (both ends inclusive)
within_bounds = values.between(lower_bound, upper_bound)
df_filtered = df[within_bounds]
print("\nFiltered Data (Outliers Removed):\n", df_filtered)


# Feature Scaling - Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# One sample feature laid out as a single column (4 rows x 1 column)
data = np.array([[2], [8], [5], [10]])

# Min-Max normalization: fit the scaler on the column, then rescale that
# same column with the fitted scaler
scaler = MinMaxScaler()
scaler.fit(data)
normalized_data = scaler.transform(data)

print("Normalized Data:", normalized_data)


# Feature Scaling - Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# One sample feature (10, 20, ..., 50) laid out as a single column
data = np.arange(10, 60, 10).reshape(-1, 1)

# Standardization (z-score): fit the scaler on the column, then transform
# that same column with the fitted scaler
scaler = StandardScaler()
scaler.fit(data)
standardized_data = scaler.transform(data)

print("Standardized Data:", standardized_data)


# Encoding Categorical Data - One-Hot Encoding

In [None]:
import pandas as pd

# Single categorical column holding three distinct colour labels
colors = ['Red', 'Green', 'Blue', 'Green', 'Red']
df = pd.DataFrame({'Color': colors})

# One-hot encode: one indicator column per distinct label, each named
# with the 'Color' prefix
color_column = df['Color']
one_hot_encoded = pd.get_dummies(color_column, prefix='Color')

print("One-Hot Encoded Data:\n", one_hot_encoded)


# Encoding Categorical Data - Label Encoding

In [None]:
# pandas is imported here so this cell runs on its own, like the other
# cells in this notebook, instead of relying on an earlier cell's import.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Create sample data: one categorical column with three distinct sizes
df = pd.DataFrame({'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']})

# Initialize the label encoder
label_encoder = LabelEncoder()

# Perform Label Encoding: each distinct label is replaced by an integer code,
# stored alongside the original column.
# NOTE(review): LabelEncoder numbers classes in sorted label order, so the
# codes need not follow Small < Medium < Large — confirm this is acceptable
# (inspect label_encoder.classes_ to see the mapping).
df['Size_Encoded'] = label_encoder.fit_transform(df['Size'])

print("Label Encoded Data:\n", df)


# Feature Selection - Filter Methods

In [None]:
import pandas as pd
import numpy as np

# Sample frame with three numeric features: Feature2 decreases as Feature1
# increases, and Feature3 is Feature1 shifted up by one
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [5, 4, 3, 2, 1],
    'Feature3': [2, 3, 4, 5, 6],
}
df = pd.DataFrame(data)

# Pairwise correlation between every pair of feature columns
correlation_matrix = df.corr()

print("Correlation Matrix:\n", correlation_matrix)


# Data Splitting - Training and Test Sets

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Six samples with two features each ([1, 2] ... [6, 7]) and alternating
# binary labels
X = np.array([[i, i + 1] for i in range(1, 7)])
y = np.array([0, 1] * 3)

# Hold out a test set with test_size=0.2; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): with only 6 samples the realized split is presumably
# 4 train / 2 test rather than an exact 80%/20% — confirm.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training Data (X):\n", X_train)
print("Test Data (X):\n", X_test)
