In [None]:
'''Apply data pre-processing techniques such as standardization/normalization,
transformation, aggregation, discretization/binarization, sampling etc. on any dataset'''

In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Binarizer
import seaborn as sns

df = sns.load_dataset("titanic")

# Dropping rows with missing values
df.dropna(subset=['age', 'fare', 'embarked', 'sex'], inplace=True)

# 1. Standardization
scaler = StandardScaler()
df[['age', 'fare']] = scaler.fit_transform(df[['age', 'fare']])

# 2. Normalization
normalizer = MinMaxScaler()
df[['age', 'fare']] = normalizer.fit_transform(df[['age', 'fare']])

# 3. Transformation (Logarithmic transformation of 'Fare')
df['fare_log'] = np.log1p(df['fare'])  # Using log1p to handle zero values

# 4. Aggregation
agg_df = df.groupby(['pclass', 'sex'])[['age', 'fare']].mean().reset_index()

# 5. Discretization/Binarization
df['age_bin'] = pd.cut(df['age'], bins=4, labels=['Young', 'Middle-aged', 'Senior', 'Old'])

# Binarize the 'Fare' column: 0 if Fare is below median, else 1
binarizer = Binarizer(threshold=df['fare'].median())
df['fare_bin'] = binarizer.fit_transform(df[['fare']])

# 6. Sampling
# Random Sampling: Select 30% of the data randomly
random_sample = df.sample(frac=0.3, random_state=1)


# Display some of the processed data
print(df.head())
print("Aggregated Data:", agg_df.head())
print("Random Sample Data:", random_sample.head())


   survived  pclass     sex       age  sibsp  parch      fare embarked  class  \
0         0       3    male  0.271174      1      0  0.014151        S  Third   
1         1       1  female  0.472229      1      0  0.139136        C  First   
2         1       3  female  0.321438      0      0  0.015469        S  Third   
3         1       1  female  0.434531      1      0  0.103644        S  First   
4         0       3    male  0.434531      0      0  0.015713        S  Third   

     who  adult_male deck  embark_town alive  alone  fare_log      age_bin  \
0    man        True  NaN  Southampton    no  False  0.014052  Middle-aged   
1  woman       False    C    Cherbourg   yes  False  0.130270  Middle-aged   
2  woman       False  NaN  Southampton   yes   True  0.015350  Middle-aged   
3  woman       False    C  Southampton   yes  False  0.098618  Middle-aged   
4    man        True  NaN  Southampton    no   True  0.015590  Middle-aged   

   fare_bin  
0       0.0  
1       1.0  
2 

In [None]:
#using iris dataset

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, Binarizer
from sklearn.model_selection import train_test_split

# Load the Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# Display initial dataset
print("Initial Dataset:")
print(iris_df.head())

# --- Data Preprocessing ---

# 1. Standardization
scaler = StandardScaler()
iris_standardized = scaler.fit_transform(iris_df.iloc[:, :-1])
iris_df_standardized = pd.DataFrame(iris_standardized, columns=iris.feature_names)
print("\nStandardized Data (Mean ~ 0, Std ~ 1):")
print(iris_df_standardized.head())

# 2. Normalization
normalizer = MinMaxScaler()
iris_normalized = normalizer.fit_transform(iris_df.iloc[:, :-1])
iris_df_normalized = pd.DataFrame(iris_normalized, columns=iris.feature_names)
print("\nNormalized Data (Values between 0 and 1):")
print(iris_df_normalized.head())

# 3. Transformation (Log Transformation)
iris_transformed = np.log1p(iris_df.iloc[:, :-1])  # log1p to avoid log(0)
iris_df_transformed = pd.DataFrame(iris_transformed, columns=iris.feature_names)
print("\nLog-Transformed Data:")
print(iris_df_transformed.head())

# 4. Aggregation (e.g., Combine sepal length and width into a single feature)
iris_df['sepal_mean'] = iris_df[['sepal length (cm)', 'sepal width (cm)']].mean(axis=1)
print("\nDataset with Aggregated Feature (Sepal Mean):")
print(iris_df[['sepal length (cm)', 'sepal width (cm)', 'sepal_mean']].head())

# 5. Discretization (Bin Continuous Values into Categories)
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
iris_discretized = discretizer.fit_transform(iris_df.iloc[:, :-2])  # Exclude 'species' and 'sepal_mean'
iris_df_discretized = pd.DataFrame(iris_discretized, columns=iris_df.columns[:-2])  # Match columns
print("\nDiscretized Data (3 Bins):")
print(iris_df_discretized.head())

# 6. Binarization (Convert to Binary Values Based on Threshold)
binarizer = Binarizer(threshold=3.0)
iris_binarized = binarizer.fit_transform(iris_df.iloc[:, :-2])
iris_df_binarized = pd.DataFrame(iris_binarized, columns=iris_df.columns[:-2])
print("\nBinarized Data (Threshold = 3.0):")
print(iris_df_binarized.head())

# 7. Sampling (Random Sampling)
iris_sampled = iris_df.sample(frac=0.3, random_state=42)  # 30% of the dataset
print("\nRandom Sampled Data (30% of Total):")
print(iris_sampled)


Initial Dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  

Standardized Data (Mean ~ 0, Std ~ 1):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.900681          1.019004          -1.340227         -1.315444
1          -1.143017         -0.131979          -1.340227         -1.315444
2          -1.385353          0.328414          -1.397064         -1.315444
3          -1.506521          0.098217          -1.283389         -1.315444
4          -1.02