## **Synthetic Data**

In [None]:
import pandas as pd
import numpy as np

# Raw columns of the synthetic dataset, keyed by column name.
# np.nan marks the entries that are deliberately left missing.
data = dict(
    Numeric_Column_1=[
        10.5, 12.1, 9.8, np.nan, 11.3, 10.0, 13.5, 11.8, 9.5, 10.9,
        12.0, np.nan, 10.2, 11.5, 9.9, 13.0, 11.1, np.nan, 12.5, 10.7,
    ],
    Numeric_Column_2=[
        55, 62, np.nan, 58, 65, 50, 70, 61, 53, 59,
        63, 57, 60, np.nan, 54, 68, 56, 64, 66, 52,
    ],
    Categorical_Column_1=[
        1, 2, 1, 3, 2, 1, 3, 2, 1, 3,
        2, 1, 3, 2, 1, 3, 2, 1, 3, 2,
    ],
    Categorical_Column_2=[
        101, 102, 101, 103, 102, 101, 103, 102, 101, 103,
        102, 101, 103, np.nan, 101, 103, 102, 101, 103, 102,
    ],
)


In [None]:

# Build the DataFrame from the column dictionary defined in the previous cell.
df = pd.DataFrame(data)

# 'Categorical_Column_1' contains only integers, so pandas infers an integer dtype for it.
# Writing NaN into an integer column relies on implicit upcasting, which newer pandas
# versions deprecate; cast to float explicitly so the assignment below is well-defined.
# ('Categorical_Column_2' already contains a NaN and is therefore float from the start.)
df['Categorical_Column_1'] = df['Categorical_Column_1'].astype('float64')

# Inject additional NaNs into the categorical columns so that the
# missing-value handling demonstrated later has something to work on.
df.loc[[2, 13, 17], 'Categorical_Column_1'] = np.nan
df.loc[[5, 9, 14], 'Categorical_Column_2'] = np.nan

In [None]:
# Render the whole DataFrame, including the NaN entries present at this point.
df

## **Missing Data Handling**

In [None]:
# prompt: perform missing data handling in above dataset

# Count the NaN entries per column before any imputation is applied.
missing_before = df.isnull().sum()
print("Initial missing values:")
print(missing_before)
print("\n")



In [None]:
# Impute the numeric columns: the mean replaces NaNs in 'Numeric_Column_1',
# the median replaces NaNs in 'Numeric_Column_2'.
numeric_1_mean = df['Numeric_Column_1'].mean()
df['Numeric_Column_1'] = df['Numeric_Column_1'].fillna(numeric_1_mean)

numeric_2_median = df['Numeric_Column_2'].median()
df['Numeric_Column_2'] = df['Numeric_Column_2'].fillna(numeric_2_median)


In [None]:

# Impute the categorical columns with their most frequent value.
# Series.mode() can return several values on a tie; the first one is used.
categorical_1_mode = df['Categorical_Column_1'].mode()[0]
df['Categorical_Column_1'] = df['Categorical_Column_1'].fillna(categorical_1_mode)

categorical_2_mode = df['Categorical_Column_2'].mode()[0]
df['Categorical_Column_2'] = df['Categorical_Column_2'].fillna(categorical_2_mode)


In [None]:
# Re-count the NaN entries per column to confirm the imputation above.
missing_after = df.isnull().sum()
print("Missing values after handling:")
print(missing_after)
print("\n")


In [None]:

# Show the DataFrame after imputation. Note: the bare `df` expression below
# renders the whole frame, not only its first few rows.
print("DataFrame after handling missing values:")
df

In [None]:

# Append three copies of the first row so the duplicate-removal step
# in the Data Cleaning section has something to remove.
duplicated_rows = df.iloc[[0, 0, 0]].reset_index(drop=True)
df = pd.concat([df, duplicated_rows], ignore_index=True)

print("DataFrame after adding 3 duplicate rows:")
df


## **Data Cleaning**

In [None]:
# Remove fully duplicated rows and report the row count before and after.
rows_before = len(df)
print("Initial number of rows:", rows_before)

df.drop_duplicates(inplace=True)

rows_after = len(df)
print("Number of rows after removing duplicates:", rows_after)
print("\n")

In [None]:
# Outlier detection with the IQR rule on 'Numeric_Column_1'.
# Both quartiles are requested in one call and then read back individually.
column_quartiles = df['Numeric_Column_1'].quantile([0.25, 0.75])
Q1 = column_quartiles.loc[0.25]  # 25th percentile
Q3 = column_quartiles.loc[0.75]  # 75th percentile
IQR = Q3 - Q1

# Fences: values beyond 1.5 * IQR outside the quartiles count as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
# Flag rows whose 'Numeric_Column_1' value lies outside the IQR fences computed
# in the previous cell. The mask is built once and reused for both reporting and filtering.
is_outlier = (df['Numeric_Column_1'] < lower_bound) | (df['Numeric_Column_1'] > upper_bound)

outliers = df[is_outlier]
print("Potential outliers in 'Numeric_Column_1':")
print(outliers)
print("\n")

# Keep only the non-outlier rows. The explicit .copy() makes the result an independent
# DataFrame, so the column assignments in later cells (dtype conversion, scaling) operate
# on a real frame rather than on a slice, and cannot trigger SettingWithCopyWarning.
df = df[~is_outlier].copy()
print("DataFrame after removing outliers:")
print(df)
print("\n")

In [None]:
# Report the dtypes before any conversion.
print("Current data types:")
print(df.dtypes)
print("\n")

# Store both categorical columns with the pandas 'category' dtype.
for categorical_col in ('Categorical_Column_1', 'Categorical_Column_2'):
    df[categorical_col] = df[categorical_col].astype('category')

print("Data types after conversion:")
print(df.dtypes)
print("\n")

# Summary (index, columns, dtypes, memory usage) of the cleaned DataFrame.
print("Final cleaned DataFrame info:")
df.info()

## **Normalization and Scaling**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale.
numerical_cols = ['Numeric_Column_1', 'Numeric_Column_2']

# Min-Max scaling: learn each column's range, then overwrite the columns with the rescaled values.
scaler_minmax = MinMaxScaler()
scaler_minmax.fit(df[numerical_cols])
df[numerical_cols] = scaler_minmax.transform(df[numerical_cols])

print("DataFrame after Min-Max Scaling:")
print(df)
print("\n")



In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize. They were overwritten by the Min-Max scaling cell,
# so the z-scores are computed from those already-rescaled values.
numerical_cols = ['Numeric_Column_1', 'Numeric_Column_2']

# Standard scaling (z-score): learn mean and standard deviation, then overwrite the columns.
scaler_standard = StandardScaler()
scaler_standard.fit(df[numerical_cols])
df[numerical_cols] = scaler_standard.transform(df[numerical_cols])

print("DataFrame after Standard Scaling:")
print(df)
print("\n")

## **Transformation**

In [None]:

import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

# Column to transform, plus the names of the derived columns added to df.
column_to_transform = 'Numeric_Column_1'
quantile_column = f'{column_to_transform}_quantile'
power_column = f'{column_to_transform}_power'

# Quantile transformation onto a uniform distribution.
# n_quantiles defaults to 1000; this dataset has far fewer rows, and scikit-learn
# emits a warning before clamping the value itself. Clamp explicitly instead.
# random_state pins any internal subsampling so reruns stay reproducible.
quantile_transformer = QuantileTransformer(
    n_quantiles=min(1000, len(df)),
    output_distribution='uniform',
    random_state=42,
)
# fit_transform returns an (n_rows, 1) array; ravel() flattens it to a single column.
df[quantile_column] = quantile_transformer.fit_transform(df[[column_to_transform]]).ravel()

# Power transformation. Yeo-Johnson (unlike Box-Cox) accepts zero and negative values.
power_transformer = PowerTransformer(method='yeo-johnson')
df[power_column] = power_transformer.fit_transform(df[[column_to_transform]]).ravel()

# One histogram per version of the column: original, quantile-transformed, power-transformed.
histogram_panels = [
    (column_to_transform, f'Histogram of {column_to_transform} Before Transformation'),
    (quantile_column, 'Histogram After Quantile Transformation'),
    (power_column, 'Histogram After Power Transformation'),
]

fig, axes = plt.subplots(1, len(histogram_panels), figsize=(12, 5))
for ax, (panel_column, panel_title) in zip(axes, histogram_panels):
    ax.hist(df[panel_column], bins=15, edgecolor='black')
    ax.set_title(panel_title)
    ax.set_xlabel(panel_column)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("\nDataFrame with transformed columns:")
print(df[[column_to_transform, quantile_column, power_column]].head())


## **Discretization and Binning**

In [None]:

# A larger synthetic dataset (100 rows) gives the binning examples a usable distribution.
num_rows = 100

# Fixed seed so the generated data, and every result derived from it,
# is identical on each Restart & Run All.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Column definitions for the new dataset.
new_data = {
    # Integer incomes in [20000, 150000): a wide range suitable for binning
    'Income': rng.integers(20000, 150000, size=num_rows),
    # Values in [0, 25) rounded to one decimal: used for polynomial features
    'Years_of_Experience': rng.uniform(0, 25, size=num_rows).round(1),
    # Two columns to build interaction features from
    'Spend_per_Visit': rng.uniform(10, 200, size=num_rows).round(2),
    'Visits_per_Month': rng.integers(1, 10, size=num_rows),
    # Candidate target: normal noise with a per-row mean in [100, 500) and a per-row scale in [20, 100)
    'Customer_Value': rng.normal(
        loc=rng.uniform(100, 500, size=num_rows),
        scale=rng.uniform(20, 100, size=num_rows),
    ),
}

# Create the new DataFrame
df_new = pd.DataFrame(new_data)


In [None]:
# Render the newly generated dataset.
df_new

In [None]:

# Discretize 'Income' into four labelled tiers in two ways:
#   * pd.cut  -> equal-width bins
#   * pd.qcut -> equal-frequency (quantile) bins

# Tier labels; their count also determines the number of bins.
bin_labels = ['Low', 'Medium', 'High', 'Very High']
n_income_bins = len(bin_labels)

# Observed income range (pd.cut derives the actual bin edges on its own).
min_income, max_income = df_new['Income'].min(), df_new['Income'].max()

# Equal-width binning
df_new['Income_Bins_EqualWidth'] = pd.cut(
    df_new['Income'],
    bins=n_income_bins,
    labels=bin_labels,
    include_lowest=True,
)

print("\nDataFrame after Equal-Width Binning on 'Income':")
print(df_new[['Income', 'Income_Bins_EqualWidth']].head())

# Equal-frequency binning: roughly the same number of rows lands in every bin
df_new['Income_Bins_EqualFrequency'] = pd.qcut(
    df_new['Income'],
    q=n_income_bins,
    labels=bin_labels,
    duplicates='drop',
)

print("\nDataFrame after Equal-Frequency Binning on 'Income':")
print(df_new[['Income', 'Income_Bins_EqualFrequency']].head())

# Rows per bin for each method
print("\nValue counts for Equal-Width Bins:")
print(df_new['Income_Bins_EqualWidth'].value_counts())

print("\nValue counts for Equal-Frequency Bins:")
print(df_new['Income_Bins_EqualFrequency'].value_counts())

# Side-by-side bar charts of the bin counts
plt.figure(figsize=(12, 5))

four_bin_panels = [
    ('Income_Bins_EqualWidth', 'Distribution of Income (Equal-Width Bins)'),
    ('Income_Bins_EqualFrequency', 'Distribution of Income (Equal-Frequency Bins)'),
]
for panel_position, (bin_column, panel_title) in enumerate(four_bin_panels, start=1):
    plt.subplot(1, 2, panel_position)
    df_new[bin_column].value_counts().sort_index().plot(kind='bar')
    plt.title(panel_title)
    plt.xlabel('Income Bin')
    plt.ylabel('Count')
    plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Render df_new in its current state.
df_new

In [None]:
# Repeat the discretization of 'Income' with five unlabeled bins, so the
# resulting categories are the interval ranges themselves.
n_interval_bins = 5

# Equal-width intervals
df_new['Income_Bins_Cut'] = pd.cut(df_new['Income'], bins=n_interval_bins)

print("\nDataFrame after Equal-Width Binning on 'Income' with 5 bins:")
print(df_new[['Income', 'Income_Bins_Cut']].head())

# Equal-frequency intervals
df_new['Income_Bins_Qcut'] = pd.qcut(df_new['Income'], q=n_interval_bins, duplicates='drop')

print("\nDataFrame after Equal-Frequency Binning on 'Income' with 5 bins:")
print(df_new[['Income', 'Income_Bins_Qcut']].head())

# Rows per bin for each method
print("\nValue counts for Equal-Width Bins (5 bins):")
print(df_new['Income_Bins_Cut'].value_counts())

print("\nValue counts for Equal-Frequency Bins (5 bins):")
print(df_new['Income_Bins_Qcut'].value_counts())

# Side-by-side bar charts; tick labels are rotated because interval labels are long
plt.figure(figsize=(14, 6))

five_bin_panels = [
    ('Income_Bins_Cut', 'Distribution of Income (Equal-Width Bins, 5 bins)'),
    ('Income_Bins_Qcut', 'Distribution of Income (Equal-Frequency Bins, 5 bins)'),
]
for panel_position, (bin_column, panel_title) in enumerate(five_bin_panels, start=1):
    plt.subplot(1, 2, panel_position)
    df_new[bin_column].value_counts().sort_index().plot(kind='bar')
    plt.title(panel_title)
    plt.xlabel('Income Bin')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Render df_new in its current state.
df_new

## **Polynomial Interaction**

In [None]:

from sklearn.preprocessing import PolynomialFeatures

# Input columns for the polynomial / interaction features.
features_for_poly = ['Years_of_Experience', 'Spend_per_Visit']

# degree=2 yields the original features, their squares and the pairwise interaction term.
# include_bias=False omits the constant column of ones (a model's intercept normally covers it).
poly_transformer = PolynomialFeatures(degree=2, include_bias=False)

# Fit on the selected columns and compute the expanded feature matrix.
poly_features = poly_transformer.fit_transform(df_new[features_for_poly])

# Human-readable names of the generated features.
poly_feature_names = poly_transformer.get_feature_names_out(features_for_poly)

# Wrap the result in a DataFrame aligned with df_new's index.
df_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=df_new.index)

# The expansion also contains the degree-1 inputs themselves. Appending those would
# leave df_new with two columns of the same name for each input, so only columns that
# df_new does not already have are added. This also makes the cell safe to re-run.
new_poly_columns = [name for name in df_poly.columns if name not in df_new.columns]
df_new = pd.concat([df_new, df_poly[new_poly_columns]], axis=1)

print("\nDataFrame after adding Polynomial and Interaction Features:")
print(df_new.head())

# New columns: 'Years_of_Experience^2', 'Spend_per_Visit^2' and 'Years_of_Experience Spend_per_Visit'

In [None]:
# Render df_new in its current state.
df_new

## **Encoding Techniques**

In [None]:

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Small sample frame with three string-valued columns for the encoding demos.
encoding_data = dict(
    Color=['Red', 'Blue', 'Green', 'Red', 'Green', 'Blue', 'Red', 'Green', 'Blue', 'Red'],
    Size=['Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Small'],
    Rating=['A', 'B', 'C', 'A', 'C', 'B', 'A', 'C', 'B', 'A'],
)
df_encode = pd.DataFrame(encoding_data)

print("Original DataFrame for Encoding:")
print(df_encode)
print("\n")

# --- Label Encoding ---
print("--- Label Encoding ---")

# Fit the encoder on 'Size' and store the integer codes in a new column.
label_encoder = LabelEncoder()
size_codes = label_encoder.fit_transform(df_encode['Size'])
df_encode['Size_Encoded'] = size_codes

print("DataFrame after Label Encoding 'Size':")
print(df_encode)
print("\n")

# Show which integer code each original label was mapped to.
size_labels = label_encoder.classes_
size_label_to_code = dict(zip(size_labels, label_encoder.transform(size_labels)))
print("Label Encoding Mapping for 'Size':")
print(size_label_to_code)
print("\n")



In [None]:
# Rebuild the sample frame so this cell gives the same result however often it is run.
encoding_data = {
    'Color': ['Red', 'Blue', 'Green', 'Red', 'Green', 'Blue', 'Red', 'Green', 'Blue', 'Red'],
    'Size': ['Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Small', 'Medium', 'Large', 'Small'],
    'Rating': ['A', 'B', 'C', 'A', 'C', 'B', 'A', 'C', 'B', 'A']
}
df_encode = pd.DataFrame(encoding_data)

# Rebuilding the frame discards the 'Size_Encoded' column from the label-encoding cell.
# Re-create it here so the final frame really contains both label and one-hot encodings,
# as the closing message states.
df_encode['Size_Encoded'] = LabelEncoder().fit_transform(df_encode['Size'])

# --- One-Hot Encoding with scikit-learn ---
# sparse_output=False returns a dense NumPy array instead of a sparse matrix
onehot_encoder = OneHotEncoder(sparse_output=False)

# Double brackets select a one-column DataFrame (2D), which is the input shape fit_transform expects
onehot_encoded_colors = onehot_encoder.fit_transform(df_encode[['Color']])

# Names of the generated indicator columns
onehot_feature_names = onehot_encoder.get_feature_names_out(['Color'])

# Wrap the encoded array in a DataFrame aligned with df_encode's index
df_onehot_colors = pd.DataFrame(onehot_encoded_colors, columns=onehot_feature_names, index=df_encode.index)

# Append the indicator columns to the sample frame
df_encode = pd.concat([df_encode, df_onehot_colors], axis=1)

print("DataFrame after One-Hot Encoding 'Color':")
print(df_encode)
print("\n")

# --- One-Hot Encoding with pandas ---
# pd.get_dummies does the same job directly on a DataFrame and is often simpler
print("--- One-Hot Encoding using pandas get_dummies ---")

# Categorical column(s) to encode
columns_to_onehot = ['Rating']

# Apply get_dummies
df_onehot_pd = pd.get_dummies(df_encode[columns_to_onehot], prefix=columns_to_onehot)

# Append the pandas-generated indicator columns
df_encode = pd.concat([df_encode, df_onehot_pd], axis=1)

# Optionally, drop the original categorical columns if you no longer need them
# df_encode = df_encode.drop(columns=columns_to_onehot + ['Color']) # also drop the 'Color' column

print("DataFrame after One-Hot Encoding 'Rating' using pandas get_dummies:")
print(df_encode)
print("\n")

print("Final DataFrame with both Label and One-Hot Encoding:")
df_encode

## **Feature Reduction**

In [None]:
from sklearn import datasets

In [None]:
# Load the iris dataset, then print its feature matrix followed by its target vector.
iris = datasets.load_iris()
print(iris.data, iris.target, sep="\n")

In [None]:
# Print the dataset's target_names attribute (presumably the class names behind the integer labels in iris.target).
print(iris.target_names)

In [None]:
# prompt: concatenate data and target vertically

import numpy as np

# Requires `iris` from the loading cell above.
# Append the target vector to the feature matrix as one extra, final column
# (a column-wise join: one row per sample, features first, target last).
concatenated_data = np.column_stack((iris.data, iris.target))
print(concatenated_data)


In [None]:
import pandas as pd
# Wrap the combined array in a DataFrame with descriptive column names: the dataset's
# own feature names plus 'target' for the appended label column. Named columns make the
# correlation heatmap and the "selected features" listings below readable; positional
# access (df.iloc) keeps working unchanged.
# NOTE: this rebinds `df`, replacing the synthetic DataFrame used in the earlier sections.
df = pd.DataFrame(concatenated_data, columns=[*iris.feature_names, 'target'])
df

## Components of dimensionality reduction
- Feature selection - select a subset
- Feature extraction - build a new feature from original feature set

In [None]:
import seaborn as sns
# Pairwise correlations between all columns of df (target column included),
# drawn as a heatmap with each coefficient annotated.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True)

In [None]:


from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectFromModel

# Split df into features and target; the target is taken to be the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# --- Filter Method: SelectKBest ---
print("--- Filter Method: SelectKBest ---")

# Score each feature against the target with the ANOVA F-value (f_classif), which suits
# numeric features in a classification task (chi2 is meant for non-negative data such as
# counts), and keep the k highest-scoring features.
k = 2
selector_fclassif = SelectKBest(score_func=f_classif, k=k)

# Fit on the data and reduce it to the selected columns in one step.
X_filtered_fclassif = selector_fclassif.fit_transform(X, y)

# Positions of the retained features, mapped back to X's column labels.
selected_feature_indices_fclassif = selector_fclassif.get_support(indices=True)
selected_feature_names_fclassif = X.columns[selected_feature_indices_fclassif]

print(f"Selected features using SelectKBest (f_classif, k={k}): {selected_feature_names_fclassif.tolist()}")
print("Transformed data (filtered):")
first_rows_fclassif = X_filtered_fclassif[:5]  # first 5 rows only
print(first_rows_fclassif)
print("\n")


In [None]:

# --- Filter Method: SelectKBest with mutual information ---
# mutual_info_classif is a randomized estimator, so unseeded runs can rank features
# differently from one execution to the next. Pin random_state (the same seed the
# RandomForest estimators in this notebook use) so the selection is reproducible.
def seeded_mutual_info(features, target):
    """Score features by mutual information with the target, using a fixed random seed."""
    return mutual_info_classif(features, target, random_state=42)


selector_mutual_info = SelectKBest(score_func=seeded_mutual_info, k=k)
X_filtered_mutual_info = selector_mutual_info.fit_transform(X, y)

# Positions of the retained features, mapped back to X's column labels.
selected_feature_indices_mutual_info = selector_mutual_info.get_support(indices=True)
selected_feature_names_mutual_info = X.columns[selected_feature_indices_mutual_info]

print(f"Selected features using SelectKBest (mutual_info_classif, k={k}): {selected_feature_names_mutual_info.tolist()}")
print("Transformed data (filtered):")
print(X_filtered_mutual_info[:5])
print("\n")


In [None]:

# --- Wrapper Method: Recursive Feature Elimination (RFE) ---
print("--- Wrapper Method: Recursive Feature Elimination (RFE) ---")

# RFE repeatedly fits an estimator, ranks the features and discards the weakest ones.
# A seeded random forest serves as the ranking estimator.
estimator = RandomForestClassifier(n_estimators=100, random_state=42)

# n_features_to_select: how many features survive (an int, or a float fraction)
# step: how many features are removed per iteration (an int, or a float fraction)
n_features_rfe = 2
rfe_selector = RFE(
    estimator=estimator,
    n_features_to_select=n_features_rfe,
    step=1,
)

# Run the elimination on the data.
rfe_selector.fit(X, y)

# Positions of the surviving features, mapped back to X's column labels.
selected_feature_indices_rfe = rfe_selector.get_support(indices=True)
selected_feature_names_rfe = X.columns[selected_feature_indices_rfe]

print(f"Selected features using RFE ({type(estimator).__name__}, n_features_to_select={n_features_rfe}): {selected_feature_names_rfe.tolist()}")

# Reduce X to the surviving features.
X_wrapped_rfe = rfe_selector.transform(X)
print("Transformed data (wrapped - RFE):")
print(X_wrapped_rfe[:5])
print("\n")


In [None]:

# --- Wrapper Method: SelectFromModel ---
print("--- Wrapper Method: SelectFromModel ---")

# SelectFromModel keeps the features whose importance, as reported by a fitted
# estimator, reaches a threshold. A seeded random forest provides the importances.
estimator_sfm = RandomForestClassifier(n_estimators=100, random_state=42)

# threshold may be 'median', 'mean' or a number; features scoring >= threshold are kept.
sfm_selector = SelectFromModel(
    estimator=estimator_sfm,
    threshold='median',
)

# Fit the selector (this trains the underlying estimator).
sfm_selector.fit(X, y)

# Positions of the retained features, mapped back to X's column labels.
selected_feature_indices_sfm = sfm_selector.get_support(indices=True)
selected_feature_names_sfm = X.columns[selected_feature_indices_sfm]

print(f"Selected features using SelectFromModel ({type(estimator_sfm).__name__}, threshold='median'): {selected_feature_names_sfm.tolist()}")

# Reduce X to the retained features.
X_wrapped_sfm = sfm_selector.transform(X)
print("Transformed data (wrapped - SelectFromModel):")
print(X_wrapped_sfm[:5])
print("\n")


In [None]:

# --- Hybrid Method ---
print("--- Hybrid Method ---")

# A hybrid approach chains selection strategies: a cheap filter first narrows the
# candidate set, then a wrapper (or embedded) method picks the final features from
# what is left.

# Approach 1: Filter (SelectKBest) followed by Wrapper (RFE)

# Stage 1 - filter: keep the m best features by ANOVA F-value.
# m must be larger than the number of features RFE is asked to keep afterwards.
m = 3
filter_step_selector = SelectKBest(score_func=f_classif, k=m)
X_filtered_step = filter_step_selector.fit_transform(X, y)
selected_feature_indices_filter_step = filter_step_selector.get_support(indices=True)
selected_feature_names_filter_step = X.columns[selected_feature_indices_filter_step]

print(f"Hybrid Step 1 (Filter): Selected {m} features using SelectKBest (f_classif): {selected_feature_names_filter_step.tolist()}")

# Labelled view of the filtered features, used as input for stage 2.
X_filtered_df = X[selected_feature_names_filter_step]

# Stage 2 - wrapper: RFE with a seeded random forest narrows the m features down to n (n < m).
n_features_hybrid = 2
hybrid_rfe_estimator = RandomForestClassifier(n_estimators=100, random_state=42)
hybrid_rfe_selector = RFE(
    estimator=hybrid_rfe_estimator,
    n_features_to_select=n_features_hybrid,
    step=1,
)
hybrid_rfe_selector.fit(X_filtered_df, y)

# RFE reports positions within the filtered frame; translate them to column labels.
selected_feature_indices_hybrid = hybrid_rfe_selector.get_support(indices=True)
selected_feature_names_hybrid = X_filtered_df.columns[selected_feature_indices_hybrid]

print(f"Hybrid Step 2 (Wrapper): Selected {n_features_hybrid} final features using RFE: {selected_feature_names_hybrid.tolist()}")

# Reduce the original feature frame to the final selection.
X_hybrid = X[selected_feature_names_hybrid]
print("Transformed data (hybrid - Filter + RFE):")
print(X_hybrid.head(5))
print("\n")


# Approach 2: Embedded + Wrapper (SelectFromModel)
# SelectFromModel obtains feature importances from its estimator when it is fitted and
# then applies a threshold to them. With an estimator that exposes importances
# (tree-based models, L1-regularized models), the SelectFromModel run from the earlier
# cell can therefore also be read as a hybrid method; its results are re-printed here.

print("SelectFromModel using RandomForest (an embedded method) can be considered a hybrid approach:")
print(f"Selected features using SelectFromModel ({type(estimator_sfm).__name__}, threshold='median'): {selected_feature_names_sfm.tolist()}")
print("Transformed data (hybrid/wrapped - SelectFromModel):")
print(X_wrapped_sfm[:5])
print("\n")

In [None]:
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [None]:
# Rebind X to the raw iris feature array and Y to its label vector
# (X previously held the DataFrame-based features), then print both.
X, Y = iris.data, iris.target
print(X, Y, sep="\n")

In [None]:
# PCA is unsupervised: it is fitted on the features alone.
# Project X onto its first two principal components.
pca = PCA(n_components=2)
pca.fit(X)
X_R = pca.transform(X)
print(X_R)

In [None]:
# LDA is supervised: fitting needs the class labels Y as well as the features.
# Reduce the 4 feature dimensions to 2 discriminant components.
lda = LDA(n_components=2)
lda.fit(X, Y)
X_L = lda.transform(X)
print(X_L)