In [14]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

In [15]:
# Location of the Week 3 dataset CSV (presumably a Google Colab session path).
DATASET_PATH = '/content/Week3_GA_dataset.csv'

# Read the raw table as-is; missing entries are still literal '?' strings here
# and get converted to NaN by the cells that follow.
data = pd.read_csv(DATASET_PATH)
data

Unnamed: 0,V1,V2,V3,V4,V5,Target
0,2.0,50.0,12500.0,98.0,NEGATIVE,YES
1,0.0,13.0,3250.0,28.0,NEGATIVE,YES
2,?,?,4000.0,35.0,NEGATIVE,YES
3,?,20.0,5000.0,45.0,NEGATIVE,YES
4,1.0,24.0,6000.0,77.0,NEGATIVE,NO
...,...,...,...,...,...,...
743,23.0,2.0,500.0,38.0,NEGATIVE,NO
744,21.0,2.0,500.0,52.0,NEGATIVE,NO
745,23.0,3.0,750.0,62.0,NEGATIVE,NO
746,39.0,1.0,250.0,39.0,NEGATIVE,NO


In [8]:
# Feature columns, addressed by name rather than by position so the selection
# keeps pointing at the right columns even if the CSV gains or reorders a column.
numerical_features = ['V1', 'V2', 'V3', 'V4']
categorical_features = ['V5']

# The raw file marks missing entries with '?'. Build the cleaned frame with
# non-mutating calls and rebind `data`, so re-running this cell is idempotent
# and later cells still see NaN / float columns.
data = (
    data
    .replace('?', np.nan)
    .astype({col: float for col in numerical_features})
)

# Numerical branch: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: map each category to an integer code.
cat_pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
])

# Route each column group through its own branch. Columns that are not listed
# (e.g. 'Target') are dropped, because ColumnTransformer defaults to
# remainder='drop'.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features),
])

# Preprocessing followed by variance-based feature selection.
# NOTE: StandardScaler rescales every non-constant numerical column to unit
# variance, so the 0.1 threshold can only remove a constant numerical column
# or a low-variance encoded categorical column.
full_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('feature_selection', VarianceThreshold(threshold=0.1)),
])

# Fit the whole pipeline and transform the dataset in one step.
processed_data = full_pipeline.fit_transform(data)

# Show the transformed matrix and how many columns survived the threshold.
print("Transformed Feature Matrix:\n", processed_data)
print("Number of remaining features after VarianceThreshold:", processed_data.shape[1])


Transformed Feature Matrix:
 [[-9.38169390e-01  7.70986653e+00  7.62334626e+00  2.61563344e+00]
 [-1.18627754e+00  1.30454949e+00  1.28273826e+00 -2.57880900e-01]
 [ 0.00000000e+00 -1.53758496e-16  1.79684161e+00  2.94705348e-02]
 ...
 [ 1.66696622e+00 -4.26617275e-01 -4.30939574e-01  1.13782607e+00]
 [ 3.65183145e+00 -7.72850628e-01 -7.73675141e-01  1.93671355e-01]
 [ 7.74561598e+00 -7.72850628e-01 -7.73675141e-01  1.54832812e+00]]
Number of remaining features after VarianceThreshold: 4


In [13]:
numerical_features = ['V1', 'V2', 'V3', 'V4']
categorical_features = ['V5']

# Derive a fresh frame instead of aliasing `data`: `replace` and `astype`
# return new objects, so the Target encoding below never leaks into `data`
# and this cell gives the same result no matter which cells ran before it.
df = (
    data
    .replace('?', np.nan)  # '?' marks a missing value in the raw CSV
    .astype({col: float for col in numerical_features})
)

# Encode the target as integer labels. OrdinalEncoder orders categories
# lexicographically, so presumably 'NO' -> 0 and 'YES' -> 1 (verify against
# target_encoder.categories_). ravel() flattens the (n, 1) result to 1-D.
target_encoder = OrdinalEncoder()
df['Target'] = target_encoder.fit_transform(df[['Target']]).ravel().astype(int)

# Numerical branch: mean-impute missing values, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: integer-encode the categories.
cat_pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
])

# Output column order is numerical columns first, then categorical ones;
# the name lookup further down relies on exactly this order.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features),
])

# Separate the features from the target.
X = df.drop(columns=['Target'])
y = df['Target']

# Preprocess the features.
X_processed = preprocessor.fit_transform(X)

# Recursive feature elimination with logistic regression, keeping 2 features.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X_processed, y)

# Positions (in the preprocessed matrix) of the features RFE kept.
selected_feature_indices = np.where(rfe.support_)[0]

# Map the boolean support mask back to the original column names.
all_feature_names = numerical_features + categorical_features
selected_features = [
    name for name, kept in zip(all_feature_names, rfe.support_) if kept
]

print("The two most important features selected by RFE:", selected_features)

The two most important features selected by RFE: ['V1', 'V3']


In [17]:
numerical_features = ['V1', 'V2', 'V3', 'V4']
categorical_features = ['V5']

# Derive a fresh frame instead of aliasing `data`: `replace` and `astype`
# return new objects, so the Target encoding below never leaks into `data`
# and this cell gives the same result no matter which cells ran before it.
df = (
    data
    .replace('?', np.nan)  # '?' marks a missing value in the raw CSV
    .astype({col: float for col in numerical_features})
)

# Encode the target as integer labels; ravel() flattens the (n, 1) encoder
# output to 1-D before it is stored as a column.
target_encoder = OrdinalEncoder()
df['Target'] = target_encoder.fit_transform(df[['Target']]).ravel().astype(int)

# Numerical branch: mean-impute missing values, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: integer-encode the categories.
cat_pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
])

# Output column order is numerical columns first, then categorical ones, so
# the indices printed below refer to V1, V2, V3, V4, V5 in that order.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_features),
    ('cat', cat_pipeline, categorical_features),
])

# Separate the features from the target.
X = df.drop(columns=['Target'])
y = df['Target']

# Preprocess the features. The imputer and scaler are fitted on all rows
# before the split below; the held-out part is discarded and never used for
# evaluation in this cell.
X_processed = preprocessor.fit_transform(X)

# Keep 80% of the rows for feature selection (fixed seed for reproducibility).
X_train, _, y_train, _ = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Sequential feature selection with logistic regression. The direction is
# named once and reused in the message so the report cannot drift away from
# what was actually run (it previously said "forward" for a backward search).
sfs_direction = 'backward'
model = LogisticRegression()
sfs = SequentialFeatureSelector(model, n_features_to_select=2, direction=sfs_direction)
sfs.fit(X_train, y_train)

# Positions (in the preprocessed matrix) of the features SFS kept.
selected_feature_indices = np.where(sfs.get_support())[0]

print(f"Indices of the two most important features selected by SFS ({sfs_direction}):", selected_feature_indices)

Indices of the two most important features selected by SFS (forward): [2 3]
