In [1]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the Titanic dataset
try:
    df_titanic = pd.read_csv('train.csv')  # Assuming 'train.csv' is in the same directory
except FileNotFoundError:
    print("Error: 'train.csv' not found. Please ensure the file is in the correct directory.")
    exit()

# Handle missing values for simplicity (you might want to use more sophisticated methods)
df_titanic['Age'].fillna(df_titanic['Age'].median(), inplace=True)
df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode()[0], inplace=True)
df_titanic.dropna(subset=['Cabin'], axis=1, inplace=True)

# Select relevant features and target
X = df_titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = df_titanic['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Error: 'train.csv' not found. Please ensure the file is in the correct directory.


NameError: name 'df_titanic' is not defined

: 

In [None]:
# Label Encoding for 'Sex'
label_encoder = LabelEncoder()
df_titanic['Sex_LabelEncoded'] = label_encoder.fit_transform(df_titanic['Sex'])
print("\n--- Question 5: Label Encoding vs One-Hot Encoding ---")
print("\nLabel Encoding for 'Sex':")
print(df_titanic[['Sex', 'Sex_LabelEncoded']].head())
print("\nLabel Mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

# One-Hot Encoding for 'Sex'
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
sex_encoded = onehot_encoder.fit_transform(df_titanic[['Sex']])
df_sex_encoded = pd.DataFrame(sex_encoded, columns=onehot_encoder.get_feature_names_out(['Sex']))
df_titanic_onehot = pd.concat([df_titanic.reset_index(drop=True), df_sex_encoded], axis=1)
print("\nOne-Hot Encoding for 'Sex':")
print(df_titanic_onehot[['Sex', 'Sex_female', 'Sex_male']].head())

In [None]:
print("\n--- Question 6: Combining Feature Scaling Techniques ---")

# Select a numerical feature
fare = df_titanic[['Fare']].copy()

# Apply Min-Max Scaling
minmax_scaler = MinMaxScaler()
fare['Fare_MinMax'] = minmax_scaler.fit_transform(fare[['Fare']])
print("\nFare with Min-Max Scaling:")
print(fare.head())
print(f"Min Value (Min-Max Scaled): {fare['Fare_MinMax'].min():.2f}, Max Value (Min-Max Scaled): {fare['Fare_MinMax'].max():.2f}")

# Apply Standardization
standard_scaler = StandardScaler()
fare['Fare_Standard'] = standard_scaler.fit_transform(fare[['Fare']])
print("\nFare with Standardization:")
print(fare.head())
print(f"Mean (Standardized): {fare['Fare_Standard'].mean():.2f}, Std Dev (Standardized): {fare['Fare_Standard'].std():.2f}")

# Applying both sequentially (though not a common practice for a single feature)
combined_scaled = minmax_scaler.fit_transform(standard_scaler.fit_transform(fare[['Fare']]))
fare['Fare_Combined'] = combined_scaled
print("\nFare with Standardization followed by Min-Max Scaling:")
print(fare.head())
print(f"Min Value (Combined): {fare['Fare_Combined'].min():.2f}, Max Value (Combined): {fare['Fare_Combined'].max():.2f}")

In [None]:
print("\n--- Question 7: Handling Multiple Categorical Features ---")

# Select categorical features
categorical_features = ['Sex', 'Embarked']

# Apply One-Hot Encoding using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)],
    remainder='passthrough')  # Keep other columns

X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Get the names of the new one-hot encoded columns
feature_names = preprocessor.get_feature_names_out(categorical_features)
other_features = [col for col in X_train.columns if col not in categorical_features]
all_feature_names = list(feature_names) + other_features

df_train_encoded = pd.DataFrame(X_train_encoded, columns=all_feature_names)
df_test_encoded = pd.DataFrame(X_test_encoded, columns=all_feature_names)

print("\nEncoded Training Data (first 5 rows):")
print(df_train_encoded.head())

print("\nEncoded Testing Data (first 5 rows):")
print(df_test_encoded.head())

In [None]:
print("\n--- Question 8: Ordinal Encoding for Ranked Categories ---")

# Ordinal Encoding for 'Pclass'
ordinal_encoder = OrdinalEncoder(categories=[['3', '2', '1']])  # Specify the order
df_titanic['Pclass_OrdinalEncoded'] = ordinal_encoder.fit_transform(df_titanic[['Pclass']].astype(str))
print("\nOrdinal Encoding for 'Pclass':")
print(df_titanic[['Pclass', 'Pclass_OrdinalEncoded']].head())
print("\nClass Mapping:", dict(zip(ordinal_encoder.categories_[0], range(len(ordinal_encoder.categories_[0])))))

In [None]:
print("\n--- Question 9: Impact of Scaling on Different Algorithms ---")

# Prepare data with One-Hot Encoding for categorical features
preprocessor_scaling = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Sex', 'Embarked'])],
    remainder='passthrough')

X_train_prep = preprocessor_scaling.fit_transform(X_train)
X_test_prep = preprocessor_scaling.transform(X_test)

# Get feature names after preprocessing
feature_names_scaling = list(preprocessor_scaling.get_feature_names_out(['Sex', 'Embarked'])) + \
                        [col for col in X_train.columns if col not in ['Sex', 'Embarked']]
df_train_prep = pd.DataFrame(X_train_prep, columns=feature_names_scaling)
df_test_prep = pd.DataFrame(X_test_prep, columns=feature_names_scaling)

# Decision Tree (not sensitive to scaling)
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(df_train_prep, y_train)
tree_pred = tree_model.predict(df_test_prep)
tree_accuracy = accuracy_score(y_test, tree_pred)
print(f"\nDecision Tree Accuracy (Unscaled): {tree_accuracy:.4f}")

# SVM (sensitive to scaling) - Unscaled
svm_unscaled = SVC(random_state=42)
svm_unscaled.fit(df_train_prep, y_train)
svm_pred_unscaled = svm_unscaled.predict(df_test_prep)
svm_accuracy_unscaled = accuracy_score(y_test, svm_pred_unscaled)
print(f"SVM Accuracy (Unscaled): {svm_accuracy_unscaled:.4f}")

# SVM (sensitive to scaling) - Min-Max Scaled
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(df_train_prep)
X_test_minmax = scaler_minmax.transform(df_test_prep)
svm_scaled_minmax = SVC(random_state=42)
svm_scaled_minmax.fit(X_train_minmax, y_train)
svm_pred_scaled_minmax = svm_scaled_minmax.predict(X_test_minmax)
svm_accuracy_scaled_minmax = accuracy_score(y_test, svm_pred_scaled_minmax)
print(f"SVM Accuracy (Min-Max Scaled): {svm_accuracy_scaled_minmax:.4f}")

# SVM (sensitive to scaling) - Standard Scaled
scaler_standard = StandardScaler()
X_train_standard = scaler_standard.fit_transform(df_train_prep)
X_test_standard = scaler_standard.transform(df_test_prep)
svm_scaled_standard = SVC(random_state=42)
svm_scaled_standard.fit(X_train_standard, y_train)
svm_pred_scaled_standard = svm_scaled_standard.predict(X_test_standard)
svm_accuracy_scaled_standard = accuracy_score(y_test, svm_pred_scaled_standard)
print(f"SVM Accuracy (Standard Scaled): {svm_accuracy_scaled_standard:.4f}")

In [None]:
print("\n--- Question 10: Custom Transformations for Categorical Features ---")

# Example of a high cardinality categorical feature (not directly in Titanic, so we'll create a sample)
high_cardinality_data = pd.DataFrame({'City': ['London', 'Paris', 'Tokyo', 'London', 'New York',
                                               'Berlin', 'Tokyo', 'Sydney', 'Paris', 'London',
                                               'Rome', 'Tokyo', 'Berlin', 'Sydney', 'Paris']})

print("\nHigh Cardinality Categorical Data:")
print(high_cardinality_data['City'].value_counts())

# Custom transformation: Frequency Encoding
def frequency_encode(series):
    frequencies = series.value_counts(normalize=True)
    return series.map(frequencies)

high_cardinality_data['City_FreqEncoded'] = frequency_encode(high_cardinality_data['City'])
print("\nHigh Cardinality Data with Frequency Encoding:")
print(high_cardinality_data)

# Applying custom transformation using ColumnTransformer
custom_transformer = Pipeline(steps=[('freq_encode', FunctionTransformer(frequency_encode))])

preprocessor_custom = ColumnTransformer(
    transformers=[
        ('freq', custom_transformer, ['City'])],
    remainder='passthrough')

encoded_data = preprocessor_custom.fit_transform(high_cardinality_data)
encoded_df = pd.DataFrame(encoded_data, columns=['City_FreqEncoded_CT'])
print("\nEncoded Data using ColumnTransformer with Custom Transformer:")
print(encoded_df)