In [None]:
# Question 5: Label Encoding vs One-Hot Encoding
# Task: Show the difference between Label Encoding and One-Hot Encoding on the Titanic dataset for the 'Sex' feature.





# Question 6: Combining Feature Scaling Techniques
# Task: Demonstrate combining Min-Max Scaling and Standardization for the same datasetand explain the results.





# Question 7: Handling Multiple Categorical Features
# Task: Handle multiple categorical features ('Sex', 'Embarked') from the Titanic dataset using One-Hot Encoding.




# Question 8: Ordinal Encoding for Ranked Categories
# Task: Ordinal encode 'Pclass' (Passenger class) from the Titanic dataset considering passenger class as a ranked feature.





# Question 9: Impact of Scaling on Different Algorithms
# Task: Investigate the impact of different scaling techniques on a decision tree model and compare it with a SVM.



# Question 10: Custom Transformations for Categorical Features
# Task: Implement a custom transformation function for encoding high cardinality categorical features efficiently.






In [1]:
import pandas as pd
import seaborn as sns

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Show basic info
print(titanic.head())
print(titanic.info())


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-nu

In [2]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Extract 'Sex' column and drop NAs for clean demo
sex = titanic['sex'].dropna()

# Label Encoding
le = LabelEncoder()
sex_label_encoded = le.fit_transform(sex)
print("Label Encoded 'Sex':", sex_label_encoded[:10])

# One-Hot Encoding using pandas
sex_onehot = pd.get_dummies(sex, prefix='sex')
print("One-Hot Encoded 'Sex':\n", sex_onehot.head())


Label Encoded 'Sex': [1 0 0 0 1 1 1 1 0 0]
One-Hot Encoded 'Sex':
    sex_female  sex_male
0       False      True
1        True     False
2        True     False
3        True     False
4       False      True


In [3]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Select numerical columns
num_cols = ['age', 'fare']

# For demo, fill NA
titanic[num_cols] = titanic[num_cols].fillna(titanic[num_cols].median())

# Apply Min-Max Scaling to 'age', Standardization to 'fare'
scaler_minmax = MinMaxScaler()
scaler_std = StandardScaler()

titanic['age_scaled'] = scaler_minmax.fit_transform(titanic[['age']])
titanic['fare_scaled'] = scaler_std.fit_transform(titanic[['fare']])

print(titanic[['age', 'age_scaled', 'fare', 'fare_scaled']].head())


    age  age_scaled     fare  fare_scaled
0  22.0    0.271174   7.2500    -0.502445
1  38.0    0.472229  71.2833     0.786845
2  26.0    0.321438   7.9250    -0.488854
3  35.0    0.434531  53.1000     0.420730
4  35.0    0.434531   8.0500    -0.486337


In [4]:
# Select categorical features with missing values handled
cat_cols = ['sex', 'embarked']
titanic[cat_cols] = titanic[cat_cols].fillna('missing')

# One-hot encode
titanic_encoded = pd.get_dummies(titanic, columns=cat_cols, drop_first=True)

print(titanic_encoded.filter(regex='sex_|embarked_').head())


   sex_male  embarked_Q  embarked_S  embarked_missing
0      True       False        True             False
1     False       False       False             False
2     False       False        True             False
3     False       False        True             False
4      True       False        True             False


In [5]:
from sklearn.preprocessing import OrdinalEncoder

# Pclass: 1 (1st class), 2 (2nd), 3 (3rd) - ranked from 1 (highest) to 3 (lowest)
ordinal_enc = OrdinalEncoder(categories=[['3', '2', '1']])  # order: worst to best

# Convert pclass to string to match categories
pclass_str = titanic['pclass'].astype(str).values.reshape(-1,1)

# Fit and transform
pclass_encoded = ordinal_enc.fit_transform(pclass_str)
titanic['pclass_encoded'] = pclass_encoded

print(titanic[['pclass', 'pclass_encoded']].head())


   pclass  pclass_encoded
0       3             0.0
1       1             2.0
2       3             0.0
3       1             2.0
4       3             0.0


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Prepare dataset with numeric features only for simplicity
titanic_clean = titanic.dropna(subset=['age', 'fare', 'pclass', 'survived'])
X = titanic_clean[['age', 'fare', 'pclass']]
y = titanic_clean['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# No scaling
dt = DecisionTreeClassifier(random_state=42)
svm = SVC(random_state=42)

dt.fit(X_train, y_train)
svm.fit(X_train, y_train)

print("Decision Tree accuracy (no scaling):", accuracy_score(y_test, dt.predict(X_test)))
print("SVM accuracy (no scaling):", accuracy_score(y_test, svm.predict(X_test)))

# With scaling (StandardScaler)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

dt.fit(X_train_scaled, y_train)
svm.fit(X_train_scaled, y_train)

print("Decision Tree accuracy (with scaling):", accuracy_score(y_test, dt.predict(X_test_scaled)))
print("SVM accuracy (with scaling):", accuracy_score(y_test, svm.predict(X_test_scaled)))


Decision Tree accuracy (no scaling): 0.6681614349775785
SVM accuracy (no scaling): 0.6591928251121076
Decision Tree accuracy (with scaling): 0.672645739910314
SVM accuracy (with scaling): 0.7309417040358744


In [7]:
def frequency_encoding(df, col):
    freq = df[col].value_counts(normalize=True)
    return df[col].map(freq)

titanic['cabin_freq_enc'] = frequency_encoding(titanic, 'cabin').fillna(0)

print(titanic[['cabin', 'cabin_freq_enc']].head(10))


KeyError: 'cabin'