# **Task 1: Load and Inspect a Dataset**
*Instruction:* Load the titanic.csv dataset and display the first 5 rows. Show basic info and describe statistics of the dataset.


In [21]:
import pandas as pd

df = pd.read_csv('titanic.csv')
print(df.head(50))
print(df.info())
print(df.describe())

    Survived  Pclass                                               Name  \
0          0       3                             Mr. Owen Harris Braund   
1          1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2          1       3                              Miss. Laina Heikkinen   
3          1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4          0       3                            Mr. William Henry Allen   
5          0       3                                    Mr. James Moran   
6          0       1                             Mr. Timothy J McCarthy   
7          0       3                      Master. Gosta Leonard Palsson   
8          1       3   Mrs. Oscar W (Elisabeth Vilhelmina Berg) Johnson   
9          1       2                 Mrs. Nicholas (Adele Achem) Nasser   
10         1       3                     Miss. Marguerite Rut Sandstrom   
11         1       1                            Miss. Elizabeth Bonnell   
12         0       3     

# **Task 2: Identify and Handle Missing Data**
*Instruction:*

Display the number of missing values per column.

Fill missing Age values with the median.

Drop the second row in the dataset.


In [20]:
import pandas as pd

df = pd.read_csv('titanic.csv')
print(df.isnull().sum())
df['Age'].fillna(df['Age'].median(), inplace=True)
df.drop(index=1, inplace=True)
print(df.head())

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64
   Survived  Pclass                                         Name     Sex  \
0         0       3                       Mr. Owen Harris Braund    male   
2         1       3                        Miss. Laina Heikkinen  female   
3         1       1  Mrs. Jacques Heath (Lily May Peel) Futrelle  female   
4         0       3                      Mr. William Henry Allen    male   
5         0       3                              Mr. James Moran    male   

    Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0  22.0                        1                        0   7.2500  
2  26.0                        0                        0   7.9250  
3  35.0                        1                        0  53.1000  
4  35.0               

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


# **ask 3: Convert Categorical to Numeric**
*Instruction:* Convert Sex and Pclass columns to numeric using:

Label Encoding for Sex
One-Hot Encoding for Pclass



In [22]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('titanic.csv')
df['Age'].fillna(df['Age'].median(), inplace=True)
df.drop(index=1, inplace=True)

le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

df = pd.get_dummies(df, columns=['Pclass'])

print(df.head())

   Survived                                         Name  Sex   Age  \
0         0                       Mr. Owen Harris Braund    1  22.0   
2         1                        Miss. Laina Heikkinen    0  26.0   
3         1  Mrs. Jacques Heath (Lily May Peel) Futrelle    0  35.0   
4         0                      Mr. William Henry Allen    1  35.0   
5         0                              Mr. James Moran    1  27.0   

   Siblings/Spouses Aboard  Parents/Children Aboard     Fare  Pclass_1  \
0                        1                        0   7.2500     False   
2                        0                        0   7.9250     False   
3                        1                        0  53.1000      True   
4                        0                        0   8.0500     False   
5                        0                        0   8.4583     False   

   Pclass_2  Pclass_3  
0     False      True  
2     False      True  
3     False     False  
4     False      True  
5     Fa

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


# **Task 4: Scale Numerical Features**
*Instruction:* Use StandardScaler to scale the Age and Fare columns.italicized text

In [23]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

df = pd.read_csv('titanic.csv')
df['Age'].fillna(df['Age'].median(), inplace=True)
df.drop(index=1, inplace=True)

le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

df = pd.get_dummies(df, columns=['Pclass'])

scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

print(df[['Age', 'Fare']].head())

        Age      Fare
0 -0.528495 -0.502593
2 -0.245189 -0.489029
3  0.392250  0.418741
4  0.392250 -0.486517
5 -0.174362 -0.478313


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


# **Task 5: Build Preprocessing Pipeline**
*Instruction:* Using ColumnTransformer and Pipeline from sklearn, build a pipeline that:

Imputes missing values
Scales numeric data
Encodes categorical data

In [30]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Load dataset
df = pd.read_csv('titanic.csv')

# Define the pipeline for numeric features
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with the median
    ('scaler', StandardScaler())  # Scale the numeric features
])

# Define the pipeline for categorical features
categorical_features = ['Sex', 'Pclass']
categorical_transformer = ColumnTransformer(
    transformers=[
        ('sex', Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]), ['Sex']),  # Impute missing values and Label Encode 'Sex'
        ('pclass', OneHotEncoder(), ['Pclass'])  # One-Hot encoding for 'Pclass'
    ]
)

# Combine both pipelines into one full pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply the pipeline to the data
df_transformed = preprocessor.fit_transform(df)

print(df_transformed)

[[-0.529366007257325 -0.5035863459797053 'male' 0.0 0.0 1.0]
 [0.6042645431881828 0.7834124485273979 'female' 1.0 0.0 0.0]
 [-0.24595836964594797 -0.4900195895218577 'female' 0.0 0.0 1.0]
 ...
 [-1.5921446482999884 -0.1779841909913622 'female' 0.0 0.0 1.0]
 [-0.24595836964594797 -0.04633640610409999 'male' 1.0 0.0 0.0]
 [0.1791530867711174 -0.49353689675167006 'male' 0.0 0.0 1.0]]


# **Task 6: Create a New Feature**
*Instruction:* Create a new feature FamilySize = Siblings/Spouses Aboard + Parents/Children Aboard + 1.

In [31]:
import pandas as pd

# Load dataset
df = pd.read_csv('titanic.csv')

# Create the new feature 'FamilySize'
df['FamilySize'] = df['Siblings/Spouses Aboard'] + df['Parents/Children Aboard'] + 1

# Display the updated DataFrame
print(df[['Siblings/Spouses Aboard', 'Parents/Children Aboard', 'FamilySize']].head())

   Siblings/Spouses Aboard  Parents/Children Aboard  FamilySize
0                        1                        0           2
1                        1                        0           2
2                        0                        0           1
3                        1                        0           2
4                        0                        0           1
