In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [2]:
# Load the Titanic dataset from the URL
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(url)

# Display the first few rows
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
from google.colab import sheets
df = pd.DataFrame(data)
sheet = sheets.InteractiveSheet(df=df)

https://docs.google.com/spreadsheets/d/16Y2WnLPGRzrQmJvSpqhP2nMvPKFyoNdydmzLq5L3uAg#gid=0


  return frame.applymap(_clean_val).replace({np.nan: None})


In [6]:
# Check for missing values
data.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [7]:
# Fill missing values in 'Age' with the median
data['Age'].fillna(data['Age'].median(), inplace=True)

# Fill missing values in 'Embarked' with the mode
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Drop 'Cabin' as it has too many missing values
data.drop('Cabin', axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Age'].fillna(data['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)


In [8]:
# Convert 'Sex' column using Label Encoding
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])

# Convert 'Embarked' using One-Hot Encoding
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Check the first few rows to verify encoding
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,False,False
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,False,True
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,False,True


In [9]:
# Create 'FamilySize' feature
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# Create 'IsAlone' feature
data['IsAlone'] = np.where(data['FamilySize'] == 1, 1, 0)

# Check the new features
data[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head()

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,1,0,2,0
1,1,0,2,0
2,0,0,1,1
3,1,0,2,0
4,0,0,1,1


In [10]:
scaler = StandardScaler()
data[['Age', 'Fare', 'FamilySize']] = scaler.fit_transform(data[['Age', 'Fare', 'FamilySize']])

# Check the scaled features
data[['Age', 'Fare', 'FamilySize']].head()

Unnamed: 0,Age,Fare,FamilySize
0,-0.565736,-0.502445,0.05916
1,0.663861,0.786845,0.05916
2,-0.258337,-0.488854,-0.560975
3,0.433312,0.42073,0.05916
4,0.433312,-0.486337,-0.560975


In [11]:
# Separate features (X) and target (y)
X = data.drop(['PassengerId', 'Name', 'Ticket', 'Survived'], axis=1)
y = data['Survived']

In [13]:
# Ensure all values in X are non-negative
X[X < 0] = 0

# Use SelectKBest to select top 5 features based on chi-squared test
selector = SelectKBest(score_func=chi2, k=5)
X_new = selector.fit_transform(X, y)

# Display the scores of each feature
selected_features = pd.DataFrame({
    'Feature': X.columns,
    'Score': selector.scores_
}).sort_values(by='Score', ascending=False)

print(selected_features)

      Feature       Score
5        Fare  117.233400
1         Sex   92.702447
0      Pclass   30.873699
9     IsAlone   14.640793
4       Parch   10.097499
7  Embarked_S    5.489205
8  FamilySize    4.046859
3       SibSp    2.581865
2         Age    0.103042
6  Embarked_Q    0.010847


In [14]:
# Use RFE to select the top 5 features
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(X, y)

# Display selected features and their rankings
selected_features_rfe = pd.DataFrame({
    'Feature': X.columns,
    'Selected': fit.support_,
    'Ranking': fit.ranking_
}).sort_values(by='Ranking')

print(selected_features_rfe)

      Feature  Selected  Ranking
0      Pclass      True        1
1         Sex      True        1
2         Age      True        1
8  FamilySize      True        1
9     IsAlone      True        1
7  Embarked_S     False        2
3       SibSp     False        3
4       Parch     False        4
5        Fare     False        5
6  Embarked_Q     False        6


In [15]:
# Fit a Random Forest Classifier
model = RandomForestClassifier()
model.fit(X, y)

# Get feature importance
importances = model.feature_importances_

# Display the feature importance
feature_importance_rf = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_rf)

      Feature  Importance
1         Sex    0.353271
2         Age    0.183765
0      Pclass    0.127187
5        Fare    0.125003
8  FamilySize    0.068257
3       SibSp    0.040284
4       Parch    0.036447
7  Embarked_S    0.033384
6  Embarked_Q    0.016387
9     IsAlone    0.016016


In [16]:
# Apply PCA to reduce dimensions to 2
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Display explained variance ratio and the first 5 rows of the new components
print('Explained Variance Ratio:', pca.explained_variance_ratio_)
print('PCA Components:', X_pca[:5])

Explained Variance Ratio: [0.41706977 0.22061589]
PCA Components: [[ 0.16229872 -0.78274658]
 [ 0.17288646  1.3913265 ]
 [-0.71748002 -0.69622037]
 [ 0.16737685  1.01688818]
 [-0.80424101 -0.67853389]]
