In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # Fixed 'train test split' to 'train_test_split'
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, RFE  # Fixed 'Select Best' to 'SelectKBest'
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Fixed 'Random ForestClassifier' to 'RandomForestClassifier'
from sklearn.decomposition import PCA

#URL to load the Titanic dataset
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

#Load the dataset
data = pd.read_csv(url)

#Display the first few rows of the dataset
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

data['Sex'] = label_encoder.fit_transform(data['Sex'])

data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,False,False
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,False,True
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,False,True


In [17]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
data['IsAlone'] = np.where(data['FamilySize'] == 1, 1, 0)
data[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head()

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,1,0,2,0
1,1,0,2,0
2,0,0,1,1
3,1,0,2,0
4,0,0,1,1


In [19]:
scaler = StandardScaler()
data[['Age', 'Fare', 'FamilySize']] = scaler.fit_transform(data[['Age', 'Fare', 'FamilySize']])
data[['Age', 'Fare', 'FamilySize']].head()

Unnamed: 0,Age,Fare,FamilySize
0,-0.565736,-0.502445,0.05916
1,0.663861,0.786845,0.05916
2,-0.258337,-0.488854,-0.560975
3,0.433312,0.42073,0.05916
4,0.433312,-0.486337,-0.560975


In [29]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Assuming 'data' is your DataFrame with features and target
X = data.drop(['PassengerId', 'Name', 'Ticket', 'Survived'], axis=1)  # Features
y = data['Survived']  # Target

# Initialize Logistic Regression model
model = LogisticRegression()

# Correct the parameter name to 'n_features_to_select'
rfe = RFE(model, n_features_to_select=5)

# Fit the RFE model to the data
rfe.fit(X, y)

# Create a DataFrame to display selected features and their rankings
selected_features_rfe = pd.DataFrame({
    'Feature': X.columns,
    'Selected': rfe.support_,  # Corrected the attribute name (underscore)
    'Ranking': rfe.ranking_    # Corrected the attribute name (underscore)
}).sort_values(by='Ranking')

# Print the DataFrame with the selected features and rankings
print(selected_features_rfe)


      Feature  Selected  Ranking
0      Pclass      True        1
1         Sex      True        1
2         Age      True        1
8  FamilySize      True        1
9     IsAlone      True        1
7  Embarked_S     False        2
3       SibSp     False        3
5        Fare     False        4
4       Parch     False        5
6  Embarked_Q     False        6


In [35]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Assuming 'data' is already loaded and 'X' and 'y' are prepared
X = data.drop(['PassengerId', 'Name', 'Ticket', 'Survived'], axis=1)  # Features
y = data['Survived']  # Target

# Logistic Regression model for RFE
model_lr = LogisticRegression()
rfe = RFE(model_lr, n_features_to_select=5)  # Corrected the parameter name

# Fit RFE
rfe.fit(X, y)

# Create a DataFrame for the selected features and their rankings
selected_features_rfe = pd.DataFrame({
    'Feature': X.columns,
    'Selected': rfe.support_,  # Corrected attribute name (underscore)
    'Ranking': rfe.ranking_    # Corrected attribute name (underscore)
}).sort_values(by='Ranking')

print("Selected features using RFE:")
print(selected_features_rfe)

# RandomForestClassifier model for feature importance
model_rf = RandomForestClassifier()
model_rf.fit(X, y)

# Get the feature importances
importances = model_rf.feature_importances_  # Corrected attribute name (underscore)

# Create a DataFrame for feature importances and sort by importance
feature_importance_rf = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("\nFeature importances using RandomForestClassifier:")
print(feature_importance_rf)


Selected features using RFE:
      Feature  Selected  Ranking
0      Pclass      True        1
1         Sex      True        1
2         Age      True        1
8  FamilySize      True        1
9     IsAlone      True        1
7  Embarked_S     False        2
3       SibSp     False        3
5        Fare     False        4
4       Parch     False        5
6  Embarked_Q     False        6

Feature importances using RandomForestClassifier:
      Feature  Importance
1         Sex    0.267321
5        Fare    0.263452
2         Age    0.247577
0      Pclass    0.080988
8  FamilySize    0.050047
3       SibSp    0.024448
7  Embarked_S    0.022888
4       Parch    0.022277
9     IsAlone    0.011033
6  Embarked_Q    0.009969


In [37]:
pca = PCA(ncomponents=2)

X_pca = pca.fit_transform(X)

print("Explained Variance Ratio:", pca.explained_variance_ratio)
print('PCA Components:', X_pca[:5])

TypeError: PCA.__init__() got an unexpected keyword argument 'ncomponents'