In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("C:/Users/sumit/Downloads/titanic.csv")
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [3]:
new_data = data[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Survived"]].copy()

In [4]:
new_data.head(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1


In [5]:
new_data.isnull().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
si = SimpleImputer(strategy = "mean")
new_data["Age"] = si.fit_transform(new_data[["Age"]])

In [8]:
new_data["Embarked"] = new_data["Embarked"].fillna(new_data["Embarked"].mode()[0])    #[0] take the first mode from the list

# Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()
new_data["Embarked"] = le.fit_transform(new_data["Embarked"])

In [11]:
sex_en = {"male":0,"female":1}

In [12]:
new_data["Sex"] = new_data["Sex"].map(sex_en)

# cross validation score

In [13]:
#Train and evaluate the following models using cross-validation:
    #Logistic Regression
    #Support Vector Classifier (SVC)
    #Random Forest
    #K-Nearest Neighbors (KNN)
    #Decision Tree

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [15]:
x = new_data.iloc[:,:-1]
y = new_data["Survived"]

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

In [18]:
models = {"LogisticRegression":LogisticRegression(max_iter = 1000),
          "SVC":SVC(kernel = "linear"),
          "RandomForestClassifier":RandomForestClassifier(n_estimators = 4),
          "KNeighborsClassifier": KNeighborsClassifier(n_neighbors = 4),
          "DecisionTreeClassifier":DecisionTreeClassifier()
         }

for name,model in models.items():
    score = cross_val_score(model,x_train,y_train,cv=5)
    print(f"{name}: {score.mean():.4f}")

LogisticRegression: 0.7921
SVC: 0.7879
RandomForestClassifier: 0.7894
KNeighborsClassifier: 0.6615
DecisionTreeClassifier: 0.7528


# Ensembling & voting

In [19]:
#Now use a VotingClassifier to combine the strengths of multiple models.
#Steps:
    #Combine Logistic Regression, SVC, and Random Forest.
    #Use both hard voting and soft voting.

In [20]:
from sklearn.ensemble import VotingClassifier

In [21]:
lr = LogisticRegression()
rf = RandomForestClassifier()
sv = SVC()

In [22]:
lr.fit(x_train, y_train)
rf.fit(x_train, y_train)
sv.fit(x_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
li = [("lr",LogisticRegression(max_iter = 1000)),("rf",RandomForestClassifier(n_estimators=5)),("sv",SVC(kernel="linear",probability=True))]

In [26]:
vc=VotingClassifier(li,voting="hard")
vc.fit(x_train,y_train)

In [27]:
prd = {"lr":lr.predict(x_test),"rf":rf.predict(x_test),"sv": sv.predict(x_test),"vc":vc.predict(x_test)}

In [28]:
prd = pd.DataFrame(prd)

In [29]:
prd.head(4)

Unnamed: 0,lr,rf,sv,vc
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,1,1,0,1


In [30]:
vc1=VotingClassifier(li,voting="soft")
vc1.fit(x_train,y_train)

In [31]:
prd1 = {"lr":lr.predict(x_test),"rf":rf.predict(x_test),"sv": sv.predict(x_test),"vc":vc.predict(x_test)}

In [32]:
prd1 = pd.DataFrame(prd1)

In [33]:
prd1.head(4)

Unnamed: 0,lr,rf,sv,vc
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,1,1,0,1


# Feature selection

In [34]:
#Feature Importance from Random Forest

In [35]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [36]:
fs = SequentialFeatureSelector(rf,k_features = 5,forward =True)
fs.fit(x,y)

In [37]:
fs.feature_names

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

In [38]:
fs.k_feature_names_

('Pclass', 'Sex', 'Age', 'Parch', 'Fare')

In [39]:
fs.k_score_

np.float64(0.8182537191638943)