In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
titanic_path = os.path.join("titanic")

def load_data(filename, t_path=titanic_path):
    """Load a CSV file from the dataset directory into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``t_path``.
    t_path : str, optional
        Directory that contains the file; defaults to ``titanic_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    full_path = os.path.join(t_path, filename)
    frame = pd.read_csv(full_path)
    return frame

# Read the training and test splits from the dataset directory.
train_data, test_data = (load_data(name) for name in ("train.csv", "test.csv"))


In [3]:
# Preview the first five rows of the training data.
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Summary statistics (count, mean, std, quartiles) of the training data columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Column dtypes and non-null counts; used to spot columns with missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
# Frequency of each value in the 'Sex' column.
train_data['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [11]:
# Frequency of each value in the 'Pclass' column.
train_data['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [12]:
# Frequency of each value in the 'Embarked' column.
train_data['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [16]:
# As shown above, there are three categorical attributes (Sex, Pclass, Embarked); the describe() table summarizes the numerical attributes.

In [23]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns of a DataFrame by label.

    Allows a DataFrame to be fed into a scikit-learn pipeline: only the
    configured column(s) are handed to the next step.
    """

    def __init__(self, attribute_name):
        # Column label(s) used to index the input in transform().
        self.attribute_name = attribute_name

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured column label(s)."""
        selected = X[self.attribute_name]
        return selected

# Numerical branch: select the numeric columns, fill missing values with the
# column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("num_selector", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])
num_pipeline.fit_transform(train_data)
        

array([[-0.56573646,  0.43279337, -0.47367361, -0.50244517],
       [ 0.66386103,  0.43279337, -0.47367361,  0.78684529],
       [-0.25833709, -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, -0.17626324],
       [-0.25833709, -0.4745452 , -0.47367361, -0.04438104],
       [ 0.20276197, -0.4745452 , -0.47367361, -0.49237783]])

In [28]:
class feature_imputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value.

    The values are learned in ``fit`` and stored in ``most_frequent``, a
    Series indexed by the column names of the fitted DataFrame.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-null value of every column of ``X``.

        A column with no non-null values has no most frequent value; its
        entry is stored as NaN instead of raising ``IndexError``.
        """
        def _top_value(column):
            # value_counts() excludes NaN and sorts by descending count, so
            # the first index entry is the most frequent observed value.
            counts = column.value_counts()
            return counts.index[0] if len(counts) else np.nan

        self.most_frequent = pd.Series([_top_value(X[c]) for c in X], index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced by the learned values."""
        # Columns without a learned value (NaN entry) are left untouched.
        return X.fillna(self.most_frequent.dropna())

# Categorical branch: select the categorical columns, fill missing values
# with the most frequent value, then one-hot encode into a dense array.
cat_pipeline = Pipeline(steps=[
    ("cat_selector", DataFrameSelector(["Sex", "Pclass", "Embarked"])),
    ("imputer", feature_imputer()),
    ("encoder", OneHotEncoder(sparse_output=False)),
])
cat_pipeline.fit_transform(train_data)

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [30]:
# Combine the numerical and categorical branches into one feature matrix.
preprocessed_pipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)
x_train = preprocessed_pipeline.fit_transform(train_data)
x_train

array([[-0.56573646,  0.43279337, -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.66386103,  0.43279337, -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.1046374 ,  0.43279337,  2.00893337, ...,  0.        ,
         0.        ,  1.        ],
       [-0.25833709, -0.4745452 , -0.47367361, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20276197, -0.4745452 , -0.47367361, ...,  0.        ,
         1.        ,  0.        ]])

In [32]:
# Target labels: the "Survived" column of the training data.
y_train = train_data["Survived"]

In [35]:
# Fit a support vector classifier on the preprocessed training features.
svm_clf = SVC(gamma="auto")
svm_clf.fit(x_train, y_train)

In [38]:
# Apply the already-fitted preprocessing to the test set (transform only,
# no refit), then predict with the trained SVM.
x_test = preprocessed_pipeline.transform(test_data)
ypred = svm_clf.predict(x_test)

In [40]:
# 10-fold cross-validation of the SVM on the preprocessed training set.
# NOTE(review): x_train comes from fitting the preprocessing on all of
# train_data, so every validation fold has influenced the imputation and
# scaling statistics; the scores may be slightly optimistic.
svm_scores = cross_val_score(svm_clf, x_train, y_train, cv=10)

In [41]:
# Mean of the cross-validation scores for the SVM.
svm_scores.mean()

0.8249313358302123

In [42]:
# Same 10-fold evaluation with a random forest; random_state is fixed so the
# result is repeatable.
rfs_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rfs_score = cross_val_score(rfs_clf, x_train, y_train, cv=10)
rfs_score.mean()

0.8081772784019975