In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
import warnings
warnings.filterwarnings('ignore')

In [36]:
df=pd.read_excel('titanic_data.xlsx')

In [37]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
df.drop(columns=['Name','Ticket','Cabin'],axis=1,inplace=True)

In [39]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [40]:
df['Age'].fillna(df['Age'].median(),inplace=True)

In [41]:
df['Age'].isnull().sum()

0

In [42]:
df['Embarked'].fillna(df['Embarked'].mode()[0],inplace=True)

In [43]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [44]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C


In [45]:
df.Sex.unique()

array(['male', 'female'], dtype=object)

In [46]:
df.Sex.replace({'male':1, 'female':2},inplace=True)

In [47]:
df.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [48]:
df.Embarked.replace({'S':1, 'C':2, 'Q':3},inplace=True)

In [49]:
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,1
1,2,1,1,2,38.0,1,0,71.2833,2


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 62.8 KB


In [51]:
X=df.drop(columns=['Survived'],axis=1)

In [52]:
y=df['Survived']

In [53]:
from sklearn.preprocessing import MinMaxScaler

In [54]:
scalar=MinMaxScaler()

In [55]:
scaled_X=scalar.fit_transform(X)

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [59]:
lr=LogisticRegression()
svm=SVC()
rf=RandomForestClassifier()

In [60]:
model_param={
    'svm':{
        'model':SVC(),
        'params':{
            'C':[1,10,50,70,100],
            'gamma':[.1,.01,.001,.0001,1],
            'kernel':['rbf','linear','poly'] } },
    'lr':{
        'model':LogisticRegression(),
          'params': {
              'solver':['lbfgs','liblinear'],
              'C':[1,5,10]}},
    'rf':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[10,50,70,100],
            'criterion':['gini','entropy']}},  

}

In [61]:
model_param

{'svm': {'model': SVC(),
  'params': {'C': [1, 10, 50, 70, 100],
   'gamma': [0.1, 0.01, 0.001, 0.0001, 1],
   'kernel': ['rbf', 'linear', 'poly']}},
 'lr': {'model': LogisticRegression(),
  'params': {'solver': ['lbfgs', 'liblinear'], 'C': [1, 5, 10]}},
 'rf': {'model': RandomForestClassifier(),
  'params': {'n_estimators': [10, 50, 70, 100],
   'criterion': ['gini', 'entropy']}}}

In [62]:
model_param.items()

dict_items([('svm', {'model': SVC(), 'params': {'C': [1, 10, 50, 70, 100], 'gamma': [0.1, 0.01, 0.001, 0.0001, 1], 'kernel': ['rbf', 'linear', 'poly']}}), ('lr', {'model': LogisticRegression(), 'params': {'solver': ['lbfgs', 'liblinear'], 'C': [1, 5, 10]}}), ('rf', {'model': RandomForestClassifier(), 'params': {'n_estimators': [10, 50, 70, 100], 'criterion': ['gini', 'entropy']}})])

In [63]:
from sklearn.model_selection import GridSearchCV

In [64]:
score=[]
for model_name,model_parameters in model_param.items():
    final_clf=GridSearchCV(model_parameters['model'],model_parameters['params'],cv=5)
    final_clf.fit(X_train,y_train)
    score.append({
        'model':model_name,
        'best_score':final_clf.best_score_,
        'best_params':final_clf.best_params_
    })

In [65]:
pd.DataFrame(score,columns=['model','best_score','best_params'])

Unnamed: 0,model,best_score,best_params
0,svm,0.815995,"{'C': 10, 'gamma': 1, 'kernel': 'rbf'}"
1,lr,0.799143,"{'C': 1, 'solver': 'lbfgs'}"
2,rf,0.813169,"{'criterion': 'gini', 'n_estimators': 70}"


In [67]:
model=SVC(C=10,gamma=1,kernel='rbf')

In [68]:
model.fit(X_train,y_train)

SVC(C=10, gamma=1)

In [69]:
model_scores=[]
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.33, random_state=i)
    model.fit(X_train,y_train)
    model_scores.append(model.score(X_test,y_test))

In [70]:
np.argmax(model_scores)

400

In [71]:
model_scores[400]

0.8745762711864407

In [74]:
pred=model.predict(scaled_X)

In [96]:
dic={'Passenger_ID':df['PassengerId'].values,
    'Prediction':pred,
    'actual_value':y.values}

In [109]:
new=pd.DataFrame([df['PassengerId'],pred,y]).transpose()

In [119]:
new.rename(columns={'Unnamed 0':'Prediction'})

Unnamed: 0,PassengerId,Prediction,Survived
0,1,0,0
1,2,1,1
2,3,0,1
3,4,1,1
4,5,0,0
...,...,...,...
886,887,0,0
887,888,1,1
888,889,0,0
889,890,0,1
