<h1>Loading Necessary Libraries</h1>

In [267]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load Data

In [268]:
df=pd.read_csv('dataset/train.csv')

# Simple Visualization

In [269]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [270]:
df.shape

(891, 12)

In [271]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
622,623,1,3,"Nakid, Mr. Sahid",male,20.0,1,1,2653,15.7417,,C
833,834,0,3,"Augustsson, Mr. Albert",male,23.0,0,0,347468,7.8542,,S
593,594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C


# Here Name shouldn't be so relatable so dropping them

In [272]:
df.drop(columns=['Name'],inplace=True)

# Get In-Depth Info

In [273]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


We have 891 total datas and in some we can observe value for nulls. 
Checking for null values for each columns

In [274]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [275]:
# Determining how to fill datas for age
print('Oldest Passenger was of:',df['Age'].max(),'Years')
print('Youngest Passenger was of:',df['Age'].min(),'Years')
print('Average Age on the ship:',df['Age'].mean(),'Years')

#So maybe mean would be appropriate

Oldest Passenger was of: 80.0 Years
Youngest Passenger was of: 0.42 Years
Average Age on the ship: 29.69911764705882 Years


In [276]:
# Determining how to fill datas for Cabins
print(len(df['Cabin'].unique()))

print(">>>>>>>>>>>>>>")
#Many unique Values. 
print(df['Cabin'].sample(10))

#Values seem not appropiate to take mean.


148
>>>>>>>>>>>>>>
476    NaN
176    NaN
360    NaN
852    NaN
590    NaN
574    NaN
114    NaN
235    NaN
454    NaN
858    NaN
Name: Cabin, dtype: object


# Self Thoughts

In [277]:
print("Number of null values", df['Embarked'].isnull().sum())

print("Number of unique values(including NaN)", len(df['Embarked'].unique()))

#Has 3 unique Values. 
print(df['Embarked'].value_counts())

#Appropiate to take maximum value for Embarked 'S'

Number of null values 2
Number of unique values(including NaN) 4
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64


Lets Fill NAN Feilds

Age has 177 null values
Cabin has 687 null values
Emabrked has 2 null values

We can drop them. But 177 and 687 are too large numbers to drop.

But we can replace mean value of age for age and for Cabin as there is no logic to replace the NaN value we instead drop the column instead of rows(So we dont loose data.)

In [278]:
df['Embarked'].fillna('S',inplace=True) #Replacing with max values
df['Age'].fillna(df['Age'].mean(),inplace=True) #Replacing with mean values
df.drop('Cabin',axis=1,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S',inplace=True) #Replacing with max values
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True) #Replacing with mean values


In [279]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,female,35.0,1,0,113803,53.1,S
4,5,0,3,male,35.0,0,0,373450,8.05,S


# Convert to numerical data for non numerical data

In [280]:
df['Sex']=df['Sex'].apply(lambda x:1 if x=='female' else 0)
df['Embarked'] = df['Embarked'].apply(lambda x: 0 if x == 'S' else (1 if x == 'C' else 2))

# Drop Ticket

In [281]:
df.drop('Ticket',axis=1,inplace=True)

<h1> Now Lets Look at Feature Selection
 </h1>

In [282]:
#Looking at correlation between columns with survival

df_comp=df.iloc[:,df.columns!='Survived']
df_comp.corrwith(df['Survived'])


PassengerId   -0.005007
Pclass        -0.338481
Sex            0.543351
Age           -0.069809
SibSp         -0.035322
Parch          0.081629
Fare           0.257307
Embarked       0.106811
dtype: float64

### here correlation with PassengerID Age SibSP and Parch is very low so dropping them

In [283]:
#dropping PassengerID , Age , SibSp , Parch
rows=['PassengerId','Age','SibSp','Parch']
for r in rows:
    df.drop(r,axis=1,inplace=True)

In [284]:
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked'], dtype='object')

# Unique Datas

In [285]:
#column names
col_names=df.columns.values 

for i in range(0,len(col_names)):
    uniq_datas=df[col_names[i]].unique() #unique datas of a column 
    
    if(len( uniq_datas ) < 10 ):
        print(col_names[i] ," : ", np.sort(uniq_datas))
    
    else:
        arr= df[col_names[i]]
        chunks = len(df[col_names[i]])/10
        print( col_names[i]," : " ,np.array_split(arr,chunks))
    

Survived  :  [0 1]
Pclass  :  [1 2 3]
Sex  :  [0 1]
Fare  :  [0      7.2500
1     71.2833
2      7.9250
3     53.1000
4      8.0500
5      8.4583
6     51.8625
7     21.0750
8     11.1333
9     30.0708
10    16.7000
Name: Fare, dtype: float64, 11    26.5500
12     8.0500
13    31.2750
14     7.8542
15    16.0000
16    29.1250
17    13.0000
18    18.0000
19     7.2250
20    26.0000
Name: Fare, dtype: float64, 21     13.0000
22      8.0292
23     35.5000
24     21.0750
25     31.3875
26      7.2250
27    263.0000
28      7.8792
29      7.8958
30     27.7208
Name: Fare, dtype: float64, 31    146.5208
32      7.7500
33     10.5000
34     82.1708
35     52.0000
36      7.2292
37      8.0500
38     18.0000
39     11.2417
40      9.4750
Name: Fare, dtype: float64, 41    21.0000
42     7.8958
43    41.5792
44     7.8792
45     8.0500
46    15.5000
47     7.7500
48    21.6792
49    17.8000
50    39.6875
Name: Fare, dtype: float64, 51     7.8000
52    76.7292
53    26.0000
54    61.9792
55    35

  return bound(*args, **kwds)


In [286]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Fare,Embarked
0,0,3,0,7.25,0
1,1,1,1,71.2833,1
2,1,3,1,7.925,0
3,1,1,1,53.1,0
4,0,3,0,8.05,0


In [287]:
#Checking correlations
df.corr()

Unnamed: 0,Survived,Pclass,Sex,Fare,Embarked
Survived,1.0,-0.338481,0.543351,0.257307,0.106811
Pclass,-0.338481,1.0,-0.1319,-0.5495,0.045702
Sex,0.543351,-0.1319,1.0,0.182333,0.116569
Fare,0.257307,-0.5495,0.182333,1.0,0.062142
Embarked,0.106811,0.045702,0.116569,0.062142,1.0


In [288]:
#Here Lets look between Pclass and Fare

df['Pclass'].corr(df['Fare'])

-0.5494996199439082

In [289]:
# A close correlation. Maybe we can discard that feature.
# checking between Survived

print("With Pclass")
print(df['Pclass'].corr(df['Survived']))

print("With Fare")
print(df['Fare'].corr(df['Survived']))

With Pclass
-0.3384810359610148
With Fare
0.2573065223849622


In [290]:
#Closer relation with Pclass so lets drop Fare
df.drop('Fare',axis=1,inplace=True)

In [291]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,0,0
1,1,1,1,1
2,1,3,1,0
3,1,1,1,0
4,0,3,0,0


In [292]:
x = df.drop('Survived', axis=1)
y = df['Survived'] 

<h1> Now Build Model </h1>

In [293]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [294]:
knn= KNeighborsClassifier()
svc= SVC()
dtc= DecisionTreeClassifier()
nb= GaussianNB()
lg= LogisticRegression()

In [295]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score,recall_score, precision_score, f1_score,classification_report

from sklearn.preprocessing import StandardScaler

In [296]:
models_params = {
    'knn': {
        'model':knn,
        'params':{
        'n_neighbors': [3, 5, 7, 9, 11, 13],
        'metric': ['cosine', 'euclidean', 'manhattan'],
        'weights': ['uniform', 'distance']
        }
    },
    
    'svc': {
        'model':svc,
        'params':{
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'linear']
        }
    },
    
    'dtc': {
        'model':dtc,
        'params':{
        'criterion': ['gini', 'entropy'],
        'max_depth': [2, 4, 6, 8, 10, 12]
        }
    },
    
    'nb': {
        'model':nb,
        'params':{
        'priors': [None],
        'var_smoothing': [0.00000001, 0.000000001, 0.00000001]
        }
    },
    
    'lg': {
        'model':lg,
        'params':{
        'C': [0.1, 1, 10, 100],         # Regularization parameter
        'penalty': ['l2', 'none'],       # Regularization type
        'solver': ['newton-cg', 'lbfgs', 'liblinear']  # Optimization algorithms
        }
    }
}


In [297]:
def train_evaluate_models(x,y,models_params):
    
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=11)

    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x)
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    results= {}

    for model_name, model_info in models_params.items():
        print(f"Training {model_name}...")
        model = model_info['model']
        params = model_info['params']

        # Perform grid search for hyperparameter tuning
        grid = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(x_train_scaled, y_train)
        
        # Best model from grid search
        best_model = grid.best_estimator_
        y_pred = best_model.predict(x_test_scaled)
        
        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
      

        # Save results
        results[model_name] = {
            'best_params': grid.best_params_,
            'accuracy': accuracy,
            'recall': recall,
            'precision': precision,
            'f1': f1
            
        }
    
    return results


In [298]:
results = train_evaluate_models(x, y, models_params)

Training knn...
Training svc...
Training dtc...
Training nb...
Training lg...


60 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\silwa\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\silwa\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "C:\Users\silwa\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\silwa\AppData\Roaming\Python\Python312\site-packages\sklearn\utils\_param_validati

In [299]:
# models={'knn':knn,'svc':svc,'dtc':dtc,'nb':nb,'lg':lg}
for model, metrics in results.items():
    print(f"\nModel: {model}")
    print(f"Best Parameters: {metrics['best_params']}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
   


Model: knn
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}
Accuracy: 0.8603
Recall: 0.7049
Precision: 0.8600
F1 Score: 0.7748

Model: svc
Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Accuracy: 0.8547
Recall: 0.6393
Precision: 0.9070
F1 Score: 0.7500

Model: dtc
Best Parameters: {'criterion': 'gini', 'max_depth': 4}
Accuracy: 0.8547
Recall: 0.6393
Precision: 0.9070
F1 Score: 0.7500

Model: nb
Best Parameters: {'priors': None, 'var_smoothing': 1e-08}
Accuracy: 0.8380
Recall: 0.8197
Precision: 0.7353
F1 Score: 0.7752

Model: lg
Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 0.8324
Recall: 0.7541
Precision: 0.7541
F1 Score: 0.7541
