In [66]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic-survivore/train.csv
/kaggle/input/titanic-survivore/test.csv


In [67]:
def add_intercept(x):
    x["intercept"]=1
    return x

In [68]:
def load_data(path, label=["Survived"], in_attri = ["Pclass", "Sex", "Age", "Fare"], add_intercept=False):
    def addIntercept(x):
        global add_intercept
        return add_intercept(x)
    if in_attri is not None:
        input = pd.read_csv(path, delimiter = ',', usecols = in_attri)
    else:
        input = pd.read_csv(path, delimiter = ',')
    output = None
    if label is not None:
        output = pd.read_csv(path, delimiter = ',', usecols = label)
    if input.ndim == 1:
        input = np.expand_dims(input, -1)
    if add_intercept:
        input = addIntercept(input)
    return input, output
    

In [69]:
input, output = load_data("/kaggle/input/titanic-survivore/train.csv", add_intercept=True)
input.loc[input["Sex"]=="male", "Sex"] = 1
input.loc[input["Sex"]=="female", "Sex"] = 0


In [70]:
def nanHandle(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The argument must be a pandas DataFrame.")
    df = df.infer_objects()
    df = df.interpolate(method='linear', axis=0, limit_direction='both')
    df = df.fillna(0)
    return df

In [71]:
input = nanHandle(input)
input

Unnamed: 0,Pclass,Sex,Age,Fare,intercept
0,3,1,22.0,7.2500,1
1,1,0,38.0,71.2833,1
2,3,0,26.0,7.9250,1
3,1,0,35.0,53.1000,1
4,3,1,35.0,8.0500,1
...,...,...,...,...,...
886,2,1,27.0,13.0000,1
887,1,0,19.0,30.0000,1
888,3,0,22.5,23.4500,1
889,1,1,26.0,30.0000,1


In [72]:
input.isna().any()

Pclass       False
Sex          False
Age          False
Fare         False
intercept    False
dtype: bool

In [73]:
def sigmoid(x):
    return 1 / (1 + np.exp(np.clip(-x,-500,500)))
def d_sigmoid(x):
    return sigmoid(x)*(1-sigmoid(x))

In [74]:
class newton_Linear_model():
    def __init__(self, max_itr = 100, learning_rate = 0.1, eps = 1e-5, theta = None):
        self.max_itr = max_itr
        self.learning_rate = learning_rate
        self.eps = eps
        self.theta = theta
    def train(self, x, y):
        x = x.to_numpy().astype(float)
        m, n = x.shape
        y = y.to_numpy().astype(float).reshape(m)
        self.theta = np.random.rand(n)
        lambda_ = 1e-4
        for i in range(self.max_itr):
            y_prid = sigmoid(x @ self.theta)
            gradient = (1/m) * x.T @ (y_prid - y)
            diag = y_prid * (1 - y_prid)
            H = (1/m) * (x.T @ (diag[:, np.newaxis] * x)) + lambda_ * np.eye(n)
            tPre = self.theta.copy()
            try:
                self.theta -= np.linalg.inv(H) @ gradient
            except np.linalg.LinAlgError:
                print(f"Iteration {i}: Hessian is singular. Switching to pseudoinverse.")
                self.theta -= np.linalg.pinv(H) @ gradient
            if np.linalg.norm(self.theta - tPre, ord=1) < self.eps:
                print(f"Converged after {i} iterations.")
                break
        

    def predict(self, x, x_whole, path):
        x = x.to_numpy().astype(float)
        m,n = x.shape
        yPrid = sigmoid(x@self.theta)
        pid = pd.DataFrame({"PassangerID": x_whole.iloc[:,0]})
        survivore = pd.DataFrame({"Survivore": yPrid})
        survivore[survivore["Survivore"]>0.5] = 1
        survivore[survivore["Survivore"]!=1] = 0
        submitFile = pd.concat([pid,survivore],axis=1)
        submitFile.to_csv(path, index=False)
        

In [75]:
model_instance = newton_Linear_model()


In [76]:
model_instance.train(input,output)

In [77]:
px, _= load_data("/kaggle/input/titanic-survivore/test.csv", label = None, add_intercept = True)
px_whole, _ = load_data("/kaggle/input/titanic-survivore/test.csv", label = None, in_attri = None)

In [78]:
px.loc[px["Sex"]=="male", "Sex"] = 1
px.loc[px["Sex"]=="female", "Sex"] = 0
px = nanHandle(px)
print(px)

     Pclass  Sex   Age      Fare  intercept
0         3    1  34.5    7.8292          1
1         3    0  47.0    7.0000          1
2         2    1  62.0    9.6875          1
3         3    1  27.0    8.6625          1
4         3    0  22.0   12.2875          1
..      ...  ...   ...       ...        ...
413       3    1  33.5    8.0500          1
414       1    0  39.0  108.9000          1
415       3    1  38.5    7.2500          1
416       3    1  38.5    8.0500          1
417       3    1  38.5   22.3583          1

[418 rows x 5 columns]


In [79]:
model_instance.predict(px, px_whole, "submit.csv")

In [80]:
model_instance.theta

array([ -68643.94809547, -104961.51755582,  -26716.61217833,
        183846.59557884,   -4987.44712774])

In [81]:
hh = pd.read_csv("/kaggle/working/final_submit.csv")

In [82]:
hh

Unnamed: 0,PassangerID,Survivore
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,1.0
...,...,...
413,1305,0.0
414,1306,1.0
415,1307,0.0
416,1308,0.0
