In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
from sklearn import preprocessing
label_encoder_1=preprocessing.LabelEncoder()
label_encoder_2=preprocessing.LabelEncoder()
label_encoder_3=preprocessing.LabelEncoder()

In [4]:
def pipeline(dataset):
  dataset=dataset.drop(columns=['PassengerId','Name','Ticket','Cabin'])
  dataset['Sex']= label_encoder_1.fit_transform(dataset['Sex'])
  dataset['Embarked']= label_encoder_2.fit_transform(dataset['Embarked'])
  dataset['Pclass']= label_encoder_3.fit_transform(dataset['Pclass'])
  return dataset

In [5]:
def test_pipeline(dataset):
  dataset=dataset.drop(columns=['PassengerId','Name','Ticket','Cabin'])
  dataset['Sex']= label_encoder_1.transform(dataset['Sex'])
  dataset['Embarked']= label_encoder_2.transform(dataset['Embarked'])
  dataset['Pclass']= label_encoder_3.transform(dataset['Pclass'])
  return dataset

In [6]:
#Replacing the missing values by mean
dataset=pipeline(df)
dataset=dataset.fillna(dataset.mean(numeric_only=True))

In [7]:
#Convert all values to float
dataset = dataset.astype(float)
dataset.info()
# dataset['SibSp'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Sex       891 non-null    float64
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    float64
 5   Parch     891 non-null    float64
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    float64
dtypes: float64(8)
memory usage: 55.8 KB


In [8]:
X=dataset.drop(columns=['Survived']) # Feature matrix
y=dataset['Survived'] # Output label

In [9]:
# X['Embarked'].value_counts()

In [10]:
X=X.to_numpy() #Feature Matrix(Training Input)
y=y.to_numpy() #Training Labels

In [11]:
# Bias term
X = np.hstack((np.ones((X.shape[0],1)), X)) 

In [12]:
X.shape

(891, 8)

In [13]:
y=np.reshape(y,(-1,1))
y.shape

(891, 1)

In [14]:
# Split the data into training and validation sets
from sklearn.model_selection import train_test_split
X_train_unscaled, X_val_unscaled, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=180)

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
X_train = X_train_unscaled.copy()
X_val = X_val_unscaled.copy()  

# Create a list of column indices to exclude from scaling
columns_to_exclude = [0, 1, 2, 7]

# Create a list of columns to scale (excluding columns to exclude)
columns_to_scale = [i for i in range(X_train_unscaled.shape[1]) if i not in columns_to_exclude]

# Fit the scaler on the training data and transform both training and validation data for selected columns
X_train[:, columns_to_scale] = scaler.fit_transform(X_train_unscaled[:, columns_to_scale])
X_val[:, columns_to_scale] = scaler.transform(X_val_unscaled[:, columns_to_scale])

In [16]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

In [17]:
def logistic_regression(X, y, initial_rate, iterations, reg, decay_rate):
    N, D = X.shape
    np.random.seed(180)  # Set a specific seed (e.g., 42)
    w = np.random.randn(D, 1) * 0.01  # Initialize with small random values
    
    for i in range(iterations):
        h = sigmoid(np.dot(X, w))
        gradient = (np.dot(X.T, (h - y))) * (1 / N) + reg * w
        
        # Update learning rate using decay formula
        rate = initial_rate / (1 + decay_rate * i)
        
        w = w - rate * gradient
    
    return w


In [18]:
# Predict function
def predict(X, w):
    probabilities = sigmoid(np.dot(X, w))
    return [1 if x >= 0.5 else 0 for x in probabilities]

In [19]:
initial_rate = 0.01
iterations = 100
reg = 1
decay_rate = 0.01  # Adjust the decay rate as needed

w = logistic_regression(X_train, y_train, initial_rate, iterations, reg, decay_rate)

In [20]:
predictions_val = predict(X_val, w)
accuracy = np.mean(predictions_val == y_val) * 100
print(f'Validation set accuracy: {accuracy:.2f}%')

Validation set accuracy: 69.65%


In [21]:
test_df = pd.read_csv('test.csv')

In [22]:
#Replacing the missing values by mean
X_test=test_pipeline(test_df)
X_test=X_test.fillna(X_test.mean(numeric_only=True))
X_test = X_test.astype(float)
# dataset=dataset.fillna(dataset.mean(numeric_only=True))

In [23]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    float64
 1   Sex       418 non-null    float64
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    float64
 4   Parch     418 non-null    float64
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    float64
dtypes: float64(7)
memory usage: 23.0 KB


In [24]:
# Bias term
X_test = np.hstack((np.ones((X_test.shape[0],1)), X_test)) 

In [25]:
X_test=np.array(X_test)

In [26]:
columns_to_exclude = [0, 1, 2, 7]

# Create a list of columns to scale (excluding columns to exclude)
columns_to_scale = [i for i in range(X_test.shape[1]) if i not in columns_to_exclude]

# Fit the scaler on the training data and transform both training and validation data for selected columns
X_test[:, columns_to_scale] = scaler.transform(X_test[:, columns_to_scale])

In [27]:
# Make predictions
predictions = predict(X_test, w)

In [28]:
# predictions
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})
submission.to_csv('Predictions.csv', index=False)
submission['Survived'].value_counts()

Survived
0    390
1     28
Name: count, dtype: int64