## Loading and Cleaning Training Data

## Prerequisites

In [2]:
!pip install seaborn



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sn

### Train set

In [3]:
# Load the data from a CSV file
df = pd.read_csv('Dataset/train.csv')

# Display the first few rows of the dataset
df.head()

# Drop specific columns by name
df = df.drop(columns=['Name', 'SibSp','Parch','Ticket','Fare', 'Cabin', 'Embarked'  ])

# Display the updated DataFrame to verify columns are removed
print(df.head())

# Optionally, save the modified DataFrame back to a new CSV file
df.to_csv('Dataset/train_modi.csv', index=False)

   PassengerId  Survived  Pclass     Sex   Age
0            1         0       3    male  22.0
1            2         1       1  female  38.0
2            3         1       3  female  26.0
3            4         1       1  female  35.0
4            5         0       3    male  35.0


In [4]:

# Fill missing values for 'Age' with the median or use any other strategy
df['Age'].fillna(df['Age'].median(), inplace=True)

# Ensure there are no missing values in the columns of interest
print(df[['Pclass', 'Sex', 'Age']].isnull().sum())

Pclass    0
Sex       0
Age       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [17]:
# Load the dataset
df = pd.read_csv('Dataset/train_modi.csv')

# Drop rows where 'Age' column has missing values
df_cleaned = df.dropna(subset=['Age'])

# Save the cleaned DataFrame to a new CSV file
df_cleaned.to_csv('Dataset/train_modi.csv', index=False)

In [20]:
# Check for missing values in features
print(df_cleaned.isnull().sum())


PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64


In [19]:
# Check for basic statistics
data.describe()

# Check for missing values
data.isnull().sum()

# Check data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survived     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB


### Test Data

In [14]:
# Load the data from a CSV file
df = pd.read_csv('Dataset/test.csv')

# Display the first few rows of the dataset
df.head()

# Drop specific columns by name
df = df.drop(columns=['Name', 'SibSp','Parch','Ticket','Fare', 'Cabin', 'Embarked'  ])

# Display the updated DataFrame to verify columns are removed
print(df.head())

# Optionally, save the modified DataFrame back to a new CSV file
df.to_csv('Dataset/test_modi.csv', index=False)

   PassengerId  Pclass     Sex   Age
0          892       3    male  34.5
1          893       3  female  47.0
2          894       2    male  62.0
3          895       3    male  27.0
4          896       3  female  22.0


In [15]:
# Load the data from a CSV file
data = pd.read_csv('Dataset/gender_submission.csv')

# Display the first few rows of the dataset
data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Logistic Regression Model with multiple parameters

In [25]:
df = pd.read_csv('Dataset/train_modi.csv')
print(df.head())

   PassengerId  Survived  Pclass     Sex   Age
0            1         0       3    male  22.0
1            2         1       1  female  38.0
2            3         1       3  female  26.0
3            4         1       1  female  35.0
4            5         0       3    male  35.0


In [26]:
# Convert 'Sex' to numeric values (0 for male, 1 for female)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

# Load the dataset
df = pd.read_csv('Dataset/train_modi.csv')

# Label Encoding for 'Sex'
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# Separate features and target variable
X = df[['Pclass', 'Sex', 'Age']]
y = df['Survived']  # Make sure 'Survived' is the target column

# Create and fit the model
model = LogisticRegression(max_iter=1000)
model.fit(X, y)

In [28]:
y_pred = model.predict(X)

In [29]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

Accuracy: 0.7899159663865546
Precision: 0.7554744525547445
Recall: 0.7137931034482758
F1 Score: 0.7340425531914894
Confusion Matrix:
[[357  67]
 [ 83 207]]


In [30]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Average cross-validation score: {cv_scores.mean()}')

Cross-validation scores: [0.76923077 0.83216783 0.77622378 0.74825175 0.8028169 ]
Average cross-validation score: 0.7857382054565154


In [31]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X, y)
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')


Best parameters: {'C': 0.1, 'solver': 'lbfgs'}
Best score: 0.7927410617551462


In [36]:
import pandas as pd

# Load test data
test_data = pd.read_csv('Dataset/test_modi.csv')

# Display the first few rows of the test data
print(test_data.head())


   PassengerId  Pclass     Sex   Age
0          892       3    male  34.5
1          893       3  female  47.0
2          894       2    male  62.0
3          895       3    male  27.0
4          896       3  female  22.0


In [37]:
# Handle missing values - Example: fill missing Age values with median
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

# Convert categorical variables - Example: encode 'Sex'
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})  # Adjust based on your encoding

# Ensure all features used for training are present in the test data
X_test = test_data[['Pclass', 'Sex', 'Age']]

# Make predictions
predictions = model.predict(X_test)

# Display the predictions
print(f'Predictions for test data:\n{predictions}')


Predictions for test data:
[0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 0
 0 1 1 0 1 0 0 0 1 1 1 0 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 1 0 1
 0 1 1 0 1 0 1 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 1
 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1
 1 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1
 1 0 1 0 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 0 0 1 0 1 0 1 0 1
 0 1 0 0 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 0
 1 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1
 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1
 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 1 0 0 1 1]


In [38]:
# Add predictions to the test data DataFrame
test_data['Predictions'] = predictions

# Save to a new CSV file
test_data.to_csv('Dataset/test_predictions.csv', index=False)

In [39]:
df = pd.read_csv('Dataset/test_predictions.csv')
print(df.head())

   PassengerId  Pclass  Sex   Age  Predictions
0          892       3    0  34.5            0
1          893       3    1  47.0            0
2          894       2    0  62.0            1
3          895       3    0  27.0            1
4          896       3    1  22.0            0


In [43]:
# Let's say you want only the 'Pclass', 'Sex', 'Age', and 'Predictions' columns
selected_columns = test_data[['PassengerId','Survived']]

# # Rename a single column
# test_data.rename(columns={'Predictions': 'Survived'}, inplace=True)

# Save the selected columns to a new CSV file
selected_columns.to_csv('Dataset/test_submission.csv', index=False)

# Display the new file
print(selected_columns.head())

   PassengerId  Survived
0          892         0
1          893         0
2          894         1
3          895         1
4          896         0
