In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:

df = pd.read_csv('titanic-Dataset.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()
df.describe(include='all')
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
num_cols = df.select_dtypes(include=['int64','float64']).columns
cat_cols = df.select_dtypes(include=['object','category']).columns

num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [5]:
df = pd.get_dummies(df, columns=['Sex','Embarked'], drop_first=True)


In [6]:
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1

df = df[~((df['Fare'] < (Q1 - 1.5*IQR)) | (df['Fare'] > (Q3 + 1.5*IQR)))]


In [7]:
scaler = StandardScaler()
scale_cols = ['Age', 'Fare']

df[scale_cols] = scaler.fit_transform(df[scale_cols])

In [8]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


In [9]:
X = df.drop(['Survived','PassengerId','Name','Ticket','Cabin'], axis=1)
y = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Test Accuracy:", clf.score(X_test, y_test))

Test Accuracy: 0.7548387096774194


In [10]:
df.to_csv("titanic_clean.csv", index=False)
print("Cleaned dataset saved!")

Cleaned dataset saved!


In [11]:
import os
print(os.listdir())   


['.ipynb_checkpoints', 'preprocessing.ipynb', 'requirements.txt', 'src', 'Titanic-Dataset.csv', 'titanic_clean.csv']


In [12]:
import pandas as pd
cleaned = pd.read_csv("titanic_clean.csv")
print("shape:", cleaned.shape)
print("missing per column:\n", cleaned.isnull().sum())
cleaned.head()

shape: (775, 14)
missing per column:
 PassengerId    0
Survived       0
Pclass         0
Name           0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Sex_male       0
Embarked_Q     0
Embarked_S     0
FamilySize     0
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Embarked_Q,Embarked_S,FamilySize
0,1.0,0.0,3.0,"Braund, Mr. Owen Harris",-0.528321,1.0,0.0,A/5 21171,-0.779117,B96 B98,True,False,True,2.0
1,3.0,1.0,3.0,"Heikkinen, Miss. Laina",-0.215182,0.0,0.0,STON/O2. 3101282,-0.729373,B96 B98,False,False,True,1.0
2,4.0,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.489381,1.0,0.0,113803,2.599828,C123,False,False,True,2.0
3,5.0,0.0,3.0,"Allen, Mr. William Henry",0.489381,0.0,0.0,373450,-0.720161,B96 B98,True,False,True,1.0
4,6.0,0.0,3.0,"Moran, Mr. James",-0.058613,0.0,0.0,330877,-0.690071,B96 B98,True,True,False,1.0


In [13]:
print(cleaned[['Age','Fare']].describe())


                Age          Fare
count  7.750000e+02  7.750000e+02
mean  -8.251464e-17  6.876220e-17
std    1.000646e+00  1.000646e+00
min   -2.217707e+00 -1.313411e+00
25%   -5.283213e-01 -7.315244e-01
50%   -5.861258e-02 -3.553671e-01
75%    4.110961e-01  6.026763e-01
max    4.012196e+00  3.476807e+00


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X = cleaned.drop(["Survived","PassengerId","Name","Ticket","Cabin"], axis=1, errors='ignore')
y = cleaned["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Final accuracy:", model.score(X_test, y_test))


Final accuracy: 0.7548387096774194


In [15]:
pip freeze | grep -E "pandas|numpy|scikit-learn|matplotlib|seaborn|jupyter" > requirements.txt


Note: you may need to restart the kernel to use updated packages.


'grep' is not recognized as an internal or external command,
operable program or batch file.


In [16]:
import pkg_resources

packages = [
    "pandas",
    "numpy",
    "matplotlib",
    "seaborn",
    "scikit-learn",
    "jupyter"
]

with open("requirements.txt", "w") as f:
    for pkg in packages:
        version = pkg_resources.get_distribution(pkg).version
        f.write(f"{pkg}=={version}\n")

print("requirements.txt created!")


requirements.txt created!


In [17]:
import os

if not os.path.exists("src"):
    os.makedirs("src")

print("src folder created!")


src folder created!


In [18]:
%%writefile src/preprocessing.py
import os
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Path to your data folder
data_path = os.path.join("..", "data", "titanic.csv")

def main():
    df = pd.read_csv(data_path)

    num_cols = df.select_dtypes(include=['int64','float64']).columns
    cat_cols = df.select_dtypes(include=['object','category']).columns

    df[num_cols] = SimpleImputer(strategy='median').fit_transform(df[num_cols])
    df[cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[cat_cols])

    for col in ['Sex','Embarked']:
        if col in df.columns:
            df = pd.get_dummies(df, columns=[col], drop_first=True)

    if 'SibSp' in df.columns and 'Parch' in df.columns:
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    if 'Fare' in df.columns:
        Q1 = df['Fare'].quantile(0.25)
        Q3 = df['Fare'].quantile(0.75)
        IQR = Q3 - Q1
        df = df[~((df['Fare'] < (Q1 - 1.5*IQR)) | (df['Fare'] > (Q3 + 1.5*IQR)))]

    scale_cols = [c for c in ['Age','Fare'] if c in df.columns]
    df[scale_cols] = StandardScaler().fit_transform(df[scale_cols])

    out_path = os.path.join("..", "data", "titanic_clean.csv")
    df.to_csv(out_path, index=False)

    print("Cleaned dataset saved at:", out_path)

if __name__ == "__main__":
    main()


Overwriting src/preprocessing.py


In [50]:

!python src/preprocessing.py


Cleaned dataset saved at: ../titanic_clean.csv


In [52]:
import os
print(os.listdir("../"))

['AIML INTERNSHIP', 'BDA REPORT.pdf', 'Canva.lnk', 'CodeBlocks.lnk', 'COMEDK.jpg', 'desktop.ini', 'Eclipse IDE for Java Developers - 2025-06.lnk', 'EMOTION BASED MUSIC RECOMMENDATION SYSTEM.pdf', 'Excel.lnk', 'filehandling', 'fp resume kavya.pdf', 'Hill Climb Racing - Shortcut.lnk', 'JAVA', 'kav resume word.docx', 'Kavya V (Person 1) - Chrome.lnk', 'kavya-resume.pdf', 'Kavya_Cover_Letter.pdf', 'kavya_JNNCE_ID.jpg', 'Kavya_Oracle_Cover_Letter.pdf', 'kavya_photo.jpg', 'LinkedIn.lnk', 'Microsoft Edge.lnk', 'ML', 'PowerPoint.lnk', 'python-3.11.4-amd64.lnk', 'Resume image.jpg', 'Spyder_64bit_full.lnk', 'task 1.pdf', 'titanic_clean.csv', 'Visual Studio Code.lnk', 'Word.lnk', 'Zoom Workplace.lnk', '~$dule-2-2022-scheme.pdf', '~$o internal 1.pdf', '~$OC407 - Mod 1.pdf']


In [54]:
cleaned = pd.read_csv("../titanic_clean.csv")

cleaned.head()
cleaned.isnull().sum()
cleaned.shape


(775, 14)

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X = cleaned.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin"], axis=1, errors='ignore')
y = cleaned["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Accuracy:", model.score(X_test, y_test))


Accuracy: 0.7548387096774194
