In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("titanic_training_datas.csv")
df = data.copy()

In [3]:
df.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived
count,668.0,536.0,668.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,32.064552,0.402695
std,0.831638,14.240257,1.080327,0.854695,45.320835,0.490808
min,1.0,0.67,0.0,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.925,0.0
50%,3.0,29.0,0.0,0.0,14.75,0.0
75%,3.0,38.25,1.0,0.0,31.275,1.0
max,3.0,80.0,8.0,6.0,512.3292,1.0


In [4]:
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [5]:
df.shape

(668, 11)

In [6]:
df.isna()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,False,False,False,False,False,False,False,False,True,False,False
1,False,False,False,True,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
663,False,False,False,False,False,False,False,False,True,False,False
664,False,False,False,True,False,False,False,False,True,False,False
665,False,False,False,False,False,False,False,False,True,False,False
666,False,False,False,False,False,False,False,False,True,False,False


In [7]:
# finding total number of null values in each column

df.isna().sum()

Pclass        0
Name          0
Sex           0
Age         132
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [8]:
# finding % of null values in Age column

(df['Age'].isna().sum()/len(df))*100

19.760479041916167

In [9]:
# finding % of null values in Cabin column

(df["Cabin"].isna().sum()/len(df))*100

76.94610778443113

In [10]:
# finding % of null values in Embarked column

(df["Embarked"].isna().sum()/len(df))*100

0.14970059880239522

In [11]:
# filling the null Age column with mean of all the ages which are not null

mean_of_age = df["Age"].mean()
df["Age"].fillna(mean_of_age, inplace = True)

In [12]:
# filling the Embarked column with the value which is present most often

df["Embarked"].fillna("S", inplace = True)

In [13]:
# here a lot of null values are present in Cabin column so will better to drop it

df.drop("Cabin",axis=1, inplace = True)

In [14]:
# now rechecking the number of null values

df.isna().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
Survived    0
dtype: int64

In [15]:
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,29.70056,0,0,A/5 2466,8.05,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,S,0


In [16]:
# checking for non numeric column

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    668 non-null    int64  
 1   Name      668 non-null    object 
 2   Sex       668 non-null    object 
 3   Age       668 non-null    float64
 4   SibSp     668 non-null    int64  
 5   Parch     668 non-null    int64  
 6   Ticket    668 non-null    object 
 7   Fare      668 non-null    float64
 8   Embarked  668 non-null    object 
 9   Survived  668 non-null    int64  
dtypes: float64(2), int64(4), object(4)
memory usage: 52.3+ KB


In [17]:
# handling sex column as it is a string 

gender = pd.get_dummies(df["Sex"], drop_first = True)
df["Gender"] = gender

In [18]:
# verifying whether the column of gender is added or not

df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived,Gender
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,S,1,0
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,29.70056,0,0,A/5 2466,8.05,S,0,1
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,S,0,1
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,S,0,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,S,0,1


In [19]:
# dropping all the columns which are not neccessary 

df.drop(["Name", "Sex", "Ticket", "Embarked","Fare"], axis = 1, inplace = True)

In [20]:
# rechecking

df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Survived,Gender
0,2,29.0,1,0,1,0
1,3,29.70056,0,0,0,1
2,2,39.0,0,0,0,1
3,3,29.0,0,4,0,0
4,3,25.0,0,0,0,1


In [21]:
#seperating dependent and independent variables

x_train = df[["Pclass", "Age", "SibSp", "Parch", "Gender"]]
y_train = df["Survived"]

In [22]:
x_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Gender
0,2,29.00000,1,0,0
1,3,29.70056,0,0,1
2,2,39.00000,0,0,1
3,3,29.00000,0,4,0
4,3,25.00000,0,0,1
...,...,...,...,...,...
663,2,17.00000,0,0,0
664,3,29.70056,0,0,1
665,3,32.00000,0,0,1
666,3,22.00000,0,0,0


In [23]:
y_train

0      1
1      0
2      0
3      0
4      0
      ..
663    1
664    0
665    1
666    0
667    1
Name: Survived, Length: 668, dtype: int64

In [24]:
# fitting the data

clf = LogisticRegression()
clf.fit(x_train, y_train)

LogisticRegression()

In [25]:
# loading testing dataset

test_data = pd.read_csv("titanic_testing_datas.csv")

In [26]:
# making copy of testing dataset

df_test = test_data.copy()

In [27]:
df_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [28]:
# checking number of null values of testing data

df_test.isna().sum()

Pclass        0
Name          0
Sex           0
Age          45
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       173
Embarked      1
dtype: int64

In [29]:
# hadndling null values

mean_of_age_test = df_test["Age"].mean()
df_test["Age"].fillna(mean_of_age_test, inplace = True)

In [30]:
# dropping some columns

df_test.drop("Cabin",axis=1, inplace = True)

In [31]:
# handling sex column

gender_test = pd.get_dummies(df_test["Sex"], drop_first = True)
df_test["Gender"] = gender_test

In [32]:
# dropping some unneccessary columns

df_test.drop(["Name", "Sex", "Ticket", "Embarked","Fare"], axis = 1, inplace = True)

In [33]:
df_test

Unnamed: 0,Pclass,Age,SibSp,Parch,Gender
0,2,8.000000,1,1,1
1,1,49.000000,0,0,0
2,3,29.694775,0,0,1
3,2,24.000000,2,1,0
4,1,36.000000,0,0,1
...,...,...,...,...,...
218,3,20.000000,1,0,1
219,1,45.000000,0,0,1
220,1,17.000000,1,0,0
221,3,43.000000,0,0,1


In [36]:
prediction = clf.predict(df_test)

In [37]:
np.savetxt("prediction_of_titanic_data_second_time.csv", prediction, delimiter = ",")