In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression

In [3]:
train_df = pd.read_csv(r'training.csv',encoding = 'utf-8')
test_df = pd.read_csv(r'testing.csv',encoding = 'utf-8')

In [4]:
train_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0000,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.0500,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0000,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.0750,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,"Ilett, Miss. Bertha",female,17.0,0,0,SO/C 14885,10.5000,,S,1
664,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.7500,,Q,0
665,3,"Bing, Mr. Lee",male,32.0,0,0,1601,56.4958,,S,1
666,3,"Strandberg, Miss. Ida Sofia",female,22.0,0,0,7553,9.8375,,S,0


In [5]:
test_df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.7500,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0000,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S
...,...,...,...,...,...,...,...,...,...,...
218,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.9250,,S
219,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.5500,B38,S
220,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9000,C65,C
221,3,"Holm, Mr. John Fredrik Alexander",male,43.0,0,0,C 7075,6.4500,,S


In [6]:
# Logistic Regression only works on numerical data, so dropping columns with text fields
# Columns like "Name" , "Ticket", "Cabin" must not have any significant impact on data classification
df = pd.concat([train_df,test_df])

In [7]:
df

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.000,,S,1.0
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.050,,S,0.0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.000,,S,0.0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0.0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.050,,S,0.0
...,...,...,...,...,...,...,...,...,...,...,...
218,3,"Lindqvist, Mr. Eino William",male,20.0,1,0,STON/O 2. 3101285,7.925,,S,
219,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.550,B38,S,
220,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.900,C65,C,
221,3,"Holm, Mr. John Fredrik Alexander",male,43.0,0,0,C 7075,6.450,,S,


In [8]:
df.isnull().any()

Pclass      False
Name        False
Sex         False
Age          True
SibSp       False
Parch       False
Ticket      False
Fare        False
Cabin        True
Embarked     True
Survived     True
dtype: bool

In [9]:
# Age, Embarked, Cabin have missing values
# Assuming Cabin has no role in classification
df.drop(columns=['Ticket','Cabin','Name'],inplace=True)

In [10]:
df['Survived'].isnull().sum()

223

In [11]:
df['Embarked'].fillna('S',inplace=True)

In [12]:
df['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [13]:
# replace missing values in age with median ages of males and females respectively
male_age = df.groupby('Sex').median()['Age'].loc['male']
female_age = df.groupby('Sex').median()['Age'].loc['female']
print(male_age)
print(female_age)

29.0
27.0


In [14]:
filt_m = ((df['Age'].isna()) & (df['Sex'] == 'male'))
filt_f = ((df['Age'].isna()) & (df['Sex'] == 'female'))

In [15]:
df.loc[filt_m,'Age'] = male_age
df.loc[filt_f,'Age'] = female_age

In [16]:
# One hot encoding for Sex and Embarked Columns
embark = pd.get_dummies(df['Embarked'],prefix='Embarked')
sex = pd.get_dummies(df['Sex'],prefix='gender')

In [17]:
sex.isnull()

Unnamed: 0,gender_female,gender_male
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
218,False,False
219,False,False
220,False,False
221,False,False


In [18]:
df = pd.concat([df,sex],axis=1)

In [19]:
df = pd.concat([df,embark],axis=1)

In [20]:
df.drop(columns=['Embarked','Sex'],inplace=True)

In [21]:
cols = list(df.columns)

In [22]:
labels = cols[5]
cols = cols[0:5] + cols[6:]
cols.append(labels) 

In [23]:
df = df[cols]

In [24]:
df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender_female,gender_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,2,29.0,1,0,26.000,1,0,0,0,1,1.0
1,3,29.0,0,0,8.050,0,1,0,0,1,0.0
2,2,39.0,0,0,26.000,0,1,0,0,1,0.0
3,3,29.0,0,4,21.075,1,0,0,0,1,0.0
4,3,25.0,0,0,7.050,0,1,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
218,3,20.0,1,0,7.925,0,1,0,0,1,
219,1,45.0,0,0,26.550,0,1,0,0,1,
220,1,17.0,1,0,108.900,1,0,1,0,0,
221,3,43.0,0,0,6.450,0,1,0,0,1,


In [25]:
test_filter = df['Survived'].isnull()
test_df = df.loc[test_filter]
train_df = df.loc[~test_filter]

In [26]:
train_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender_female,gender_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,2,29.0,1,0,26.0000,1,0,0,0,1,1.0
1,3,29.0,0,0,8.0500,0,1,0,0,1,0.0
2,2,39.0,0,0,26.0000,0,1,0,0,1,0.0
3,3,29.0,0,4,21.0750,1,0,0,0,1,0.0
4,3,25.0,0,0,7.0500,0,1,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
663,2,17.0,0,0,10.5000,1,0,0,0,1,1.0
664,3,29.0,0,0,7.7500,0,1,0,1,0,0.0
665,3,32.0,0,0,56.4958,0,1,0,0,1,1.0
666,3,22.0,0,0,9.8375,1,0,0,0,1,0.0


In [27]:
test_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender_female,gender_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,2,8.0,1,1,36.7500,0,1,0,0,1,
1,1,49.0,0,0,25.9292,1,0,0,0,1,
2,3,29.0,0,0,7.7375,0,1,0,1,0,
3,2,24.0,2,1,27.0000,1,0,0,0,1,
4,1,36.0,0,0,26.2875,0,1,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...
218,3,20.0,1,0,7.9250,0,1,0,0,1,
219,1,45.0,0,0,26.5500,0,1,0,0,1,
220,1,17.0,1,0,108.9000,1,0,1,0,0,
221,3,43.0,0,0,6.4500,0,1,0,0,1,


In [28]:
classifier = LogisticRegression(solver='liblinear',max_iter=10000,C=0.3)

In [29]:
# get train data and labels in separate np arrays
X_train = train_df.iloc[:,:-1].to_numpy()

In [30]:
y_train = train_df.iloc[:,-1]

In [31]:
classifier.fit(X_train,y_train)

LogisticRegression(C=0.3, max_iter=10000, solver='liblinear')

In [32]:
test_df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,gender_female,gender_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,2,8.0,1,1,36.7500,0,1,0,0,1,
1,1,49.0,0,0,25.9292,1,0,0,0,1,
2,3,29.0,0,0,7.7375,0,1,0,1,0,
3,2,24.0,2,1,27.0000,1,0,0,0,1,
4,1,36.0,0,0,26.2875,0,1,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...
218,3,20.0,1,0,7.9250,0,1,0,0,1,
219,1,45.0,0,0,26.5500,0,1,0,0,1,
220,1,17.0,1,0,108.9000,1,0,1,0,0,
221,3,43.0,0,0,6.4500,0,1,0,0,1,


In [33]:
X_test = test_df.iloc[:,:-1].to_numpy()

In [34]:
X_test.shape

(223, 10)

In [35]:
classifier.score(X_train,y_train)

0.7949101796407185

In [36]:
classifier.predict_proba(X_test)

array([[0.70629586, 0.29370414],
       [0.17705968, 0.82294032],
       [0.85456266, 0.14543734],
       [0.31163489, 0.68836511],
       [0.62769146, 0.37230854],
       [0.7280039 , 0.2719961 ],
       [0.82007409, 0.17992591],
       [0.85457036, 0.14542964],
       [0.30802452, 0.69197548],
       [0.86355489, 0.13644511],
       [0.23547446, 0.76452554],
       [0.86400184, 0.13599816],
       [0.0607265 , 0.9392735 ],
       [0.33504679, 0.66495321],
       [0.7423255 , 0.2576745 ],
       [0.76746295, 0.23253705],
       [0.83845838, 0.16154162],
       [0.83336318, 0.16663682],
       [0.8424082 , 0.1575918 ],
       [0.29252208, 0.70747792],
       [0.38283653, 0.61716347],
       [0.86360769, 0.13639231],
       [0.21163961, 0.78836039],
       [0.24683913, 0.75316087],
       [0.86437736, 0.13562264],
       [0.57557305, 0.42442695],
       [0.7513987 , 0.2486013 ],
       [0.69253183, 0.30746817],
       [0.28408589, 0.71591411],
       [0.32519258, 0.67480742],
       [0.

In [37]:
classifier.predict(X_test)

array([0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0.])

In [38]:
predictions = classifier.predict(X_test)

In [39]:
predictions.shape

(223,)

In [348]:
np.savetxt('predictions_LReg.csv', predictions, delimiter = ',', fmt = '%.0f')