In [265]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
import pandas as pd
import warnings

In [266]:
warnings.filterwarnings('ignore')

In [267]:
# Load in data
test_df = pd.read_csv('data/test.csv')
train_df = pd.read_csv('data/train.csv')
outcome_df = pd.read_csv('data/gender_submission.csv')

In [268]:
# Merge test data with train data
test_df = test_df.merge(outcome_df, how='inner', on='PassengerId')
entire_df = train_df.append(test_df).reset_index(drop=True)
print(entire_df.head(5))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [269]:
# Print information relating to data
print(entire_df.isna().sum())

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [270]:
# Drop data 
entire_df.set_index('PassengerId', inplace=True)
entire_df.drop(['Name','Cabin', 'Embarked', 'Ticket'], inplace=True, axis=1)
print(entire_df.isna().sum())
print(entire_df.head(5))

Survived      0
Pclass        0
Sex           0
Age         263
SibSp         0
Parch         0
Fare          1
dtype: int64
             Survived  Pclass     Sex   Age  SibSp  Parch     Fare
PassengerId                                                       
1                   0       3    male  22.0      1      0   7.2500
2                   1       1  female  38.0      1      0  71.2833
3                   1       3  female  26.0      0      0   7.9250
4                   1       1  female  35.0      1      0  53.1000
5                   0       3    male  35.0      0      0   8.0500


In [271]:
# Encode Catagory data
entire_df['Sex'] = OrdinalEncoder().fit_transform(entire_df['Sex'].values.reshape(-1, 1))
print(entire_df.head(5))

             Survived  Pclass  Sex   Age  SibSp  Parch     Fare
PassengerId                                                    
1                   0       3  1.0  22.0      1      0   7.2500
2                   1       1  0.0  38.0      1      0  71.2833
3                   1       3  0.0  26.0      0      0   7.9250
4                   1       1  0.0  35.0      1      0  53.1000
5                   0       3  1.0  35.0      0      0   8.0500


In [272]:
# Normalise Data

for feat in entire_df.columns:
    entire_df[feat] = MinMaxScaler().fit_transform(entire_df[feat].values.reshape(-1, 1))

print(entire_df.head(5))

             Survived  Pclass  Sex       Age  SibSp  Parch      Fare
PassengerId                                                         
1                 0.0     1.0  1.0  0.273456  0.125    0.0  0.014151
2                 1.0     0.0  0.0  0.473882  0.125    0.0  0.139136
3                 1.0     1.0  0.0  0.323563  0.000    0.0  0.015469
4                 1.0     0.0  0.0  0.436302  0.125    0.0  0.103644
5                 0.0     1.0  1.0  0.436302  0.000    0.0  0.015713


In [273]:

# Re-split the data
train_df = entire_df.iloc[:891]
train_df.dropna(inplace=True)

test_df = entire_df.iloc[891:]
test_df.dropna(inplace=True)

# Split features and target
x_train = train_df.loc[:, train_df.columns != 'Survived']
y_train = train_df['Survived']

x_test = test_df.loc[:, test_df.columns != 'Survived']
y_test = test_df['Survived']

In [274]:
# Train Model
model = linear_model.LogisticRegression()
model.fit(x_train, y_train)

In [275]:
# Prediction
prediction = model.predict(x_test)

In [276]:
# Evaluation 
print(classification_report(y_test, prediction))


              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95       204
         1.0       0.90      0.95      0.92       127

    accuracy                           0.94       331
   macro avg       0.93      0.94      0.94       331
weighted avg       0.94      0.94      0.94       331



In [277]:
# Export Results
results = test_df[['Survived']]
results['Predicted'] = prediction
results.reset_index(inplace=True)
print(results.head())

results.to_csv('submission.csv',index=False)

     PassengerId  Survived  Predicted
0            892       0.0        0.0
1            893       1.0        0.0
2            894       0.0        0.0
3            895       0.0        0.0
4            896       1.0        1.0
..           ...       ...        ...
326         1301       1.0        1.0
327         1303       1.0        1.0
328         1304       1.0        1.0
329         1306       1.0        1.0
330         1307       0.0        0.0

[331 rows x 3 columns]
