In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:

# read the training and test sets from the current working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


In [3]:

# visualize missing values
# Each heatmap cell is the corresponding entry of train.isnull(); row labels
# are suppressed because there is one row per record.
fig, ax = plt.subplots()
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)
# a title lets the figure stand on its own when the notebook is skimmed
ax.set_title('Missing values in the training data')
plt.show()

In [4]:
# preview the first rows only instead of rendering the whole frame
train.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [5]:

# Preprocessing.
# Every transformer below is fitted on the training data only and then reused
# (via .transform) on the test data, so both frames are imputed, encoded and
# scaled with the same learned parameters. Re-fitting on the test set would
# give the model features on a different scale/mapping than it was trained on.
#
# NOTE: this cell mutates `train` and `test` in place (columns are overwritten
# and dropped), so reload the CSVs before running it a second time.

# fill missing values with the most frequent training value of each column;
# .ravel() flattens the (n, 1) imputer output before assigning it to a column
age_imputer = SimpleImputer(strategy='most_frequent')
train['Age'] = age_imputer.fit_transform(train[['Age']]).ravel()
test['Age'] = age_imputer.transform(test[['Age']]).ravel()

embarked_imputer = SimpleImputer(strategy='most_frequent')
train['Embarked'] = embarked_imputer.fit_transform(train[['Embarked']]).ravel()
test['Embarked'] = embarked_imputer.transform(test[['Embarked']]).ravel()

fare_imputer = SimpleImputer(strategy='most_frequent')
train['Fare'] = fare_imputer.fit_transform(train[['Fare']]).ravel()
test['Fare'] = fare_imputer.transform(test[['Fare']]).ravel()

# convert categorical variables to numerical
# One encoder per column, fitted on train; .transform raises on a label that
# was not seen during fitting instead of silently assigning a different code.
sex_encoder = LabelEncoder()
train['Sex'] = sex_encoder.fit_transform(train['Sex'])
test['Sex'] = sex_encoder.transform(test['Sex'])

embarked_encoder = LabelEncoder()
train['Embarked'] = embarked_encoder.fit_transform(train['Embarked'])
test['Embarked'] = embarked_encoder.transform(test['Embarked'])

# scale numerical variables using the training mean and standard deviation
scaler = StandardScaler()
train[['Age', 'Fare']] = scaler.fit_transform(train[['Age', 'Fare']])
test[['Age', 'Fare']] = scaler.transform(test[['Age', 'Fare']])

# drop the free-text / sparse columns that are not used as features
unused_columns = ['Name', 'Ticket', 'Cabin']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train.drop('Survived', axis=1),
    train['Survived'],
    test_size=0.2,
    random_state=42,
)


In [6]:

# train logistic regression model
# PassengerId is kept as a feature and is not standardised by the
# preprocessing cell, so allow more iterations than the default 100 to give
# the solver room to converge.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# predict on validation set
y_pred = log_reg.predict(X_val)


['Age', 'Cabin', 'Embarked']
['Age', 'Fare', 'Cabin']


In [7]:

# evaluate the model
print('Accuracy Score:', accuracy_score(y_val, y_pred))
# The matrix and the report are multi-line; printing them on their own lines
# keeps the rows and the report header aligned.
print('Confusion Matrix:', confusion_matrix(y_val, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')


       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  


In [8]:

# train decision tree model
# fixed seed (same value as the train/validation split) so reruns reproduce
# the same tree and the same metrics
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# predict on validation set
y_pred = dtree.predict(X_val)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_val, y_pred))
# multi-line results go on their own lines so rows/headers stay aligned
print('Confusion Matrix:', confusion_matrix(y_val, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:

# train random forest model
# fixed seed (same value as the train/validation split) so the forest, the
# validation metrics and any predictions made from it are reproducible
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# predict on validation set
y_pred = rfc.predict(X_val)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_val, y_pred))
# multi-line results go on their own lines so rows/headers stay aligned
print('Confusion Matrix:', confusion_matrix(y_val, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')



In [None]:
# Write the random forest's test-set predictions to submission.csv.
# The frame is built directly from the id column and the prediction array so
# rows are paired by position; pd.concat(axis=1) would align on index labels
# and misalign or pad with NaN if `test` did not have a default RangeIndex.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Survived': rfc.predict(test),
})
submission.to_csv('submission.csv', index=False)

In [None]:

# train gradient boosting model
# fixed seed (same value as the train/validation split) so reruns reproduce
# the same model and the same metrics
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train, y_train)

# predict on validation set
y_pred = gbc.predict(X_val)

# evaluate the model
print('Accuracy Score:', accuracy_score(y_val, y_pred))
# multi-line results go on their own lines so rows/headers stay aligned
print('Confusion Matrix:', confusion_matrix(y_val, y_pred), sep='\n')
print('Classification Report:', classification_report(y_val, y_pred), sep='\n')

In [None]:
# preview the first validation rows only instead of rendering the whole frame
X_val.head()