In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [2]:
# Read the training and test CSV files from the working directory into DataFrames.
train_d, test_d = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [3]:
# Display the column labels of the training frame to see which fields are available.
train_d.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Target: the 'Survived' column of the training frame.
y = train_d['Survived']

# Predictor columns fed to the models. The same subset is taken from the
# training frame (X) and from the test frame (X_valid).
features = [
    'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Fare',
    'Embarked',
]
X, X_valid = (frame[features] for frame in (train_d, test_d))

In [5]:
# Find the columns that contain at least one missing value, separately for
# the training features and the test features, and print both lists.
train_miss_cols = X.columns[X.isnull().any()].tolist()
test_miss_cols = X_valid.columns[X_valid.isnull().any()].tolist()
print(train_miss_cols, test_miss_cols, sep='\n')

['Age', 'Embarked']
['Age', 'Fare']


In [6]:
# Encode the two string-valued features as integer codes so the models can
# use them. LabelEncoder numbers its classes in sorted order, so Sex becomes
# female: 0, male: 1.
X_plus = X.copy()
X_valid_plus = X_valid.copy()

# Each column gets its own encoder, fitted on the training column only and
# reused unchanged on the test column, so both frames share one mapping.
# The encoded arrays are assigned directly: wrapping them in pd.DataFrame
# would make the assignment align on index labels, which silently yields NaN
# whenever the frame's index is not the default 0..n-1 range.
sex_encoder = LabelEncoder()
X_plus['Sex'] = sex_encoder.fit_transform(X['Sex'])
X_valid_plus['Sex'] = sex_encoder.transform(X_valid['Sex'])

# astype(str) turns a missing Embarked into the literal string 'nan', which
# is then encoded as a class of its own rather than left for imputation.
embarked_encoder = LabelEncoder()
X_plus['Embarked'] = embarked_encoder.fit_transform(X['Embarked'].astype(str))
X_valid_plus['Embarked'] = embarked_encoder.transform(X_valid['Embarked'].astype(str))

# Keep the original name bound to the last-fitted (Embarked) encoder, as before.
label_encoder = embarked_encoder

print(X_plus.describe())

           Pclass         Sex         Age       SibSp       Parch        Fare  \
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000   
mean     2.308642    0.647587   29.699118    0.523008    0.381594   32.204208   
std      0.836071    0.477990   14.526497    1.102743    0.806057   49.693429   
min      1.000000    0.000000    0.420000    0.000000    0.000000    0.000000   
25%      2.000000    0.000000   20.125000    0.000000    0.000000    7.910400   
50%      3.000000    1.000000   28.000000    0.000000    0.000000   14.454200   
75%      3.000000    1.000000   38.000000    1.000000    0.000000   31.000000   
max      3.000000    1.000000   80.000000    8.000000    6.000000  512.329200   

         Embarked  
count  891.000000  
mean     1.538721  
std      0.794231  
min      0.000000  
25%      1.000000  
50%      2.000000  
75%      2.000000  
max      3.000000  


In [7]:
# Round Age to the nearest whole number. The column stays float (missing
# ages are still NaN at this point); only the fractional part is removed.
X_plus['Age'] = X_plus['Age'].round()
# Apply the identical rounding to the test features so the model predicts on
# data that was preprocessed the same way as the data it was trained on.
X_valid_plus['Age'] = X_valid_plus['Age'].round()

print(X_plus['Age'].unique())

[22. 38. 26. 35. nan 54.  2. 27. 14.  4. 58. 20. 39. 55. 31. 34. 15. 28.
  8. 19. 40. 66. 42. 21. 18.  3.  7. 49. 29. 65.  5. 11. 45. 17. 32. 16.
 25.  1. 30. 33. 23. 24. 46. 59. 71. 37. 47. 70. 12.  9. 36. 51. 56. 44.
 61. 50. 62. 41. 52. 63. 43. 60. 10. 64. 13. 48. 53. 57. 80.  6.  0. 74.]


In [8]:
# Fill missing values with the per-column mean (SimpleImputer's default
# strategy). The imputer is fitted on the training features only and then
# applied to the test features, so test values never influence the fill values.
imputer = SimpleImputer()
# fit_transform/transform return plain arrays; rebuild the frames with the
# original column labels and index so later cells keep working with named
# features instead of positional 0..n-1 columns.
X_plus = pd.DataFrame(imputer.fit_transform(X_plus),
                      columns=X_plus.columns, index=X_plus.index)
X_valid_plus = pd.DataFrame(imputer.transform(X_valid_plus),
                            columns=X_valid_plus.columns, index=X_valid_plus.index)

In [9]:
# Fit an XGBoost regressor on the full set of training features.
xg_model = XGBRegressor()
xg_model.fit(X_plus, y)

# Round the continuous predictions to whole numbers and compare with the labels.
# NOTE: this MAE is measured on the same rows the model was fitted on, so it
# describes the fit to the training data, not performance on unseen data.
predicted_survive = xg_model.predict(X_plus).round()

# mean_absolute_error takes its arguments as (y_true, y_pred).
print(mean_absolute_error(y, predicted_survive))

0.019079685746352413


In [10]:
# Fit a random-forest regressor on the full set of training features;
# random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(random_state = 1)
model.fit(X_plus, y)

# Round the continuous predictions to whole numbers and compare with the labels.
# NOTE: this MAE is measured on the same rows the model was fitted on, so it
# describes the fit to the training data, not performance on unseen data.
predicted_survive = model.predict(X_plus).round()

# mean_absolute_error takes its arguments as (y_true, y_pred).
print(mean_absolute_error(y, predicted_survive))

0.017957351290684626


In [11]:
# Predict on the prepared test features, round every prediction to the
# nearest whole number and cast the result to integers.
predicted_survive = model.predict(X_valid_plus).round().astype(int)

print(predicted_survive.dtype)

int32


In [12]:
# Build the submission table: the PassengerId column of the test frame plus
# the predicted Survived values, written to submission.csv without the index.
output = test_d[['PassengerId']].assign(Survived=predicted_survive)
output.to_csv('submission.csv', index=False)