In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading the Data files

In [None]:
# Read both competition CSV files; the two reads are independent of each other.
data_for_test = pd.read_csv('/kaggle/input/titanic/test.csv')
data_raw = pd.read_csv('/kaggle/input/titanic/train.csv')

# Preview the first ten rows of the training frame.
data_raw.head(n=10)

In [None]:
# Display the 'Embarked' column of the test frame.
data_for_test.loc[:, 'Embarked']

# Defining a Function that will PREPARE THE DATA on demand

In [None]:
# Split the training frame's column names by dtype: object-typed vs. numeric.
object_cols = data_raw.select_dtypes(include=np.object_).columns.tolist()
num_cols = data_raw.select_dtypes(include=np.number).columns.tolist()

print("object_cols\n", object_cols)
print("\nnum_cols\n", num_cols)

# Count how often each 'Embarked' value occurs in the training frame.
data_raw['Embarked'].value_counts()

In [None]:
def prepare_data(df,y=True, get_df = True):
    object_cols = list(df.select_dtypes(include=np.object_))
    num_cols = list(df.select_dtypes(include=np.number))
    
    # copying the data to make changes only in copied data
    dfc = df.copy()
    
    # this column not necessary for training
    num_cols.remove('PassengerId')
    if 'Survived' in num_cols:
        num_cols.remove('Survived')
    
    object_cols.remove('Name')
    
    # filling the nan values
    dfc['Embarked'].fillna('S', inplace=True)
    dfc['Sex'].fillna('female', inplace=True)
    dfc['Name'].fillna('missing', inplace=True)
    
    for x in num_cols:
        mn = dfc[x].mean()
        dfc[x].fillna(mn , inplace=True)
        
    # making new columns
    dfc['family_mems'] = dfc['SibSp'] + dfc['Parch'] + 1 # 1 because counting self
    
    def decide_marital_status(val):
        if val==1:
            return 'Single'
        elif val==2:
            return 'Couple'
        elif val>2:
            return 'Family'
    
    def decide_age_group(a):
        if a<=5:
            return 'infant'
        elif 5<a<14:
            return 'child'
        elif 14<a<25:
            return 'youth'
        else:
            return 'adult'
    
    # new column -> marital status
    dfc['marital_status'] = dfc['family_mems'].apply(decide_marital_status)
    # new column -> age_group
    dfc['age_group'] = dfc['Age'].apply(decide_age_group)
    # new column -> Title
    titles = [some_name.split(',')[1].split('.')[0].strip() for some_name in df['Name']]
    if 'missing' in titles:
        titles.remove('missing')
    dfc['Title'] = titles
    dfc['Title'] = dfc['Title'].replace(
        ['Lady', 'the Countess', 'Countess', 'Capt', 
        'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 
        'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 
        'Rare')
    
    # Drop or not does not matter because we will be selecting columns
    # mentioned in *attr_for_onehot* and *attr_for_scaling*
    # Dropping columns assumed not necessary for training
    # dfc.drop(['PassengerId','Ticket','Cabin','SibSp','Parch','Name'], axis=1, inplace=True)
    
    attr_for_onehot = ['Sex','Embarked','age_group','marital_status', 'Title']
    attr_for_scaling = ['Fare','Age', 'family_mems']
    
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    one_hot = OneHotEncoder(sparse=False)
    scaler = StandardScaler()
    one_hot_tr = one_hot.fit_transform(dfc[attr_for_onehot])
    scale_tr = scaler.fit_transform(dfc[attr_for_scaling])
    
    data_prepared = np.concatenate((one_hot_tr, scale_tr), axis=1)
    
    if get_df == True:
        features = list(one_hot.get_feature_names_out()) + attr_for_scaling
        data_prepared_df = pd.DataFrame(data_prepared,
                                       columns=features,
                                       index = dfc.index)
        if y==False:
            return data_prepared_df
        return data_prepared_df, dfc['Survived']
    
    if y==False:
            return data_prepared
    return data_prepared, dfc['Survived']

In [None]:
# Build the training feature DataFrame together with the 'Survived' labels.
data_train_prepared, label = prepare_data(data_raw, y=True, get_df=True)

In [None]:
# Preview the first ten rows of the prepared training features.
data_train_prepared.head(n=10)

# Based on the previous version of this notebook, RandomForestClassifier appears to work well for predictions

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import shuffle

SEED = 42  # fixed seed so that Restart & Run All reproduces the same numbers

precisions = []
recalls = []
scores = []

# Estimate generalisation with 20 stratified hold-out splits. Scoring a forest
# on the very rows it was fitted on (as was done before) mostly measures
# memorisation and reports near-perfect metrics regardless of model quality.
splitter = StratifiedShuffleSplit(n_splits=20, test_size=0.2, random_state=SEED)
for train_idx, valid_idx in splitter.split(data_train_prepared, label):
    X_train, y_train = data_train_prepared.iloc[train_idx], label.iloc[train_idx]
    X_valid, y_valid = data_train_prepared.iloc[valid_idx], label.iloc[valid_idx]

    fold_clf = RandomForestClassifier(random_state=SEED)
    fold_clf.fit(X_train, y_train)
    y_pred = fold_clf.predict(X_valid)

    precisions.append(precision_score(y_valid, y_pred))
    recalls.append(recall_score(y_valid, y_pred))
    scores.append(fold_clf.score(X_valid, y_valid))

# Mean hold-out precision, recall and accuracy over all splits.
print(sum(precisions)/len(precisions))
print(sum(recalls)/len(recalls))
print(sum(scores)/len(scores))

# Final model used for the submission: fitted once on all labelled rows.
X, y = shuffle(data_train_prepared, label, random_state=SEED)
forest_clf = RandomForestClassifier(random_state=SEED)
forest_clf.fit(X, y)

# Making predictions for Test data

In [None]:
test_prepared = prepare_data(data_for_test, y=False, get_df=True)

# The model must see exactly the training columns, in the training order.
# Report any difference, then add missing columns as all-zero and drop
# columns the model never saw.
missing_in_test = [col for col in data_train_prepared.columns if col not in test_prepared.columns]
unseen_in_train = [col for col in test_prepared.columns if col not in data_train_prepared.columns]
if missing_in_test or unseen_in_train:
    print("Columns missing in test (filled with 0):", missing_in_test)
    print("Columns not seen in training (dropped):", unseen_in_train)
test_prepared = test_prepared.reindex(columns=data_train_prepared.columns, fill_value=0)

test_pred = forest_clf.predict(test_prepared)

# Kaggle expects one row per test passenger: its id and the predicted label.
submission = pd.DataFrame({
    'PassengerId': data_for_test['PassengerId'],
    'Survived': test_pred
})

submission.to_csv('submission.csv', index=False)

