In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Titanic predictions with Pipelines

The aim of this notebook is to develop a simple model for Titanic classification using Pipelines.

In [None]:
# Load the competition's training and test tables into DataFrames.
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2021/train.csv'
)
test = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2021/test.csv'
)
print("Data imported")

### Dealing with missing values

Cabin, in both the train and test datasets, has a lot of missing values. We could drop the variable, but the fact that a value is missing could itself be valuable information, so we create a new value for Cabin (W in this case) to indicate that the value is missing, and keep only the initial letter of the cabin.
Name, Ticket and PassengerId are variables that can be dropped.

In [None]:
# Both frames receive exactly the same treatment, so handle them together.
df = {'train': train, 'test': test}

for d in df.values():
    # Guarded so the cell can be re-run: once 'Cabin' has been dropped the
    # derived column already exists and there is nothing left to compute.
    if 'Cabin' in d.columns:
        # A missing cabin is encoded as 'W'; only the first letter of the
        # cabin code is kept as the feature.
        d['Cabin_initial'] = d['Cabin'].fillna('W').str[0]
    # errors='ignore' keeps a second execution of this cell from raising on
    # columns that were already removed.
    d.drop(columns=['Cabin', 'Name', 'Ticket', 'PassengerId'],
           errors='ignore', inplace=True)
    

All of the other variables can be imputed. We can use a column transformer to impute categorical and numerical features in different ways.

In [None]:
# Columns to impute, grouped by the imputation strategy they will receive.
numeric_columns = [
    'Age',
    'Fare',
]
categorical_colums = [
    'Embarked',
]

In [None]:
# Numeric gaps are filled with the column mean, categorical gaps with the
# most frequent value.
numeric_transformer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
categorical_transformer = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [None]:
# Route each column group to its imputer; every other column is forwarded
# unchanged and placed after the imputed columns in the output.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_colums),
    ],
)

In [None]:
# Fit the imputers on the training features only and reuse the fitted
# statistics for the test set, so both sets are imputed with the same values
# and the label never passes through the transformer.
train_features = train.drop(columns='Survived')

# ColumnTransformer emits the transformed columns first (in transformer
# order) followed by the passthrough columns in their original order.
imputed_columns = numeric_columns + categorical_colums
passthrough_columns = [col for col in train_features.columns
                       if col not in imputed_columns]
output_columns = imputed_columns + passthrough_columns

train_transformed = pd.DataFrame(
    data=preprocessor.fit_transform(train_features),
    columns=output_columns,
    index=train.index,
)
# Re-attach the label; the shared index keeps the rows aligned.
train_transformed['Survived'] = train['Survived']

test_transformed = pd.DataFrame(
    # transform (not fit_transform): apply the statistics learned on train.
    data=preprocessor.transform(test[train_features.columns]),
    columns=output_columns,
    index=test.index,
)

### Features engineering

A useful thing to do is to divide Fare, Age and SibSp+Parch (the total number of relatives on board) into bins.

In [None]:
# Collect both imputed frames so later cells can apply identical feature
# engineering to each of them.
df_transformed = [
    train_transformed,
    test_transformed,
]

In [None]:
# Combine siblings/spouses and parents/children into a single count column.
for frame in df_transformed:
    frame['SibSp+Parch'] = frame['SibSp'].add(frame['Parch'])

In [None]:
# Bin edges and the label given to each interval. Every label list must hold
# exactly len(edges) - 1 entries, one per interval between consecutive edges.

# Age edges follow the decades named in age_labels; the previous edges
# (..., 30, 50, 60, 70, 90) did not match the labels, so e.g. ages 41-50 were
# tagged '31 to 40'.
age_bins = [0, 10, 20, 30, 40, 50, 60, 90]
age_labels = ['0 to 10', '11 to 20', '21 to 30', '31 to 40', '41 to 50', '51 to 60', '61 to 90']

fare_bins = [0, 2, 4, 6, 10, 12, 15, 20, 26, 30, 38, 70, 100, 200, 300, 500, 800]
fare_labels = ['0 to 2', '3 to 4', '5 to 6', '7 to 10', '11 to 12', '13 to 15', '16 to 20', '21 to 26', '27 to 30', '31 to 38', '39 to 70', '71 to 100', '101 to 200', '201 to 300', '301 to 500', '501 to 800']

Sip_parch_bins = [0, 1, 3, 5, 10, 20]
Sip_parch_labels = ['0 to 1', '1 to 3', '3 to 5', '5 to 10', '10 to 20']

In [None]:
# Replace the continuous Age, Fare and SibSp+Parch columns with binned,
# labelled categories.
for d in df_transformed:
    # pd.to_numeric: the imputed frames are built from a mixed string/number
    # array, so these columns are presumably object-typed; cast before cutting.
    # include_lowest=True: with right-closed bins the lowest edge (0) would
    # otherwise be excluded, turning a value of exactly 0 into NaN even though
    # the first label starts at 0.
    d['Age_binned'] = pd.cut(pd.to_numeric(d['Age']), bins=age_bins,
                             labels=age_labels, right=True, include_lowest=True)
    d['Fare_binned'] = pd.cut(pd.to_numeric(d['Fare']), bins=fare_bins,
                              labels=fare_labels, right=True, include_lowest=True)
    # Left-closed bins here, so a count of 0 already falls in the first bin.
    d['SibSp+Parch_binned'] = pd.cut(pd.to_numeric(d['SibSp+Parch']), bins=Sip_parch_bins,
                                     labels=Sip_parch_labels, right=False)
    # The raw columns are no longer needed once their binned versions exist.
    d.drop(columns=['Age', 'Fare', 'SibSp+Parch'], inplace=True)

## RandomForestClassifier

In [None]:
# One-hot encode the categorical features. handle_unknown='ignore' encodes a
# category that was not seen during fitting (e.g. a rare cabin letter or bin
# that only occurs in the validation or test rows) as all zeros instead of
# raising an error at prediction time.
ohe = OneHotEncoder(handle_unknown='ignore')
cat_vars = ['Sex', 'Cabin_initial', 'Embarked', 'Age_binned', 'Fare_binned', 'SibSp+Parch_binned']
# Columns not listed in cat_vars are forwarded to the model unchanged.
col_transformer = ColumnTransformer(transformers=[('cat', ohe, cat_vars)], remainder='passthrough')
# Fixed seed so the fitted forest, its score and the submission are
# reproducible across runs.
rfc = RandomForestClassifier(n_estimators=150, random_state=42)

In [None]:
# Separate the label from the features.
y = train_transformed['Survived'].astype('int')
# Drop the label by name. The previous `col not in 'Survived'` was a substring
# test against the string 'Survived', which would also have discarded any
# column whose name happens to be a substring of it.
X = train_transformed.drop(columns='Survived')
# Fixed seed so the hold-out split (and therefore the reported score) is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Chain the column encoding and the classifier into a single estimator.
pipe = Pipeline(
    steps=[
        ('col_trasf', col_transformer),
        ('rfc', rfc),
    ]
)

In [None]:
# Fit the encoder and the classifier on the training part of the split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted pipeline on the held-out part of the split.
pipe.score(X=X_test, y=y_test)

#### Submitting predictions

In [None]:
# Predict labels for the preprocessed competition test set.
prediction = pipe.predict(X=test_transformed)

In [None]:
# Reload the raw test file: PassengerId was dropped from the in-memory frame
# during preprocessing, but the submission needs it.
test = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2021/test.csv'
)

In [None]:
# Pair each passenger id with its predicted label and write the result as a
# CSV without the index column.
submission = test[['PassengerId']].assign(Survived=prediction)
submission.to_csv('my_submissions.csv', index=False)