# Data Cleaning & Model
Data Set: Tabular Playground April 2021

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # train set dividing
from sklearn.inspection import permutation_importance # to see the feature weights in the model
from sklearn.ensemble import RandomForestClassifier # Model
from sklearn.preprocessing import StandardScaler # transforming data
from sklearn.neural_network import MLPClassifier # Model


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-apr-2021/train.csv',
        '/kaggle/input/tabular-playground-series-apr-2021/test.csv',
    )
)

## Convert non-numerical data & null data

In [None]:
# Print a summary of the training frame: column names, non-null counts and dtypes
train.info() # To see nulls & data type

### Observations:
* Sex can be encoded as a boolean.
* Embarked can be encoded as an integer from 1 to 3.
* Age has almost 4000 null values.
* Ticket has almost 5000 null values.
* Cabin is the feature with the most null values.
* Cabin, Name, Ticket and Embarked are non-numerical values.

### Implications:
(Some of the following implications are based on this [EDA](https://www.kaggle.com/betancourt/tabularplayground-april-eda))
* Transform the Cabin, Name and Ticket features.
* Convert the Sex and Embarked features with `Series.map({})`.
* Since Pclass is correlated with Age, null values in Age will be filled with the mean Age of the passenger's Pclass.
* Since Pclass is correlated with Fare, null values in Fare will be filled with the mean Fare of the passenger's Pclass.
* Delete the Name and Ticket columns.

In [None]:
# Encode the Sex and Embarked text categories as integers.
# Values absent from a mapping (including nulls) become NaN after Series.map.
sex_codes = {'male':1, 'female':0}
port_codes = {'S':1, 'C':2, 'Q':3}

# Same encoding for the train set first, then the test set
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(sex_codes)
    frame['Embarked'] = frame['Embarked'].map(port_codes)

In [None]:
# Fill null values in the Age feature with the mean Age of the passenger's Pclass
# Train Set
Age_mean_vs_class = train.groupby(['Pclass'], as_index = False)['Age'].mean()

# Look each row's class mean up by its Pclass label rather than by row/column
# position, and assign straight back to the column so that no filtered copy of
# the frame is written to (the pattern that triggers SettingWithCopyWarning).
# Rows keep their original order; nothing is split and re-concatenated.
age_fill = train['Pclass'].map(Age_mean_vs_class.set_index('Pclass')['Age'])
train['Age'] = train['Age'].fillna(age_fill)

# Per-column flag: True where nulls remain
train.isnull().any()


In [None]:
# Fill null values in the Age feature with the mean Age of the passenger's Pclass
# Test Set (the means are computed from the test set itself)
Age_mean_vs_class = test.groupby(['Pclass'], as_index = False)['Age'].mean()

# Look each row's class mean up by its Pclass label rather than by row/column
# position, and assign straight back to the column so that no filtered copy of
# the frame is written to (the pattern that triggers SettingWithCopyWarning).
# Rows keep their original order; nothing is split and re-concatenated.
age_fill = test['Pclass'].map(Age_mean_vs_class.set_index('Pclass')['Age'])
test['Age'] = test['Age'].fillna(age_fill)

# Per-column flag: True where nulls remain
test.isnull().any()


In [None]:
# Fill null values in Cabin and keep only the first character of each value
# Train Set
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place call on a selected column
# does not update the parent frame.
# 'N' marks a missing cabin; .str[0] takes the first character of every value.
train['Cabin'] = train['Cabin'].fillna('N').str[0]
train['Cabin'].value_counts()

In [None]:
# Fill null values in Cabin and keep only the first character of each value
# Test Set
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place call on a selected column
# does not update the parent frame.
# 'N' marks a missing cabin; .str[0] takes the first character of every value.
test['Cabin'] = test['Cabin'].fillna('N').str[0]
test['Cabin'].value_counts()

In [None]:
# Integer code for each cabin letter: 'N' -> 0, then C, B, A, D, E, F, G -> 1..7
dict_replace = {letter: code for code, letter in enumerate('NCBADEFG')}
# 'T' is coded 9; code 8 is not used
dict_replace['T'] = 9

# Train Set
# Swap each cabin letter for its integer code; letters missing from
# dict_replace come out as NaN
cabin_codes = train['Cabin'].map(dict_replace)
train['Cabin'] = cabin_codes

# Count how many rows fall under each code
train['Cabin'].value_counts()

In [None]:
# Test Set
# Swap each cabin letter for its integer code; letters missing from
# dict_replace come out as NaN
cabin_codes = test['Cabin'].map(dict_replace)
test['Cabin'] = cabin_codes

# Count how many rows fall under each code
test['Cabin'].value_counts()

Now, Embarked feature

In [None]:
# Fill null values in the Embarked feature with 0
# Train Set
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place call on a selected column
# does not update the parent frame.
train['Embarked'] = train['Embarked'].fillna(0)
train['Embarked'].value_counts()

In [None]:
# Fill null values in the Embarked feature with 0
# Test Set
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place call on a selected column
# does not update the parent frame.
test['Embarked'] = test['Embarked'].fillna(0)
test['Embarked'].value_counts()

In [None]:
# Fill the NaN values in the Fare feature with the mean Fare of the passenger's Pclass
# Train Set
Fare_mean_vs_class = train.groupby(['Pclass'], as_index = False)['Fare'].mean()

# Look each row's class mean up by its Pclass label rather than by row/column
# position, and assign straight back to the column so that no filtered copy of
# the frame is written to (the pattern that triggers SettingWithCopyWarning).
# Rows keep their original order; nothing is split and re-concatenated.
fare_fill = train['Pclass'].map(Fare_mean_vs_class.set_index('Pclass')['Fare'])
train['Fare'] = train['Fare'].fillna(fare_fill)

# Per-column flag: True where nulls remain
train.isnull().any()

In [None]:
# Fill the NaN values in the Fare feature with the mean Fare of the passenger's Pclass
# Test Set (the means are computed from the test set itself)
Fare_mean_vs_class = test.groupby(['Pclass'], as_index = False)['Fare'].mean()

# Look each row's class mean up by its Pclass label rather than by row/column
# position, and assign straight back to the column so that no filtered copy of
# the frame is written to (the pattern that triggers SettingWithCopyWarning).
# Rows keep their original order; nothing is split and re-concatenated.
fare_fill = test['Pclass'].map(Fare_mean_vs_class.set_index('Pclass')['Fare'])
test['Fare'] = test['Fare'].fillna(fare_fill)

# Per-column flag: True where nulls remain
test.isnull().any()

In [None]:
# Remove the Ticket and Name columns, then preview the first ten rows
# Train Set
train = train.drop(columns=['Ticket', 'Name'])
train.head(10)

In [None]:
# Test Set
# Remove the Ticket and Name columns, then preview the first ten rows
test = test.drop(columns=['Ticket', 'Name'])
test.head(10)

## Adding new features

The following lines (as well as much of this notebook) are taken from or based on this [Notebook](https://www.kaggle.com/marcogdepinto/feature-engineering-eda-data-cleaning-tutorial).

In [None]:
# Train Set

def process_family_train(df=None):
    """Add family-size features to a passenger frame and return it.

    Args:
        df: DataFrame with 'Parch' and 'SibSp' columns. When omitted, the
            notebook-level ``train`` frame is used, which keeps the original
            zero-argument call working.

    Returns:
        The same frame, modified in place, with four new columns:
        'FamilySize' (Parch + SibSp + 1, i.e. including the passenger),
        'Singleton' (1 when FamilySize == 1),
        'SmallFamily' (1 when 2 <= FamilySize <= 4) and
        'LargeFamily' (1 when FamilySize >= 5); each flag is 0 otherwise.
    """
    if df is None:
        df = train

    # the size of the family, including the passenger
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    family_size = df['FamilySize']

    # 0/1 indicator columns derived from the family size (vectorized comparisons)
    df['Singleton'] = (family_size == 1).astype(int)
    df['SmallFamily'] = family_size.between(2, 4).astype(int)
    df['LargeFamily'] = (family_size >= 5).astype(int)

    return df
# Run the helper with no arguments and rebind the frame it returns to `train`
train = process_family_train()
train.head()  # preview the first rows with the new family columns

In [None]:
def process_family_train(df=None):
    """Add family-size features to a passenger frame and return it.

    NOTE: this redefinition reuses the name of the helper defined for the
    train set earlier in the notebook and replaces it; despite the name, its
    default target is the notebook-level ``test`` frame.

    Args:
        df: DataFrame with 'Parch' and 'SibSp' columns. When omitted, the
            notebook-level ``test`` frame is used, which keeps the original
            zero-argument call working.

    Returns:
        The same frame, modified in place, with four new columns:
        'FamilySize' (Parch + SibSp + 1, i.e. including the passenger),
        'Singleton' (1 when FamilySize == 1),
        'SmallFamily' (1 when 2 <= FamilySize <= 4) and
        'LargeFamily' (1 when FamilySize >= 5); each flag is 0 otherwise.
    """
    if df is None:
        df = test

    # the size of the family, including the passenger
    df['FamilySize'] = df['Parch'] + df['SibSp'] + 1
    family_size = df['FamilySize']

    # 0/1 indicator columns derived from the family size (vectorized comparisons)
    df['Singleton'] = (family_size == 1).astype(int)
    df['SmallFamily'] = family_size.between(2, 4).astype(int)
    df['LargeFamily'] = (family_size >= 5).astype(int)

    return df
# At this point process_family_train is the redefinition directly above, which
# works on the test frame; rebind the frame it returns to `test`
test = process_family_train()
test.head()  # preview the first rows with the new family columns

In [None]:
# Interaction feature: Age multiplied by Pclass, added to both frames
for frame in (train, test):
    frame['Age*Class'] = frame['Age'] * frame['Pclass']

In [None]:
# Interaction feature: Singleton flag multiplied by Pclass, added to both frames
for frame in (train, test):
    frame['Singleton*Pclass'] = frame['Singleton'] * frame['Pclass']

In [None]:
# Interaction feature: SmallFamily flag multiplied by Pclass, added to both frames
for frame in (train, test):
    frame['SmallFamily*Pclass'] = frame['SmallFamily'] * frame['Pclass']

In [None]:
# Boolean feature: True where Singleton is 1 and Sex is 1 (Sex was encoded male -> 1)
for frame in (train, test):
    is_single = frame['Singleton'] == 1
    is_male = frame['Sex'] == 1
    frame['MaleSingle'] = is_single & is_male

In [None]:
# Interaction feature: encoded Sex multiplied by Pclass, added to both frames
for frame in (train, test):
    frame['Sex*Pclass'] = frame['Sex'] * frame['Pclass']

In [None]:
# Interaction feature: encoded Sex multiplied by the Singleton flag, added to both frames
for frame in (train, test):
    frame['Sex*Singleton'] = frame['Sex'] * frame['Singleton']

## Model & Submission

In [None]:
# Separate the target from the predictors; PassengerId is left out of the predictors
Y = train['Survived']
X = train.drop(columns=['Survived', 'PassengerId'])

# Hold-out split with the default proportions and a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=100)

In [None]:
# Fit the scaler once, on the training split only, so it learns one mean and
# standard deviation per feature column.
# Flattening the data with reshape(-1, 1) pooled every feature into a single
# column, and each additional fit() call discarded the statistics learned by
# the previous one, so only the last fit (on test, PassengerId included) survived.
# NOTE: no later cell calls scaler.transform(), so the models below still
# consume the unscaled frames.
scaler = StandardScaler()
scaler.fit(X_train)

### Random Forest Classifier

In [None]:
# Random forest with 100 trees and a pinned seed, trained on the training split
rf_params = dict(n_estimators=100, random_state=5)
model = RandomForestClassifier(**rf_params)
model.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the random forest on the hold-out split
model.score(X=X_test, y=Y_test)

### MLP Classifier

In [None]:
# Two hidden layers (100 and 55 units), Adam solver, pinned seed.
# Train on the training split only: X_test/Y_test are rows taken from X/Y, so
# fitting on the full X would let the hold-out score be computed on rows the
# network has already seen during training.
model_b = MLPClassifier(max_iter=1000, hidden_layer_sizes=(100, 55), alpha=0.0001, solver='adam', random_state=1)
model_b.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the MLP on the X_test / Y_test split
model_b.score(X=X_test, y=Y_test)

In [None]:
# Make submission

# Predict with the MLP on every test column except PassengerId
test_features = test.drop(columns=['PassengerId'])
predicts = model_b.predict(test_features)

# Pair each PassengerId with its prediction and write the CSV without the index
my_submission = pd.DataFrame({'PassengerId': test['PassengerId'].values, 'Survived': predicts})
my_submission.to_csv('submissionv.csv', index=False)