## Creating And Cleaning Features: Cap And Floor Data To Remove Outliers

### Read In Data

In [1]:
# Read in data
import pandas as pd
import numpy as np

# Load the Titanic data whose missing values were already filled in,
# then preview the first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='../Data/titanic_no_missing.csv')
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_clean,Embarked_clean
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,S


### Remove Outliers

In [3]:
# Summary statistics, one row per numeric column: a large gap between the
# 75% and max values hints at where outliers might be an issue.
titanic_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Age_clean,891.0,29.699118,13.002015,0.42,22.0,29.699118,35.0,80.0


In [5]:
def detect_outlier(feature, df=None):
    """Print candidate upper caps for one numeric column.

    Three candidate caps are reported, each with the number of values
    that exceed it: the 95th percentile, mean + 3 standard deviations,
    and the 99th percentile.

    Parameters
    ----------
    feature : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``titanic_df`` is used, so existing ``detect_outlier(feature)``
        calls behave as before.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    data = titanic_df[feature] if df is None else df[feature]
    mean = np.mean(data)
    std = np.std(data)

    # A value is considered an outlier if its z-score is greater than 3 in
    # absolute value. Computed on the whole column at once instead of
    # looping over the rows in Python.
    # NOTE(review): this count is two-sided (values far *below* the mean are
    # counted too) while the printed cap is the upper bound mean + 3*std.
    z_scores = (data - mean) / std
    n_beyond_3std = int((np.abs(z_scores) > 3).sum())

    # Compute each percentile once; the previous version re-evaluated
    # data.quantile(...) for every element inside the comprehensions.
    p95 = data.quantile(.95)
    p99 = data.quantile(.99)

    print('\nOutlier caps for {}: '.format(feature))

    print('     --95p: {:.1f} / {} values exceed that'.format(p95, int((data > p95).sum())))

    print('     --3std: {:.1f} / {} values exceed that'.format(mean + 3 * std, n_beyond_3std))

    print('     --99p: {:.1f} / {} values exceed that'.format(p99, int((data > p99).sum())))

In [6]:
# Print the candidate upper bounds for every continuous feature
continuous_features = ['Age_clean', 'SibSp', 'Parch', 'Fare']
for feature_name in continuous_features:
    detect_outlier(feature_name)


Outlier caps for Age_clean: 
     --95p: 54.0 / 42 values exceed that
     --3std: 68.7 / 7 values exceed that
     --99p: 65.0 / 8 values exceed that

Outlier caps for SibSp: 
     --95p: 3.0 / 30 values exceed that
     --3std: 3.8 / 30 values exceed that
     --99p: 5.0 / 7 values exceed that

Outlier caps for Parch: 
     --95p: 2.0 / 15 values exceed that
     --3std: 2.8 / 15 values exceed that
     --99p: 4.0 / 6 values exceed that

Outlier caps for Fare: 
     --95p: 112.1 / 45 values exceed that
     --3std: 181.2 / 20 values exceed that
     --99p: 249.0 / 9 values exceed that


- From the output above, we can see the 95th percentile, the mean + 3 standard deviations, and the 99th percentile of each feature, along with how many values exceed each of them.
- There is no single right or wrong threshold for capping. In this case, we will use the 99th percentile.
- For `SibSp` and `Parch`, the max values are 8 and 6, which are considered normal, so we will leave those features uncapped.
- We will cap the `Age_clean` and `Fare` features at the 99th percentile as the max level (upper bound) in the next step.

In [7]:
# Cap features at their 99th percentile (upper bound only; no floor is applied).
# The clipped Series is assigned back to the frame rather than calling
# clip(inplace=True) on a column selection: that is a chained assignment, and
# under pandas Copy-on-Write it only changes a temporary object, leaving
# titanic_df uncapped.
titanic_df['Age_clean'] = titanic_df['Age_clean'].clip(upper=titanic_df['Age_clean'].quantile(.99))
titanic_df['Fare_clean'] = titanic_df['Fare'].clip(upper=titanic_df['Fare'].quantile(.99))

In [8]:
# Re-run the summary statistics to confirm the capping worked: the max of
# Age_clean and Fare_clean should now equal the respective 99th percentile.
titanic_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
Age_clean,891.0,29.640195,12.820616,0.42,22.0,29.699118,35.0,65.0
Fare_clean,891.0,31.224767,42.524125,0.0,7.9104,14.4542,31.0,249.00622


Now we can see that the max values of `Age_clean` and `Fare_clean` are capped as expected (65.0 and about 249.0). Compare them with the max values of the original `Age` and `Fare` columns (80.0 and 512.3292).

In [9]:
# Persist the capped frame as CSV; the row index is not written out
titanic_df.to_csv(path_or_buf='../Data/titanic_capped.csv', index=False)