In [1]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
titanic = sns.load_dataset('titanic')
X = titanic[['sex', 'age', 'fare', 'class', 'embark_town', 'alone']].copy()
X.shape

(891, 6)

## Handle Missing Values

In [3]:
print('Sex: ' + str(X['sex'].isnull().sum()))
print('Age: ' + str(X['age'].isnull().sum()))
print('Fare: ' + str(X['fare'].isnull().sum()))
print('Class: ' + str(X['class'].isnull().sum()))
print('Embark town: ' + str(X['embark_town'].isnull().sum()))
print('Alone: ' + str(X['alone'].isnull().sum()))

Sex: 0
Age: 177
Fare: 0
Class: 0
Embark town: 2
Alone: 0


In [4]:
mean = X['age'].mean()
mean = round(mean)
X['age'].fillna(mean, inplace=True)

## Outliers

In [5]:
features = ['age', 'fare']

for feature in features:
    min_ = X[feature].mean() - (3 * X[feature].std())
    max_ = X[feature].mean() + (3 * X[feature].std())
    X = X[X[feature] <= max_]
    X = X[X[feature] >= min_]
    print(feature, ':', X.shape)

age : (884, 6)
fare : (864, 6)


In [6]:
features = ['sex', 'class', 'embark_town', 'alone']

for feature in features:
    count_ = X[feature].value_counts()
    print(feature)
    print(count_, '\n')

sex
male      562
female    302
Name: sex, dtype: int64 

class
Third     489
First     192
Second    183
Name: class, dtype: int64 

embark_town
Southampton    632
Cherbourg      154
Queenstown      76
Name: embark_town, dtype: int64 

alone
True     524
False    340
Name: alone, dtype: int64 



## Categorical to Numeric

In [7]:
enc = LabelEncoder()

In [8]:
X['sex'] = enc.fit_transform(X['sex'].astype('str'))
X['class'] = enc.fit_transform(X['class'].astype('str'))
X['embark_town'] = enc.fit_transform(X['embark_town'].astype('str'))
X['alone'] = enc.fit_transform(X['alone'].astype('str'))

In [9]:
X.head()

Unnamed: 0,sex,age,fare,class,embark_town,alone
0,1,22.0,7.25,2,2,0
1,0,38.0,71.2833,0,0,0
2,0,26.0,7.925,2,2,1
3,0,35.0,53.1,0,2,0
4,1,35.0,8.05,2,2,1


## Rescaling

In [10]:
X = (X - X.min()) / (X.max() - X.min())
X.head(10)

Unnamed: 0,sex,age,fare,class,embark_town,alone
0,1.0,0.329064,0.043975,1.0,0.666667,0.0
1,0.0,0.573041,0.432369,0.0,0.0,0.0
2,0.0,0.390058,0.048069,1.0,0.666667,1.0
3,0.0,0.527295,0.322078,0.0,0.666667,0.0
4,1.0,0.527295,0.048827,1.0,0.666667,1.0
5,1.0,0.451052,0.051304,1.0,0.333333,1.0
6,1.0,0.817017,0.314572,0.0,0.666667,1.0
7,1.0,0.024093,0.127831,1.0,0.666667,0.0
8,0.0,0.405306,0.067529,1.0,0.666667,0.0
9,0.0,0.207075,0.182395,0.5,0.0,0.0
