## Data Preprocessing

In [1]:
import numpy as np

In [2]:
from sklearn import preprocessing


In [3]:
data = np.array([[6, 2, 3], [4, 5, 8], [2, 3, 9]])


print(data)

[[6 2 3]
 [4 5 8]
 [2 3 9]]


### Preprocessing Techniques

# Mean removal   

It involves removing the mean from each feature so that it is centered on zero. Mean removal helps in removing any bias from the features.

In [4]:
data_standardized = preprocessing.scale(data)

In [5]:
data_standardized.mean(axis = 0)

array([ 0.00000000e+00, -1.48029737e-16, -7.40148683e-17])

In [6]:
data_standardized.std(axis = 0)

array([1., 1., 1.])

In [7]:
scaler = preprocessing.StandardScaler().fit(data)

In [8]:
scaler

StandardScaler()

In [9]:
scaler.mean_

array([4.        , 3.33333333, 6.66666667])

In [10]:
scaler.scale_

array([1.63299316, 1.24721913, 2.62466929])

# Scaling

In [11]:
data_scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))

In [12]:
data_scaled = data_scaler.fit_transform(data)

In [13]:
data_scaled

array([[1.        , 0.        , 0.        ],
       [0.5       , 1.        , 0.83333333],
       [0.        , 0.33333333, 1.        ]])

## Normalization¶

In [14]:
data_normalized = preprocessing.normalize(data, norm  = 'l1')


data_normalized

array([[0.54545455, 0.18181818, 0.27272727],
       [0.23529412, 0.29411765, 0.47058824],
       [0.14285714, 0.21428571, 0.64285714]])

In [15]:
data_normalized = preprocessing.normalize(data, norm  = 'max')


data_normalized

array([[1.        , 0.33333333, 0.5       ],
       [0.5       , 0.625     , 1.        ],
       [0.22222222, 0.33333333, 1.        ]])

## Binarization

In [16]:
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)

data_binarized

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

## One Hot Encoding

Convert categorical data into numerical data automatically

In [18]:
encoder = preprocessing.OneHotEncoder()

In [20]:
X = [['Male', 1], ['Female', 3], ['Female', 2]]
encoder.fit(X)

OneHotEncoder()

In [21]:
encoder.categories_

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

### Label Encoding

In [None]:
Label encoding refers to changing the word labels into numbers so that the algorithms can understand how to work on them.

In [None]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
input_classes = ['suzuki', 'ford', 'suzuki', 'toyota', 'ford', 'bmw']
label_encoder.fit(input_classes)
print "\nClass mapping:"
for i, item in enumerate(label_encoder.classes_):
print item, '-->', i

In [22]:
label_encoder = preprocessing.LabelEncoder()

In [23]:
input_classes = ['suzuki', 'ford', 'suzuki', 'toyota', 'ford', 'bmw']

In [24]:
label_encoder.fit(input_classes)

LabelEncoder()

In [27]:
for i, item in enumerate(label_encoder.classes_):
    print(i)

0
1
2
3


## Data Analysis

In [8]:
import pandas as pd

In [9]:
import numpy as np

In [10]:
pwd

'C:\\Users\\pc\\Data Science'

In [17]:
# load dataset

data = pd.read_csv(r"C:\Users\pc\Data Science\titanic_dataset.csv ")
data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [15]:
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## Summarizing the Dataset

Summarizing the data can be done in many ways as follows −

Check dimensions of the dataset
List the entire data
View the statistical summary of all attributes
Breakdown of the data by the class variable



In [16]:
## Check dimensions of the dataset

# Check original shape
data.shape

(891, 12)

##### View the Statistical Summary

In [20]:
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292
