In [1]:
import pandas as pd
import numpy as np
# Label Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Handle Missing Values
from sklearn.impute import SimpleImputer
# Train Test Split
from sklearn.model_selection import train_test_split

In [2]:
data = {
    "gender" : ["m","m","f","m","f","f","f","m","m","m"],
    "country" : ['Ind', 'Aus', 'China', 'Aus', 'Aus', 'Ind','Ind','Ind','China','China'],
    "age" : [20,25,54,56,65,None,47,19,27,24],
    "salary" : [57000,45000,56400,25000,32000,35000,None,78000,28000,45000]
}

In [3]:
df = pd.DataFrame(data)

In [11]:
df

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,m,Aus,25.0,45000.0
2,f,China,54.0,56400.0
3,m,Aus,56.0,25000.0
4,f,Aus,65.0,32000.0
5,f,Ind,,35000.0
6,f,Ind,47.0,
7,m,Ind,19.0,78000.0
8,m,China,27.0,28000.0
9,m,China,24.0,45000.0


In [7]:
# na - not available
# nan - not a number
pd.isna(df).sum()

gender     0
country    0
age        1
salary     1
dtype: int64

In [9]:
df.fillna(0)

In [10]:
np.nan

nan

In [12]:
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

In [16]:
imputer_fit = imputer.fit(df.iloc[:,2:3])

In [17]:
imputer_fit.transform(df.iloc[:,2:3])

array([[20.        ],
       [25.        ],
       [54.        ],
       [56.        ],
       [65.        ],
       [37.44444444],
       [47.        ],
       [19.        ],
       [27.        ],
       [24.        ]])

In [18]:
df.iloc[:,2:3] = imputer.fit_transform(df.iloc[:,2:3])

In [20]:
df.iloc[:,3:] = imputer.fit_transform(df.iloc[:,3:])

In [21]:
df

Unnamed: 0,gender,country,age,salary
0,m,Ind,20.0,57000.0
1,m,Aus,25.0,45000.0
2,f,China,54.0,56400.0
3,m,Aus,56.0,25000.0
4,f,Aus,65.0,32000.0
5,f,Ind,37.444444,35000.0
6,f,Ind,47.0,44600.0
7,m,Ind,19.0,78000.0
8,m,China,27.0,28000.0
9,m,China,24.0,45000.0


In [26]:
# df['age'].values.reshape(-1,1)

In [27]:
# df.dropna()

In [28]:
label = LabelEncoder()
df['gender'] = label.fit_transform(df['gender'])

In [29]:
df.head()

Unnamed: 0,gender,country,age,salary
0,1,Ind,20.0,57000.0
1,1,Aus,25.0,45000.0
2,0,China,54.0,56400.0
3,1,Aus,56.0,25000.0
4,0,Aus,65.0,32000.0


In [30]:
country = label.fit_transform(df['country'])

In [31]:
country
# aus - 0
# china - 1
# ind - 2

array([2, 0, 1, 0, 0, 2, 2, 2, 1, 1])

In [34]:
onehot = OneHotEncoder()
country = onehot.fit_transform(country.reshape(-1,1))

In [35]:
country

<10x3 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [38]:
df.head()

Unnamed: 0,gender,country,age,salary
0,1,Ind,20.0,57000.0
1,1,Aus,25.0,45000.0
2,0,China,54.0,56400.0
3,1,Aus,56.0,25000.0
4,0,Aus,65.0,32000.0


In [36]:
country.toarray()

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [40]:
country.shape

(10, 3)

In [41]:
country.ndim

2

In [42]:
# MinMaxScaler - Normalization
# StandardScaler - Standardization

In [43]:
df['age'].std()

16.859623767458363

In [47]:
# for item in df['age']:
#     print(item)

for i in range(len(df)):
    df['age'].iloc[i] = df['age'].iloc[i] + 4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [49]:
# df['age'] = newAge

In [52]:
minmax = MinMaxScaler()
minmax.fit_transform(df.iloc[:,2:3])

array([[0.02173913],
       [0.13043478],
       [0.76086957],
       [0.80434783],
       [1.        ],
       [0.40096618],
       [0.60869565],
       [0.        ],
       [0.17391304],
       [0.10869565]])

In [53]:
sc = StandardScaler()
sc.fit_transform(df.iloc[:,2:3])

array([[-1.09065654],
       [-0.77804798],
       [ 1.03508169],
       [ 1.16012511],
       [ 1.72282053],
       [ 0.        ],
       [ 0.5974297 ],
       [-1.15317826],
       [-0.65300455],
       [-0.84056969]])

In [60]:
X = df[['gender','country','age']]
y = df['salary']
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

# x_train and y_train - apply ML
# test on x_test
# compare with y_test

In [61]:
x_train.shape

(7, 3)

In [62]:
x_test.shape

(3, 3)

In [63]:
y_train.shape

(7,)

In [64]:
y_test.shape

(3,)

In [65]:
x_train

Unnamed: 0,gender,country,age
6,0,Ind,51.0
0,1,Ind,24.0
2,0,China,58.0
8,1,China,31.0
4,0,Aus,69.0
5,0,Ind,41.444444
1,1,Aus,29.0
