In [1]:
import numpy as np
import pandas as pd

In [2]:
dfcust = pd.DataFrame({'Customer Id':[1,2,3,4],
                     'Customer Name':['Shru','Stef','Elena','Ross']})
dfcust

Unnamed: 0,Customer Id,Customer Name
0,1,Shru
1,2,Stef
2,3,Elena
3,4,Ross


In [3]:
dfprod = pd.DataFrame({'Customer Id':[1,2,3,4],
                     'Product Id':['A11','A22','A33','A44'],
                      'Product Price':[3000,2000,1000,4000]})
dfprod

Unnamed: 0,Customer Id,Product Id,Product Price
0,1,A11,3000
1,2,A22,2000
2,3,A33,1000
3,4,A44,4000


In [4]:
dfbill=pd.merge(dfcust,dfprod,on='Customer Id')
dfbill

Unnamed: 0,Customer Id,Customer Name,Product Id,Product Price
0,1,Shru,A11,3000
1,2,Stef,A22,2000
2,3,Elena,A33,1000
3,4,Ross,A44,4000


In [5]:
from sklearn import preprocessing

In [6]:
#create feature
feature=np.array([[-500.5],
                 [-100.1],
                 [0],
                 [100.1],
                 [900.9]])

In [7]:
#create scalar
minmax_scale=preprocessing.MinMaxScaler(feature_range=(0,1))

In [8]:
#scale feature
scaled_feature=minmax_scale.fit_transform(feature)

In [9]:
#show feature
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

# Standardizing Feature

In [10]:
from sklearn import preprocessing

In [11]:
# create feaure
x=np.array([[-1000.1],
           [-200.2],
           [500.5],
           [600.6],
           [9000.9]])

In [12]:
#create scalar
scalar=preprocessing.StandardScaler()

In [13]:
# Transform the feature
standardized = scalar.fit_transform(x)

In [14]:
#Show feature
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [15]:
#print mean and standard deviation
print("Mean:",round(standardized.mean()))
print("Standard Deviation:",round(standardized.std()))

Mean: 0.0
Standard Deviation: 1.0


# Normalizing observation

In [16]:
from sklearn.preprocessing import Normalizer

In [17]:
#create feature matrix
features=np.array([[0.5,0.5],
                  [1.1,3.4],
                  [1.5,20.2],
                  [1.63,34.4],
                  [10.9,3.3]])

In [18]:
#create Normalizer
normalizer=Normalizer(norm="l2")

In [19]:
#Transform feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [20]:
features_l1_norm=Normalizer(norm="l1").transform(features)

In [21]:
#show feature matrix
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

# Transforming Features

In [22]:
from sklearn.preprocessing import FunctionTransformer

In [23]:
#create feature matrix
features=np.array([[2,3],
                  [2,3],
                  [2,3]])

In [24]:
#Define a simple function
def add_ten(x):
    return x/10

In [25]:
#create transformer
ten_transformer=FunctionTransformer(add_ten)

In [26]:
#transform featue matrix
ten_transformer.transform(features)

array([[0.2, 0.3],
       [0.2, 0.3],
       [0.2, 0.3]])

# Detecting Outliers

In [27]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [28]:
make_blobs

<function sklearn.datasets._samples_generator.make_blobs(n_samples=100, n_features=2, centers=None, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None)>

In [29]:
#create simulated data
features, _=make_blobs(n_samples=10,
                      n_features=2,
                      centers=1,
                      random_state=1)

In [30]:
features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [31]:
#replace the first observation's values with extreme values
features[0,0]=10000


In [32]:
outlier_detector=EllipticEnvelope(contamination=.1)

In [33]:
#fit detector
outlier_detector.fit(features)

EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
                 store_precision=True, support_fraction=None)

In [34]:
#predict outliers
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [35]:
#create a dataframe
houses=pd.DataFrame()
houses['Prices']=[534433,392333,293222,4322032]
houses['Bathrooms']=[2,3.5,2,116]
houses['Square_Feet']=[1500,2500,1500,48000]

In [36]:
#Filter obseervation
houses[houses['Bathrooms']<20]

Unnamed: 0,Prices,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [37]:
#create feature based on boolean condition
houses["Outlier"]=np.where(houses["Bathrooms"]<20,0,1)

In [38]:
#show data
houses

Unnamed: 0,Prices,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [39]:
#Log feature
houses["Log_of_Square_Feet"]=[np.log(x) for x in houses["Square_Feet"]]

In [40]:
houses

Unnamed: 0,Prices,Bathrooms,Square_Feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


# Discretizing Feature

In [41]:
from sklearn.preprocessing import Binarizer

In [42]:
#create feature
age=np.array([[6],
             [12],
             [20],
             [36],
             [65]])

In [43]:
#create binarizer
binarizer=Binarizer(18)

In [44]:
#transform feature
binarizer.fit_transform(age)
#same as
#if(age>18) binaryage=1 else binaryage=0

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [45]:
#Bin feature
np.digitize(age,bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [46]:
#Bin feature
np.digitize(age,bins=[20,30,64],right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

# Categorical data

In [47]:
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer

In [48]:
#create feature
feat=np.array([["Kochi"],
              ["Mumbai"],
              ["Kochi"],
              ["Indore"],
              ["Kochi"]])

In [49]:
#create one-hot encoder
one_hot=LabelBinarizer()

In [50]:
#One-hot encode feature
one_hot.fit_transform(feat)

array([[0, 1, 0],
       [0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 1, 0]])

In [51]:
one_hot.classes_

array(['Indore', 'Kochi', 'Mumbai'], dtype='<U6')

In [52]:
feat

array([['Kochi'],
       ['Mumbai'],
       ['Kochi'],
       ['Indore'],
       ['Kochi']], dtype='<U6')

In [53]:
one_hot.inverse_transform(one_hot.transform(feat))

array(['Kochi', 'Mumbai', 'Kochi', 'Indore', 'Kochi'], dtype='<U6')

In [54]:
pd.get_dummies(feat[:,0])

Unnamed: 0,Indore,Kochi,Mumbai
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0
4,0,1,0


In [55]:
#create multiclass feature
multiclass_feat=[("Kochi","Surat"),
                ("Mumbai","Pune"),
                ("Kochi","Bangalore"),
                ("Hyderabad","Pune"),
                ("Kochi","Bangalore")]

In [56]:
one_hot_multiclass=MultiLabelBinarizer()

In [57]:
one_hot_multiclass.fit_transform(multiclass_feat)

array([[0, 0, 1, 0, 0, 1],
       [0, 0, 0, 1, 1, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 0],
       [1, 0, 1, 0, 0, 0]])

In [58]:
one_hot_multiclass.classes_

array(['Bangalore', 'Hyderabad', 'Kochi', 'Mumbai', 'Pune', 'Surat'],
      dtype=object)

# Handling ordinal categorical feature

In [64]:
#create feature
dataframe=pd.DataFrame({"Score":["Low","Low","Medium","Medium","High"
                                #,"between medium and high"
                                ]})

In [65]:
#create mapper
scale_mapper={"Low":1,"Medium":2,
              #"between medium and high":2.5,
              "High":3}

In [66]:
#Replace feature values with scale
dataframe["Score"].replace(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [67]:
from sklearn.feature_extraction import DictVectorizer

In [68]:
#create word counts dictionaries for four documents
doc_1_word_count={"Highly":2,"Covid":4}
doc_2_word_count={"Highly":4,"Covid":3}
doc_3_word_count={"Highly":1,"Weather":2}
doc_4_word_count={"Highly":2,"Weather":2}

In [70]:
#create list
doc_word_counts=[doc_1_word_count,
                doc_2_word_count,
                doc_3_word_count,
                doc_4_word_count]

In [71]:
dictvectorizer=DictVectorizer(sparse=False)

In [72]:
#convert list of word count dictionaries into feature matrix
c=dictvectorizer.fit_transform(doc_word_counts)

In [73]:
c

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [74]:
#create feature matrix with categorical feature
X=np.array([[0,2.10,1.45],
           [1,1.18,1.33],
           [0,1.22,1.27],
           [1,-0.21,-1.19],
           [1,1.56,2.35]])

In [75]:
#create feature matrix with missing values in the categorical feature
X_with_nan=np.array([[np.nan,0.87,1.31],
                    [np.nan,-0.67,-0.22]])

In [76]:
#Join the teo feature matrices
X_complete=np.vstack((X_with_nan,X))

In [77]:
X_complete

array([[  nan,  0.87,  1.31],
       [  nan, -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [ 1.  ,  1.56,  2.35]])

In [78]:
from sklearn.impute import SimpleImputer

In [80]:
imputer=SimpleImputer(strategy='most_frequent')

In [81]:
imputer.fit_transform(X_complete)

array([[ 1.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [ 1.  ,  1.56,  2.35]])

# Handling Text

In [82]:
#create text
text_data=["Interrobang. by Aishwarya Henriette ",
          "Parking and Going. By Karl Gautier",
          " Today Is The night. By Jarek Prakash "]

In [85]:
#strip whitespaces
strip_whitespace=[string.strip() for string in text_data]

In [87]:
#show text
strip_whitespace

['Interrobang. by Aishwarya Henriette',
 'Parking and Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [90]:
#remove periods
remove_periods=[string.replace(".","") for string in strip_whitespace]

In [91]:
#show text
remove_periods

['Interrobang by Aishwarya Henriette',
 'Parking and Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [94]:
#create function
def capitalizer(string: str)->str:
    return string.upper()

In [95]:
#apply function
[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [96]:
#import library
import re

In [97]:
#create function
def replace_letters_with_X(string: str) -> str:
    return re.sub(r"[a-zA-Z]","X",string)

In [98]:
#apply function
[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

# Cleaning HTML