## Data preparation

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Handling Missing Values in DataFrames

In [None]:
# Toy frame with gaps in every column: row 0 is missing everywhere, rows 2 and
# 4 are only partially filled.
df = pd.DataFrame(
    {
        "id": [np.nan, 2, 3, 4, 5],
        "grade": [np.nan, "b", np.nan, "c", np.nan],
        "award": [np.nan, "gold", "silver", "bronze", np.nan],
    }
)
display(df)

In [None]:
# Dropping rows or columns with missing values

# how="any": discard every row that has at least one missing value.
# dropna returns a new frame; df itself is left untouched.
display(df.dropna(axis="index", how="any"))

In [None]:
# how="all" restricted to a subset: a row is dropped only when BOTH "grade"
# and "award" are missing; the "id" column is not considered.
display(df.dropna(subset=["grade", "award"], how="all"))

In [None]:
# Imputing missing values
# Per-column replacement constants; columns that are not listed keep their NaN.
values = dict(grade="e", award="iron")
display(df.fillna(values))

In [None]:
# Impute the numeric "id" column with its mean (mean() skips NaN by default).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df["id"]: the in-place call operates on a
# selected column (chained assignment), which pandas warns about and which
# does not update df when Copy-on-Write is active.
df["id"] = df["id"].fillna(df["id"].mean())
display(df)

In [None]:
# Impute the categorical "award" column with its most frequent value.
# mode() can return several values when there is a tie; [0] takes the first.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df["award"]: the in-place call operates on a
# selected column (chained assignment), which pandas warns about and which
# does not update df when Copy-on-Write is active.
df["award"] = df["award"].fillna(df["award"].mode()[0])
display(df)

In [None]:
# Binning in DataFrames

# Equal-width binning (using cut)

# Seed NumPy's global generator before the first random draw so that this and
# all later np.random calls give the same numbers on Restart Kernel -> Run All.
np.random.seed(0)

df = pd.DataFrame({"values": np.random.rand(100)})
# Split the observed value range into 10 bins of equal width.
# retbins=True additionally returns the bin edges so they can be reused later.
res, bins = pd.cut(df["values"], bins=10, retbins=True)
display(bins)

In [None]:
# Show the bin assigned to each row (res as returned by the preceding binning call).
display(res)

In [None]:
# Bin a fresh sample with the already computed edges in `bins`.
# Values that fall outside those edges get NaN instead of a bin.
df2 = pd.DataFrame(np.random.rand(100), columns=["values"])
new_res = pd.cut(df2["values"], bins=bins)
display(new_res)

In [None]:
# Equal-frequency binning (using qcut): ten quantile-based bins labelled a-j.
# Note that this rebinds res and bins from the equal-width example.
res, bins = pd.qcut(
    df["values"],
    q=10,
    labels=list("abcdefghij"),
    retbins=True,
)
display(bins)

In [None]:
# Show the bin label assigned to each row (res as returned by the preceding binning call).
display(res)

In [None]:
# min-max normalization
# Sample data: 100 draws from a standard normal distribution in one column.
df = pd.DataFrame(np.random.randn(100), columns=["values"])
df.head(5)

In [None]:
min = df["values"].min()
display(min)

In [None]:
max = df["values"].max()
display(max)

In [None]:
df["values"] = [(x-min)/(max-min) for x in df["values"]]
df.head()

In [None]:
# z-normalization
# Fresh sample data: 100 draws from a standard normal distribution in one column.
df = pd.DataFrame(np.random.randn(100), columns=["values"])
df.head(5)

In [None]:
# Mean of the "values" column.
mean = df["values"].mean()
display(mean)

In [None]:
# Sample standard deviation of the "values" column (ddof=1 is the pandas default,
# spelled out here for clarity).
std = df["values"].std(ddof=1)
display(std)

In [None]:
# z-normalization: subtract the mean, then divide by the standard deviation.
# Vectorised column arithmetic replaces the per-element apply/lambda.
df["values"] = (df["values"] - mean) / std
df.head()

In [None]:
# Selection of top-ranked categorical features

In [None]:
# Small labelled data set: three candidate features plus a binary "class" column.
df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5],
        "grade": ["b", "b", "a", "c", "a"],
        "award": ["gold", "gold", "silver", "bronze", "bronze"],
        "class": [1, 1, 1, 0, 0],
    }
)
display(df)

In [None]:
# Treat every column, including "class", as categorical.
df = df.astype("category")

# For each feature (every column except "class"), collect one array per feature
# value holding the number of rows in each class.
# observed=False is passed explicitly: the counts must keep a 0 entry for a
# class that never occurs with a given feature value, and relying on the
# implicit default for categorical groupers is deprecated.
res = [
    (
        col,
        [
            group.groupby("class", observed=False).size().values
            for _, group in df.groupby(col, observed=False)
        ],
    )
    for col in df.columns.drop("class")
]
display(res)

In [None]:
def score(values):
    """Simple scoring function: the largest spread found among the groups.

    ``values`` is an iterable of numeric sequences. For each sequence the
    spread (maximum minus minimum) is computed, and the largest spread over
    all sequences is returned.
    """
    spreads = [np.max(counts) - np.min(counts) for counts in values]
    return np.max(spreads)

In [None]:
# Pair every feature name with the score of its per-value class counts.
scores = [(feature, score(counts)) for feature, counts in res]
display(scores)

In [None]:
# Rank the (feature, score) pairs from highest to lowest score.
# sorted() is stable, so features with equal scores keep their original order.
sorted_scores = sorted(
    scores,
    key=lambda pair: pair[1],
    reverse=True,
)
display(sorted_scores)

In [None]:
# Keep only the names of the two top-ranked features.
filtered = [feature for feature, _ in sorted_scores[:2]]
display(filtered)

In [None]:
# Project the frame onto the selected feature columns (all rows are kept).
new_df = df[filtered]
display(new_df)