# Instructor Do: Dealing with Categorical Data in ML

## 17.6.1 Encode Labels With Pandas


In [None]:
# initial imports
import pandas as pd
from path import Path

# Dataset Information

The file `loans_data.csv` contains simulated data about loans; there are 500 records in total. Each row represents a loan application made during an arbitrary year, and each column holds the following information about that application.

    1. amount: The loan amount in USD.
    2. term: The loan term in months.
    3. month: The month of the year when the loan was requested.
    4. age: Age of the loan applicant.
    5. education: Educational level of the loan applicant.
    6. gender: Gender of the loan applicant.
    7. bad: Stands for a bad or good loan applicant (1 - bad, 0 - good).


In [None]:
# Read the loans CSV from the Resources folder into a DataFrame
file_path = Path("./Resources/loans_data.csv")
loans_df = pd.read_csv(filepath_or_buffer=file_path)
# Show the first five rows as the cell output
loans_df.head(n=5)

In [None]:
# Binary encoding with pandas: expand only the gender column into dummy columns
loans_binary_encoded = pd.get_dummies(data=loans_df, columns=["gender"])
loans_binary_encoded.head(n=5)

In [None]:
# Binary encoding with pandas: expand both education and gender into dummy columns
loans_binary_encoded = pd.get_dummies(
    data=loans_df,
    columns=["education", "gender"],
)
loans_binary_encoded.head(n=5)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode education as integers on a copy, so loans_df keeps its text labels
le = LabelEncoder()
df2 = loans_df.copy(deep=True)
df2["education"] = le.fit_transform(df2["education"])
df2.head(n=5)

# Integer Encoding


In [None]:
from sklearn.preprocessing import LabelEncoder

# Build df2 as a new DataFrame whose education column holds the encoder's
# integer codes; loans_df itself is not modified.
le = LabelEncoder()
df2 = loans_df.assign(education=le.fit_transform(loans_df["education"]))

In [None]:
# Show the first five rows of the integer-encoded copy
df2.head(n=5)

# Custom Encoding

In [None]:
# Fit a fresh label encoder on the month names and store its codes in a new
# month_le column of loans_df.
# NOTE(review): LabelEncoder is documented to number classes in sorted order,
# so these codes likely follow alphabetical rather than calendar order -
# confirm against the preview below.
label_encoder = LabelEncoder()
loans_df["month_le"] = label_encoder.fit_transform(y=loans_df["month"])
loans_df.head(n=5)

In [None]:
# Months dictionary: maps each month name to its calendar number (1-12)
months_num = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
}


In [None]:
# Encode each month name with its number from the months_num dictionary.
# The dictionary is indexed directly, so an unexpected month name raises
# KeyError.
loans_df["month_num"] = loans_df["month"].map(lambda name: months_num[name])
loans_df.head(n=5)


In [None]:
# Remove the text month column and its label-encoded version
loans_df = loans_df.drop(columns=["month", "month_le"])
loans_df.head(n=5)


# 17.6.1 Scripts

In [None]:
# 17.6.1 Scripts: expand only the gender column into dummy columns
loans_binary_encoded = pd.get_dummies(data=loans_df, columns=["gender"])
loans_binary_encoded.head(n=5)

In [None]:
# Expand both education and gender into dummy columns in a single call
loans_binary_encoded = pd.get_dummies(
    data=loans_df,
    columns=["education", "gender"],
)
loans_binary_encoded.head(n=5)

# 17.6.2 Encode Labels With Scikit-learn (shift tab pulls up definition)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Build df2 from loans_df with the education column replaced by the
# encoder's integer codes; loans_df itself is left untouched.
le = LabelEncoder()
df2 = loans_df.assign(education=le.fit_transform(loans_df["education"]))
df2.head(n=5)

# Skill Drill 17.6.2 - switched gender, but can't figure out how to do it with education

In [None]:
# Build df3 from df2 with the gender column integer-encoded as well.
# NOTE(review): the same `le` instance is fitted again here, so it presumably
# no longer holds the education mapping afterwards - confirm before relying on
# it to decode education.
df3 = df2.assign(gender=le.fit_transform(df2["gender"]))
df3.head(n=5)

In [None]:
# copy wasn't needed if you want to keep the same dataframe name (could append the transform as a different method)
# df2['gender'] = le.fit_transform(df2['gender'])
# df2.head()

In [None]:
# TODO: append the original columns, then take the distinct values of a given column from the pandas DataFrame to build a legend


In [None]:
# Reload the loans CSV so the cells below start from the raw columns again
file_path = Path("./Resources/loans_data.csv")
loans_df = pd.read_csv(filepath_or_buffer=file_path)
loans_df.head(n=5)

In [None]:
# Month name -> calendar number lookup, built by pairing the names in
# calendar order with 1..12.
months_num = dict(
    zip(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        range(1, 13),
    )
)


In [None]:
# Look up every month name in months_num and store the number in month_num.
# The dictionary is indexed directly, so an unexpected month name raises
# KeyError.
loans_df["month_num"] = loans_df["month"].map(lambda name: months_num[name])
loans_df.head(n=5)

In [None]:
# Fit a fresh label encoder on the month names and store its codes in a new
# month_le column of loans_df.
label_encoder = LabelEncoder()
loans_df["month_le"] = label_encoder.fit_transform(y=loans_df["month"])
loans_df.head(n=5)


In [None]:
# Remove the text month column and its label-encoded version
loans_df = loans_df.drop(columns=["month", "month_le"])
loans_df.head(n=5)

In [None]:
## Skipped Skill Drill 17.6.3 
# Create a new Jupyter Notebook and open loans_data.csv as a Pandas DataFrame. 
# Encode the following labels of the dataset: month, education, and gender. 
# Then save your DataFrame as loans_data_encoded.csv.
# Expect 11 columns in the result, e.g. one column for male (1 or 0) and one for female (1 or 0)
