# Exercise

Develop notebook to apply decision tree classification algorithms to Australian credit risk dataset using code generated from chat GPT and measure its accuracy. Divide the notebook to the prompts/queries we give to the openai model. The model provides code snippets, so we may need to add some more code to make them work.
Study the ML with GPT notebook.

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the data set
# Load the dataset
columns = [
    "Customer ID",
    "Age",
    "Gender",
    "Income",
    "Employment Status",
    "Education Level",
    "Marital Status",
    "Dependents",
    "Credit Score",
    "Credit Limit",
    "Balance",
    "Payment History",
    "Credit Utilization",
    "Number of Credit Accounts",
    "Loan Purpose"
]
data = pd.read_csv('australian.csv', header=None)
data.columns = columns
data.head()

Unnamed: 0,Customer ID,Age,Gender,Income,Employment Status,Education Level,Marital Status,Dependents,Credit Score,Credit Limit,Balance,Payment History,Credit Utilization,Number of Credit Accounts,Loan Purpose
0,X1,X2,X3,X4,X5,X6,X7,X7,X9,X10,X11,X12,X13,X14,Y
1,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
2,0,22.67,7,2,8,4,0.165,0,0,0,0,2,160,1,0
3,0,29.58,1.75,1,4,4,1.25,0,0,0,1,2,280,1,0
4,0,21.67,11.5,1,5,3,0,1,1,11,1,2,0,1,1


# Prompt 1

Can you give me a snippet of python code that trains a classification model on this dataset https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat ?

In [38]:
# Split the data into features and target
X = data.drop("Loan Purpose", axis=1)
y = data["Loan Purpose"]

# Encode categorical variables
le = LabelEncoder()
X_encoded = X.copy()
for col in X.columns:
    if X[col].dtype == 'object':
        X_encoded[col] = le.fit_transform(X[col])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a decision tree classifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8273381294964028


# Prompt 2

Can you improve the performance of the model?

In [39]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Split the data into features and target
X = data.drop("Loan Purpose", axis=1)
y = data["Loan Purpose"]

# Encode categorical variables
le = LabelEncoder()
X_encoded = X.copy()
for col in X.columns:
    if X[col].dtype == 'object':
        X_encoded[col] = le.fit_transform(X[col])

# Scale numerical features
scaler = StandardScaler()
numerical_cols = ['Age', 'Income', 'Credit Score', 'Credit Limit', 'Balance', 'Credit Utilization', 'Number of Credit Accounts']
X_encoded[numerical_cols] = scaler.fit_transform(X_encoded[numerical_cols])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train a Random Forest classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8776978417266187


# Prompt 3

Can you give me a snippet of code that does all the necessary preprocessing steps for this specific dataset and trains a model for classification? Also, can you do the preprocessing and the inference inside of a pipeline?

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
columns = [
    "Customer ID",
    "Age",
    "Gender",
    "Income",
    "Employment Status",
    "Education Level",
    "Marital Status",
    "Dependents",
    "Credit Score",
    "Credit Limit",
    "Balance",
    "Payment History",
    "Credit Utilization",
    "Number of Credit Accounts",
    "Loan Purpose"
]

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat'
data = pd.read_csv(url, sep=' ', header=None, names=columns)
data.dropna(inplace=True)

# Split the data into features and target
X = data.drop("Loan Purpose", axis=1)
y = data["Loan Purpose"]

# Define the preprocessing steps
categorical_features = ['Gender', 'Employment Status', 'Education Level', 'Marital Status']
numerical_features = ['Age', 'Income', 'Credit Score', 'Credit Limit', 'Balance', 'Payment History',
                      'Credit Utilization', 'Number of Credit Accounts']

categorical_transformer = Pipeline(steps=[
    ('label_encoding', LabelEncoder())
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_features),
    ('numerical', numerical_transformer, numerical_features)
])

# Define the classification model
model = DecisionTreeClassifier()

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


TypeError: LabelEncoder.fit_transform() takes 2 positional arguments but 3 were given

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
columns = [
    "Customer ID",
    "Age",
    "Gender",
    "Income",
    "Employment Status",
    "Education Level",
    "Marital Status",
    "Dependents",
    "Credit Score",
    "Credit Limit",
    "Balance",
    "Payment History",
    "Credit Utilization",
    "Number of Credit Accounts",
    "Loan Purpose"
]

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/australian/australian.dat'
data = pd.read_csv(url, sep=' ', header=None, names=columns)
data.dropna(inplace=True)

# Split the data into features and target
X = data.drop("Loan Purpose", axis=1)
y = data["Loan Purpose"]

# Define a custom transformer for label encoding categorical features
class LabelEncoderTransformer:
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_encoded = self.label_encoder.transform(X)
        return X_encoded.reshape(-1, 1)

# Define the preprocessing steps
categorical_features = ['Gender', 'Employment Status', 'Education Level', 'Marital Status']
numerical_features = ['Age', 'Income', 'Credit Score', 'Credit Limit', 'Balance', 'Payment History',
                      'Credit Utilization', 'Number of Credit Accounts']

categorical_transformer = Pipeline(steps=[
    ('label_encoding', LabelEncoderTransformer()),
    ('one_hot_encoding', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('categorical', categorical_transformer, categorical_features),
    ('numerical', numerical_transformer, numerical_features)
])

# Define the classification model
model = DecisionTreeClassifier()

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.