In [None]:
import numpy as np
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [None]:
# Single random seed passed as `random_state` to the train/test split and to the
# LogisticRegression estimators further down, so repeated runs give the same results.
seed = 42

# Load data

In [None]:
# Fetch the credit-screening data from the UCI repository into a pandas DataFrame.
# Column names A1..A16 are supplied explicitly through `names`.
data_path = "http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
col_name = [f"A{i}" for i in range(1, 17)]

df = pd.read_csv(data_path, sep=',', names=col_name)

# Tip: the file path and the column names are assigned to variables first and only
# then passed to pd.read_csv(). Keeping long arguments out of the call itself is a
# good habit because the call stays short and readable.


In [None]:
# Display the dtype pandas inferred for every column, so it can be compared
# with the attribute information given in crx.names
df.dtypes

A2 and A14 should have a dtype of either `float` or `integer`, but they are `object` -> something is going on here! Maybe missing data - this is a good guess, since the data description in `crx.names` says A2 and A14 have missing values:

```
    A1:  12
    A2:  12
    A4:   6
    A5:   6
    A6:   9
    A7:   9
    A14: 13
```

Let's convert one of them to a numeric type and see which rows are causing the problem.
   


In [None]:
# Deliberately attempt a strict conversion to surface the value that prevents A2
# from being parsed as a number. The expected ValueError is caught and printed, so
# the message is still shown but "Restart Kernel -> Run All" is not interrupted.
try:
    pd.to_numeric(df['A2'], errors="raise")
except ValueError as err:
    print("ValueError:", err)
else:
    # Reached only when the column is already fully numeric (e.g. the cell is re-run
    # after the cleaning steps further down).
    print("A2 parsed as numeric without errors")

REMEMBER: the last line of an error log is usually the most important one, so look at it first!

From the error `ValueError: Unable to parse string "?" at position 83` I can see that row 83 with the value of `?` is causing issue.

Referring to `imputation` strategies, I have multiple options. Since the data dictionary (`crx.names`) tells me the number of missing values is small (at most ~2%, in column A14), simply dropping those rows would be a reasonable choice.

In [None]:
# Coerce the two columns that should be numeric. With errors='coerce', any value
# that cannot be parsed becomes NaN instead of raising.
numeric_like_cols = ['A2', 'A14']
df[numeric_like_cols] = df[numeric_like_cols].apply(pd.to_numeric, errors='coerce')

In [None]:
# Count the missing values in each column, to compare with the counts listed
# in the data dictionary
df.isna().sum()

I can see that A2 and A14 now have the same number of missing values as the data dictionary says. However, A1, A4, A5, A6 and A7 should also have missing values, but none show up here. My guess is that because these columns have the `object` (string) dtype, missing values are encoded as some placeholder string, and I need to find out which one. Let's check...

In [None]:
# Print the distinct values of each string column that the data dictionary lists as
# having missing entries, to spot the placeholder used for "missing".
columns_to_inspect = ('A1', 'A4', 'A5', 'A6', 'A7')
for name in columns_to_inspect:
    distinct_values = df[name].unique()
    print(name, ": ", distinct_values, '\n')

I see that all those columns have '?' in their values. so now I need to convert `?` to `NaN`

In [None]:
# Turn every "?" placeholder in the frame into a real missing value.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df = df.replace('?', np.nan)

Now let's count NaNs one more time...

In [None]:
# Count missing values per column again, after the '?' -> NaN replacement
df.isna().sum()

Perfect! now I'm getting the same numbers as the data dictionary suggests.  I can simply  drop those rows.
However, I want to show you how you can add imputation to your **pipeline**. 

### Check solution for assignment 1 for More EDAV!!!!!

# 2- Linear Learner


important: always use **pipelines**.

I am using logistic regression. you can use other linear classifiers like SVM as well

In [None]:
# Separate the label column (A16) from the feature columns (everything else)
y = df['A16']
X = df.drop(columns='A16')

# Hold out 20% of the rows as a test set. The split is stratified on y and uses the
# shared seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=seed
)

`ColumnTransformer` is a great method to apply different preprocessing and feature extraction pipelines to different subsets of features . 

This is particularly handy for the case of datasets that contain heterogeneous data types, since we may want to scale the numeric features and one-hot encode the categorical ones.

In this example, the numeric data is standard-scaled after median-imputation, while the categorical data is one-hot encoded after imputing missing values with a new category ('missing').

I'm using `Chelsea`'s solution with some modification:

In [None]:
# Numeric columns (everything that is not object dtype): fill missing values with
# the column median, then standardise.
numeric_features = X.select_dtypes(exclude='O').columns
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns (object dtype): replace NaN with an explicit 'missing'
# category *before* one-hot encoding, so the encoder never receives NaN and the
# missing values end up in their own indicator column. Categories not seen during
# fit are ignored at transform time (handle_unknown="ignore").
categorical_features = X.select_dtypes(include='O').columns
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# Full prediction pipeline for the linear learner: the shared preprocessing step
# followed by a logistic-regression classifier.
linear_steps = [
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(random_state=seed)),
]
clf_linear = Pipeline(steps=linear_steps)
clf_linear.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out test set
linear_test_score = clf_linear.score(X_test, y_test)
print("model score: %.3f" % linear_test_score)

### Evaluating the linear model on test set

In [None]:
# Predict the test set with the fitted linear pipeline ...
y_pred_linear = clf_linear.predict(X_test)

# ... and show the per-class metrics against the true test labels
linear_report = classification_report(y_test, y_pred_linear)
print(linear_report)

### Cross-validation (optional)
Below is an example of `5 fold cross-validation` for the linear learner, in case you want to generate cross-validated estimates. You can apply the same logic/code for other models.

In [None]:
# 5-fold cross-validation on the *training* split only: the held-out test set is
# left untouched so it can still serve as an independent final check.
# cross_val_score fits a fresh clone of the whole pipeline for every fold, so the
# imputation/scaling statistics are learned from each fold's training part only.
cv_score = np.mean(cross_val_score(clf_linear, X_train, y_train, cv=5))
print("model score: %.3f" % cv_score)

In [None]:
# Out-of-fold predictions for every row of the *training* split (each row is
# predicted by a model that was fitted without it); the test set is not used here.
y_pred_linear_cv = cross_val_predict(clf_linear, X_train, y_train, cv=5)

# The report is computed against the same labels that were cross-validated.
print(classification_report(y_train, y_pred_linear_cv))

# 3- Non-Linear Learner


important: always use **pipelines**.

I am using kNN. you can use other non-linear classifiers like decision trees as well

In [None]:
# Same pipeline layout as the linear learner; only the final step changes:
# a k-nearest-neighbours classifier with k = 5.
non_linear_steps = [
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier(n_neighbors=5)),
]
clf_non_linear = Pipeline(steps=non_linear_steps)
clf_non_linear.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out test set
non_linear_test_score = clf_non_linear.score(X_test, y_test)
print("model score: %.3f" % non_linear_test_score)

### Evaluating the non-linear model on the test set

In [None]:
# Predict the test set with the fitted kNN pipeline ...
y_pred_nonlinear = clf_non_linear.predict(X_test)

# ... and show the per-class metrics against the true test labels
non_linear_report = classification_report(y_test, y_pred_nonlinear)
print(non_linear_report)

You can see that I'm getting a better F1 score for both classes using a non-linear model

### look how clean and easy it is to change a step when you use pipelines

# 4- Ensemble model
In this section, we will use one of the discussed ensemble methods to create a new model based on step 2 and 3.

In [None]:
# Majority-vote ensemble built from fresh (unfitted) instances of the two learners
# used in the previous sections.
# NOTE(review): with only two voters and voting='hard', any disagreement is a tie;
# scikit-learn documents that ties are resolved by ascending class-label order.
# Soft voting or an odd number of estimators would avoid this - worth considering.
base_learners = [
    ('lr', LogisticRegression(random_state=seed)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
ensemble = VotingClassifier(estimators=base_learners, voting='hard')

# Same pipeline layout as before, with the voting ensemble as the final step
clf_ensemble = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", ensemble),
    ]
)
clf_ensemble.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out test set
ensemble_test_score = clf_ensemble.score(X_test, y_test)
print("model score: %.3f" % ensemble_test_score)

### Evaluating the ensemble model on the test set

In [None]:
# Predict the test set with the fitted ensemble pipeline ...
y_pred_ensemble = clf_ensemble.predict(X_test)

# ... and show the per-class metrics against the true test labels
ensemble_report = classification_report(y_test, y_pred_ensemble)
print(ensemble_report)

I can see improvement in precision for `-` class, while worse result for other metrics. This is ok! We shouldn't expect a more complex model to be naturally better! also I haven't run any hyperparameter optimization.

# 5- Prediction

In [None]:
# declaring the function
def credit_approval(row, model=None):
    """
    Take sample data and use a trained model to predict whether the sample's
    application will get approved or not.

    Parameters
    ----------
    row : sample data, passed unchanged to ``model.predict``
        (for example a one-row DataFrame such as ``X_test.iloc[[0]]``).
    model : trained model exposing ``predict``. When omitted (``None``), the
        notebook-level ``clf_ensemble`` is used.

    Returns
    -------
    Whatever ``model.predict(row)`` returns.
    """
    if model is None:
        # Look the default up at call time rather than in the signature: a default
        # written as `model=clf_ensemble` is bound once, when the `def` runs, so
        # re-creating/re-training `clf_ensemble` in another cell would silently keep
        # using the old object.
        model = clf_ensemble

    return model.predict(row)


# clf_ensemble is still the model used when no `model` argument is given; it is
# resolved when the function is called, so the most recently trained ensemble is used.

Now I can use my `test set` to get prediction. the following does it for the first row in the test set.

In [None]:
# Take the first test row as a one-row DataFrame (note the double brackets) and
# compare the model's prediction with the true label of that row.
first_test_row = X_test.iloc[[0]]
print("Predicted value:", credit_approval(first_test_row))
print("Real value:", y_test.iloc[0])