In [1]:
import numpy as np
import copy
import pandas as pd
from typing import Tuple, Union
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import metrics
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

# Part 1

## Classification on Wine Dataset

In [2]:
# Both wine-quality files are semicolon-separated CSVs.
red_data = pd.read_csv("winequality-red.csv", sep=";")
white_data = pd.read_csv("winequality-white.csv", sep=";")

# Store the two sulfur-dioxide columns as integers.
# NOTE(review): astype(int) truncates any fractional values toward zero —
# confirm the source files only contain whole numbers in these columns.
columns_dtype_change = ["free sulfur dioxide", "total sulfur dioxide"]
red_data = red_data.astype({name: int for name in columns_dtype_change})
white_data = white_data.astype({name: int for name in columns_dtype_change})

In [3]:
def get_all_scores(pred: np.array, y_true: np.array, model_name: str) -> None:
    """Print a fixed-width table of scores comparing predictions to the truth.

    Args:
        pred: Values predicted by the model.
        y_true: Ground-truth values, aligned with ``pred``.
        model_name: Title centred above the table.

    The table reports R^2, mean absolute error and mean squared error, each
    formatted to four decimal places.
    """
    mae = mean_absolute_error(y_true, pred)
    msq = mean_squared_error(y_true, pred)
    r2 = r2_score(y_true, pred)

    # Rows are printed in this order; the label column is 19 characters wide.
    rows = (
        ("R^2", r2),
        ("Mean Absolute Error", mae),
        ("Mean Squared Error", msq),
    )

    print(f"{model_name:^27}")
    print("*" * 27)
    print(f"{'Method':<19}| Result")
    print("—" * 27)
    for label, value in rows:
        print(f"{label:<19}| {value:.4f}")
    
    
def fit_model(model :Pipeline,
              X_train_data: np.array,
              y_train_data: np.array,
              X_test_data : np.array,
              y_test_data : np.array,
              print_results : bool = False,
              name_reg : str = "",
)->None:
    """Fit ``model`` on the training data and optionally report test scores.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train_data: Training features.
        y_train_data: Training targets.
        X_test_data: Features to predict on after fitting.
        y_test_data: Targets the predictions are scored against.
        print_results: When true, print the score table via ``get_all_scores``.
        name_reg: Title passed to ``get_all_scores`` for the table.

    NOTE: a later cell in this notebook defines another ``fit_model`` with a
    different signature; once that cell has run, this definition is shadowed.
    """
    model.fit(X_train_data, y_train_data)
    predictions = model.predict(X_test_data)

    if not print_results:
        return
    get_all_scores(pred=predictions, y_true=y_test_data, model_name=name_reg)

In [4]:
# White wine: every column but the last is a feature, the last is the target.
X_white, y_white = white_data.iloc[:, :-1].values, white_data.iloc[:, -1].values
X_w_train, X_w_test, y_w_train, y_w_test = train_test_split(
    X_white, y_white, random_state=100
)
# Keys match the keyword arguments of the Part 1 fit_model, so the dictionary
# can be unpacked straight into the call.
white_data_dictionary = dict(
    X_train_data=X_w_train,
    X_test_data=X_w_test,
    y_train_data=y_w_train,
    y_test_data=y_w_test,
)


# Red wine: same layout and same split seed as above.
X_red, y_red = red_data.iloc[:, :-1].values, red_data.iloc[:, -1].values
X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
    X_red, y_red, random_state=100
)
red_data_dictionary = dict(
    X_train_data=X_r_train,
    X_test_data=X_r_test,
    y_train_data=y_r_train,
    y_test_data=y_r_test,
)

In [5]:
def do_exercise(type_of_wine: str = "white") -> None:
    """Fit a scaled multinomial logistic regression on one wine dataset.

    Args:
        type_of_wine: Selects the dataset by its first letter: anything
            starting with ``w``/``W`` uses the white-wine split, anything
            starting with ``r``/``R`` uses the red-wine split.

    Raises:
        ValueError: If ``type_of_wine`` starts with neither letter.

    NOTE: this calls ``fit_model`` with the Part 1 keyword arguments
    (``model``, ``print_results``, ``name_reg``). A later cell redefines
    ``fit_model`` with a different signature, so run this before that cell.
    """
    if type_of_wine.startswith(("w", "W")):
        data_to_use = white_data_dictionary
    elif type_of_wine.startswith(("r", "R")):
        data_to_use = red_data_dictionary
    else:
        raise ValueError(
            "type_of_wine must start with 'w' (white) or 'r' (red), "
            f"got {type_of_wine!r}"
        )

    # Standardise the features, then fit the classifier; max_iter is raised
    # to 500 for the lbfgs solver.
    pipe = make_pipeline(
        StandardScaler(),
        LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=500),
    )

    fit_model(model=pipe, print_results=True, name_reg="Logistic Regression", **data_to_use)

In [6]:
# Fit and score the logistic-regression pipeline on the white-wine split.
do_exercise("white")

    Logistic Regression    
***************************
Method             | Result
———————————————————————————
R^2                | 0.0931
Mean Absolute Error| 0.5331
Mean Squared Error | 0.6996


In [7]:
# Fit and score the logistic-regression pipeline on the red-wine split.
do_exercise("red")

    Logistic Regression    
***************************
Method             | Result
———————————————————————————
R^2                | 0.2125
Mean Absolute Error| 0.4225
Mean Squared Error | 0.4975


# Part 2

In [8]:
# The first CSV column is used as the row index in all three files.
train_set = pd.read_csv("train.csv", index_col=0)
test_features = pd.read_csv("test.csv", index_col=0)
test_y = pd.read_csv("test_label.csv", index_col=0)

# The test labels live in a separate file; attach them as extra column(s) so
# the test frame has the same layout as the training frame.
test_set = pd.concat([test_features, test_y], axis=1)

# Show ten random rows of each frame (unseeded, so rows differ between runs).
display(train_set.sample(10), test_set.sample(10))

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
22042,70,retired,married,secondary,no,6422,no,no,cellular,10,mar,1,-1,0,unknown,yes
20936,43,blue-collar,married,secondary,no,1495,yes,no,cellular,12,may,1,-1,0,unknown,no
14887,38,management,married,tertiary,no,2548,yes,no,unknown,30,may,1,-1,0,unknown,no
13920,42,technician,married,secondary,no,9324,no,no,cellular,27,aug,4,-1,0,unknown,no
10211,60,blue-collar,married,primary,no,6271,yes,no,cellular,6,may,2,254,5,failure,no
16867,39,technician,single,tertiary,no,393,no,no,cellular,26,mar,1,-1,0,unknown,yes
20090,26,technician,single,secondary,no,626,yes,no,cellular,15,may,2,-1,0,unknown,no
20315,36,technician,single,secondary,no,27,yes,no,cellular,9,feb,1,260,2,success,yes
18650,49,unknown,married,primary,no,341,yes,yes,unknown,15,may,2,-1,0,unknown,yes
13986,63,retired,married,secondary,no,415,yes,no,cellular,7,oct,1,-1,0,unknown,no


Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
54198,60,retired,divorced,secondary,no,1,no,no,cellular,19,feb,1,184,1,success,yes
51382,34,admin.,married,tertiary,no,899,yes,no,unknown,12,nov,1,170,3,failure,yes
53636,29,blue-collar,married,secondary,no,2378,yes,yes,cellular,20,apr,1,276,6,failure,no
52753,45,management,divorced,secondary,no,1144,yes,no,cellular,16,apr,1,-1,0,unknown,no
53180,24,student,single,secondary,no,689,no,no,cellular,8,sep,1,-1,0,unknown,yes
50003,49,blue-collar,married,primary,no,216,yes,no,unknown,9,may,2,-1,0,unknown,no
50359,36,management,single,tertiary,no,546,yes,no,cellular,6,aug,3,-1,0,unknown,no
52668,34,services,divorced,secondary,no,89,yes,no,cellular,15,may,4,-1,0,unknown,yes
50571,35,services,single,primary,no,167,no,yes,cellular,11,jul,2,-1,0,unknown,yes
50505,42,housemaid,married,primary,no,-209,no,yes,cellular,11,aug,2,-1,0,unknown,no


In [9]:
# Column dtypes and non-null counts, then summary statistics of the numeric
# columns rounded to two decimals.
train_set.info()
train_set.describe().round(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12870 entries, 13829 to 20137
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        12870 non-null  int64 
 1   job        12870 non-null  object
 2   marital    12870 non-null  object
 3   education  12870 non-null  object
 4   default    12870 non-null  object
 5   balance    12870 non-null  int64 
 6   housing    12870 non-null  object
 7   loan       12870 non-null  object
 8   contact    12870 non-null  object
 9   day        12870 non-null  int64 
 10  month      12870 non-null  object
 11  campaign   12870 non-null  int64 
 12  pdays      12870 non-null  int64 
 13  previous   12870 non-null  int64 
 14  poutcome   12870 non-null  object
 15  y          12870 non-null  object
dtypes: int64(6), object(10)
memory usage: 1.7+ MB


Unnamed: 0,age,balance,day,campaign,pdays,previous
count,12870.0,12870.0,12870.0,12870.0,12870.0,12870.0
mean,41.09,1483.77,15.64,2.66,45.56,0.69
std,11.31,3311.06,8.37,2.86,104.45,2.05
min,18.0,-6847.0,1.0,1.0,-1.0,0.0
25%,32.0,102.0,8.0,1.0,-1.0,0.0
50%,39.0,515.0,16.0,2.0,-1.0,0.0
75%,49.0,1591.75,21.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,43.0,854.0,58.0


Process the `month` column: map the month abbreviations to the integers 1–12.

In [10]:
# Count how often each month value occurs in the training set.
train_set["month"].value_counts()

may    3594
jul    1823
aug    1811
jun    1484
nov    1094
apr     908
feb     831
jan     375
oct     338
sep     274
mar     237
dec     101
Name: month, dtype: int64

In [11]:
# Nested mapping in the {column: {old_value: new_value}} form: the month
# abbreviations in the "month" column map to their calendar numbers 1..12.
mon = {
    "month": {
        abbreviation: number
        for number, abbreviation in enumerate(
            ["jan", "feb", "mar", "apr", "may", "jun",
             "jul", "aug", "sep", "oct", "nov", "dec"],
            start=1,
        )
    }
}

In [12]:
# Apply the month mapping to both frames; values not listed in the mapping
# are left as they are.
train_set, test_set = (frame.replace(mon) for frame in (train_set, test_set))

In [13]:
# Re-inspect dtypes and summary statistics after the month replacement.
train_set.info()
train_set.describe().round(2)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12870 entries, 13829 to 20137
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        12870 non-null  int64 
 1   job        12870 non-null  object
 2   marital    12870 non-null  object
 3   education  12870 non-null  object
 4   default    12870 non-null  object
 5   balance    12870 non-null  int64 
 6   housing    12870 non-null  object
 7   loan       12870 non-null  object
 8   contact    12870 non-null  object
 9   day        12870 non-null  int64 
 10  month      12870 non-null  int64 
 11  campaign   12870 non-null  int64 
 12  pdays      12870 non-null  int64 
 13  previous   12870 non-null  int64 
 14  poutcome   12870 non-null  object
 15  y          12870 non-null  object
dtypes: int64(7), object(9)
memory usage: 1.7+ MB


Unnamed: 0,age,balance,day,month,campaign,pdays,previous
count,12870.0,12870.0,12870.0,12870.0,12870.0,12870.0,12870.0
mean,41.09,1483.77,15.64,6.18,2.66,45.56,0.69
std,11.31,3311.06,8.37,2.49,2.86,104.45,2.05
min,18.0,-6847.0,1.0,1.0,1.0,-1.0,0.0
25%,32.0,102.0,8.0,5.0,1.0,-1.0,0.0
50%,39.0,515.0,16.0,6.0,2.0,-1.0,0.0
75%,49.0,1591.75,21.0,8.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,12.0,43.0,854.0,58.0


Define a function that one-hot encodes the categorical columns and splits the result into the feature matrix `X` and the target `y`.

In [14]:
def encode_data(dataset : pd.DataFrame
               )-> Tuple[np.array, np.array]:
    """One-hot encode the categorical columns and split features from target.

    Args:
        dataset: Frame containing at least the columns ``job``, ``marital``,
            ``education``, ``contact``, ``poutcome``, ``default``,
            ``housing``, ``loan`` and ``y``. The frame is not modified.

    Returns:
        ``(X, y)`` where ``X`` holds the values of every encoded column except
        the last, and ``y`` holds the values of the last encoded column.
    """
    # These columns get one indicator column per category.
    full_indicator_cols = ["job", "marital", "education", "contact", "poutcome"]
    # These columns drop their first category level. "y" is listed last, so
    # its indicator ends up as the final column of the encoded frame.
    drop_first_cols = ["default", "housing", "loan", "y"]

    encoded = pd.get_dummies(
        pd.get_dummies(dataset, columns=full_indicator_cols),
        columns=drop_first_cols,
        drop_first=True,
    )

    features = encoded.iloc[:, :-1]
    target = encoded.iloc[:, -1]
    return features.values, target.values

In [15]:
X_train, y_train = encode_data(train_set)
X_test, y_test = encode_data(test_set)

# The two frames are one-hot encoded independently, so a category present in
# only one of them would give the matrices different widths and break
# prediction later. Fail here with a clear message instead.
# NOTE(review): equal widths do not prove the columns line up one-to-one —
# confirm that train and test contain the same category levels.
assert X_train.shape[1] == X_test.shape[1], (
    f"train has {X_train.shape[1]} encoded features but test has {X_test.shape[1]}"
)

In [16]:
# Type alias for the three estimators compared in Part 2. Despite the alias
# name, all three are scikit-learn classifiers.
ModelRegressors = Union[LinearSVC, LogisticRegression, KNeighborsClassifier]

In [17]:
def get_all_results(pred: np.array, y_true: np.array, model_name: str,scale : bool, best_params:dict) -> None:
    """Print a fixed-width table of classification metrics for the predictions.

    Args:
        pred: Labels predicted by the model.
        y_true: Ground-truth labels, aligned with ``pred``.
        model_name: Title centred above the table.
        scale: Whether the features were standardised; echoed in the header.
        best_params: Best hyper-parameters found by the search; echoed below
            the header.

    The table reports accuracy, macro/micro precision, macro/micro recall and
    the F1 score (scikit-learn's default averaging), each to four decimals.
    """
    accuracy = metrics.accuracy_score(y_true, pred)
    macro_precision = metrics.precision_score(y_true, pred, average="macro")
    micro_precision = metrics.precision_score(y_true, pred, average="micro")
    macro_recall = metrics.recall_score(y_true, pred, average="macro")
    micro_recall = metrics.recall_score(y_true, pred, average="micro")
    f1 = metrics.f1_score(y_true, pred)

    print(f"{model_name:^27}(Scale data : {scale})")
    print (f"{'best_params':<19} |{(best_params)}")
    print("*" * 27)
    print(f"{'Method':<19}| Result")
    print("—" * 27)
    print(f"{'Accuracy':<19}| {accuracy:.4f}")
    print(f"{'macro_precision':<19}| {macro_precision:.4f}")
    print(f"{'micro_precision':<19}| {micro_precision:.4f}")
    print(f"{'macro_recall':<19}| {macro_recall:.4f}")
    # This row used to be labelled "micro_precision", which made the table
    # show two rows with that label and none for micro recall.
    print(f"{'micro_recall':<19}| {micro_recall:.4f}")
    print(f"{'f1_score':<19}| {f1:.4f}")
    print("—" * 27)

In [18]:
def fit_model(
    Model: ModelRegressors,
    parameter :dict,
    scale: bool ,
    model_name: str = "",
) -> None:
    """Grid-search ``Model`` on the training data and print test metrics.

    Args:
        Model: Estimator instance placed at the end of the pipeline.
        parameter: Parameter grid for ``GridSearchCV``; keys must use the
            step names that ``make_pipeline`` generates.
        scale: When true, a ``StandardScaler`` step precedes the estimator.
        model_name: Title passed on to ``get_all_results``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` rather than taking the data as arguments.

    NOTE: this redefines the ``fit_model`` from Part 1 with a different
    signature; after this cell runs, the Part 1 version is no longer
    reachable under this name.
    """
    steps = [StandardScaler(), Model] if scale else [Model]
    search = GridSearchCV(
        make_pipeline(*steps),
        param_grid=parameter,
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    test_predictions = search.predict(X_test)

    get_all_results(
        pred=test_predictions,
        y_true=y_test,
        model_name=model_name,
        scale=scale,
        best_params=search.best_params_,
    )

Define a function that runs the grid search for each of the three models.

In [19]:
def do_question(scale:bool ) -> None:
    """Grid-search and report LinearSVC, LogisticRegression and KNN in turn.

    Args:
        scale: Forwarded to ``fit_model``; controls whether the features are
            standardised before the estimator.
    """
    # (estimator, parameter grid, title) for each run, in execution order.
    # Grid keys use the lowercase class names that make_pipeline assigns.
    experiments = (
        (
            LinearSVC(dual = False),
            {"linearsvc__C": np.logspace(-5,2,8)},
            "LinearSVC Model",
        ),
        (
            LogisticRegression(solver = 'lbfgs',n_jobs=-1,max_iter = 5000),
            {"logisticregression__C" : np.logspace(-3,3,7)},
            "LogisticRegression Model",
        ),
        (
            KNeighborsClassifier(n_jobs= -1),
            {"kneighborsclassifier__n_neighbors" : np.arange(1,33,2)},
            "KNeighborsClassifier Model",
        ),
    )

    for estimator, grid, title in experiments:
        fit_model(Model=estimator, parameter=grid, scale=scale, model_name=title)

In [20]:
# Run all three grid searches without the StandardScaler step.
do_question(False)

      LinearSVC Model      (Scale data : False)
best_params         |{'linearsvc__C': 1.0}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7511
macro_precision    | 0.7701
micro_precision    | 0.7511
macro_recall       | 0.6147
micro_precision    | 0.7511
f1_score           | 0.3911
———————————————————————————
 LogisticRegression Model  (Scale data : False)
best_params         |{'logisticregression__C': 1.0}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7537
macro_precision    | 0.7557
micro_precision    | 0.7537
macro_recall       | 0.6262
micro_precision    | 0.7537
f1_score           | 0.4240
———————————————————————————
KNeighborsClassifier Model (Scale data : False)
best_params         |{'kneighborsclassifier__n_neighbors': 23}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7152
macro_precision    | 0.663

In [21]:
# Run all three grid searches with a StandardScaler step before the estimator.
do_question(True)

      LinearSVC Model      (Scale data : True)
best_params         |{'linearsvc__C': 0.0001}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7551
macro_precision    | 0.7508
micro_precision    | 0.7551
macro_recall       | 0.6321
micro_precision    | 0.7551
f1_score           | 0.4395
———————————————————————————
 LogisticRegression Model  (Scale data : True)
best_params         |{'logisticregression__C': 0.01}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7532
macro_precision    | 0.7529
micro_precision    | 0.7532
macro_recall       | 0.6265
micro_precision    | 0.7532
f1_score           | 0.4254
———————————————————————————
KNeighborsClassifier Model (Scale data : True)
best_params         |{'kneighborsclassifier__n_neighbors': 29}
***************************
Method             | Result
———————————————————————————
Accuracy           | 0.7523
macro_precision    | 0.72