In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv('../data/diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Data preprocessing
cols_not_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in cols_not_zero:
    data[column] = data[column].replace(0, np.NaN)
    mean = int(data[column].mean(skipna=True))
    data[column] = data[column].replace(np.NaN, mean)
# Split dataset
X = data.iloc[:, 0:8]
y = data.iloc[:, 8]
print(y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [7]:
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)

In [8]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_scale, y_train)

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

print(model.score(X_train_scale, y_train))

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(score, precision, recall, f1)

0.7802607076350093
0.7402597402597403 0.6351351351351351 0.5875 0.6103896103896104


In [11]:
# import Pipeline
from sklearn.pipeline import Pipeline
model_pipeline  = Pipeline(steps=[("Scale_data", StandardScaler()),
                                  ("Logistic Regression Model",
                                  LogisticRegression(C=1.0, max_iter=1000) )])
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)
print(model_pipeline.score(X_test, y_test))

0.6536796536796536


In [12]:
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [13]:
np.random.seed(42)

n_samples = 1000
data = { 
'Gender': np.random.choice (['Male', 'Female'], size=n_samples), 
'Product Type': np.random.choice(['Electronics', 'Clothing', 'Home'' Appliances'], size=n_samples), 
'Age': np.random.randint(18, 65, size=n_samples), 
'Income': np.random.randint (2000, 10000, size=n_samples),
'Customer Satisfaction': np.random.choice([0,1], size=n_samples, p=[0.3,0.7])}
df=pd.DataFrame(data)
df.head()

Unnamed: 0,Gender,Product Type,Age,Income,Customer Satisfaction
0,Male,Clothing,45,8186,1
1,Female,Home Appliances,22,5456,1
2,Male,Electronics,30,9788,1
3,Male,Electronics,34,5506,1
4,Male,Electronics,57,2407,1


In [18]:

X = df[["Gender", "Product Type", "Age", "Income"]]
y = df["Customer Satisfaction"]

X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size=0.3, random_state=42)

In [19]:
categorical_features = ["Gender", "Product Type"]
continious_features = ["Age","Income"]

preprocessor = ColumnTransformer(
    transformers= [
        ("cat", OneHotEncoder(), categorical_features),
        ("num", StandardScaler(), continious_features)
    ]
)

In [21]:
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000))
    ]
)

In [22]:
model_pipeline.fit(X_train, y_train)

joblib.dump(model_pipeline, "Customer_satisfaction_model.pkl")
print("Save Model Successful")

Save Model Successful


In [23]:
loaded_model = joblib.load("Customer_satisfaction_model.pkl")
y_pred = loaded_model.predict(X_test)

accuracy = loaded_model.score(X_test, y_test)
print(accuracy)

0.6966666666666667


In [24]:
from sklearn.preprocessing import FunctionTransformer

def User_function(X):
    X_new = X.copy()
    print("Đã xử lý dữ liệu chuyển vào bước sau")
    return X_new

Data_transform = FunctionTransformer(User_function)

In [25]:
def custom_label_encoding(X):
    X_new = X.copy()
    gender_mapping = {"Male": 0, "Female":1}
    product_type_mapping = {"Electronics": 0, "Clothing":1, "Home Appliances":2}
    
    X_new["Gender"] = X_new["Gender"].map(gender_mapping)
    X_new["Product Type"] = X_new["Product Type"].map(product_type_mapping)
    return X_new

label_encoder_transformer = FunctionTransformer(custom_label_encoding, validate=False)

In [28]:
model_pipeline = Pipeline(
    steps=[
        ("label_encoding", label_encoder_transformer),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(max_iter=1000))
    ]
)
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)
accuracy = model_pipeline.score(X_test, y_test)
print(accuracy)

0.6966666666666667
