In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the raw customer table from the CSV that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="customer.csv")

In [4]:
# Preview the first ten rows instead of rendering the entire frame: a full
# dump bloats the saved notebook, and a short preview is enough to see the
# columns and the kind of values they hold.
df.head(10)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No
5,31,Female,Average,School,Yes
6,18,Male,Good,School,No
7,60,Female,Poor,School,Yes
8,65,Female,Average,UG,No
9,74,Male,Good,UG,Yes


In [5]:
# Narrow the frame to the two categorical features plus the target column;
# the cells shown below only ever read these three.
kept_columns = ["review", "education", "purchased"]
df = df[kept_columns]

# Quick sanity check: (rows, columns) followed by the first few rows.
print(df.shape)
df.head()

(50, 3)


Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [6]:
from sklearn.model_selection import train_test_split

# Target (Yes/No) and the two categorical feature columns.
y = df["purchased"]
X = df[["review", "education"]]

# Hold out 20% of the rows for testing. Stratifying on y keeps the Yes/No
# proportions comparable in both partitions, and the fixed random_state makes
# the split repeatable from run to run.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each partition, then peek at the training features.
for label, part in (("X_train:", X_train), ("X_test :", X_test)):
    print(label, part.shape)
print("\nSample X_train:")
print(X_train.head())


X_train: (40, 2)
X_test : (10, 2)

Sample X_train:
     review education
16     Poor        UG
13  Average    School
47     Good        PG
27     Poor        PG
2      Good        PG


In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit low-to-high ordering for each feature. The lists are passed in the
# same order as the columns the encoder receives: review first, then education.
review_order = ["Poor", "Average", "Good"]
education_order = ["School", "UG", "PG"]

oe = OrdinalEncoder(categories=[review_order, education_order])

In [10]:
# Fit the encoder on the training features only, then encode them.
X_train_encoded = oe.fit(X_train).transform(X_train)

# Reuse the already-fitted encoder for the test features so both partitions
# share exactly the same category -> number mapping.
X_test_encoded = oe.transform(X_test)

# Show a small slice of the encoded matrix rather than the whole array.
print("✅ Encoded X_train (first 5 rows):")
print(X_train_encoded[:5])

# The per-column category order the encoder is using.
print("\n✅ Categories learned:")
print(oe.categories_)

✅ Encoded X_train (first 5 rows):
[[0. 1.]
 [1. 0.]
 [2. 2.]
 [0. 2.]
 [2. 2.]]

✅ Categories learned:
[array(['Poor', 'Average', 'Good'], dtype=object), array(['School', 'UG', 'PG'], dtype=object)]


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the label -> integer mapping from the training target only.
le.fit(y_train)

# Apply that single mapping to both partitions.
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# classes_ lists the labels in the order of their integer codes: ['No', 'Yes']
print("✅ Target classes order:", le.classes_)
print("✅ y_train encoded first 10:", y_train_encoded[:10])


✅ Target classes order: ['No' 'Yes']
✅ y_train encoded first 10: [1 0 1 0 0 0 1 0 1 1]


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Default-configured logistic regression, fitted on the encoded training data
# (fit() returns the estimator itself, so construction and training chain).
model = LogisticRegression().fit(X_train_encoded, y_train_encoded)

# Score the held-out partition against its encoded labels.
y_pred = model.predict(X_test_encoded)
test_accuracy = accuracy_score(y_test_encoded, y_pred)

print("✅ Accuracy:", test_accuracy)


✅ Accuracy: 0.6


In [13]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the test predictions: one row per true class and one
# column per predicted class, in encoded order (0, 1).
cm = confusion_matrix(y_test_encoded, y_pred)
print("✅ Confusion Matrix:")
print(cm)

# Per-class precision / recall / F1 plus the averaged summaries.
report = classification_report(y_test_encoded, y_pred)
print("\n✅ Classification Report:")
print(report)


✅ Confusion Matrix:
[[4 1]
 [3 2]]

✅ Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       0.67      0.40      0.50         5

    accuracy                           0.60        10
   macro avg       0.62      0.60      0.58        10
weighted avg       0.62      0.60      0.58        10



In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Same workflow as the manual cells, packaged as a single Pipeline so the
# encoding is fitted on the training partition as part of pipe.fit().

# Features and target
feature_columns = ["review", "education"]
X = df[feature_columns]
y = df["purchased"]

# Stratified 80/20 split with a fixed seed so the partitions are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Ordinal encoding with an explicit low-to-high order for each column.
ordinal_step = OrdinalEncoder(
    categories=[
        ["Poor", "Average", "Good"],
        ["School", "UG", "PG"],
    ]
)
preprocessor = ColumnTransformer(
    transformers=[("ord", ordinal_step, feature_columns)]
)

# Preprocessing followed by the classifier.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression()),
])

# Train on the raw (string) features; the pipeline encodes them internally.
pipe.fit(X_train, y_train)

# The target was not label-encoded here, so predictions are compared against
# y_test directly.
y_pred_pipe = pipe.predict(X_test)

print("✅ Pipeline Accuracy:", accuracy_score(y_test, y_pred_pipe))


✅ Pipeline Accuracy: 0.6
