In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import warnings
warnings.filterwarnings("ignore")


In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/YBIFoundation/Dataset/main/Customer%20Purchase.csv')

In [4]:
df.head()

Unnamed: 0,Customer ID,Age,Gender,Education,Review,Purchased
0,1021,30,Female,School,Average,No
1,1022,68,Female,UG,Poor,No
2,1023,70,Female,PG,Good,No
3,1024,72,Female,PG,Good,No
4,1025,16,Female,UG,Average,No


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Customer ID  50 non-null     int64 
 1   Age          50 non-null     int64 
 2   Gender       50 non-null     object
 3   Education    50 non-null     object
 4   Review       50 non-null     object
 5   Purchased    50 non-null     object
dtypes: int64(2), object(4)
memory usage: 2.5+ KB


In [6]:
df.describe()

Unnamed: 0,Customer ID,Age
count,50.0,50.0
mean,1045.5,54.16
std,14.57738,25.658161
min,1021.0,15.0
25%,1033.25,30.25
50%,1045.5,57.0
75%,1057.75,74.0
max,1070.0,98.0


In [7]:
df.columns

Index(['Customer ID', 'Age', 'Gender', 'Education', 'Review', 'Purchased'], dtype='object')

In [8]:
df.shape

(50, 6)

In [22]:
pd.DataFrame({'unicos':df.nunique(),
              'missing': df.isna().sum()/df.count(),
              'tipo':df.dtypes})


Unnamed: 0,unicos,missing,tipo
Customer ID,50,0.0,int64
Age,41,0.0,int64
Gender,2,0.0,object
Education,3,0.0,object
Review,3,0.0,object
Purchased,2,0.0,object


In [9]:
x = df[['Customer ID', 'Age', 'Gender', 'Education', 'Review']]

In [10]:
x.shape

(50, 5)

In [11]:
x

Unnamed: 0,Customer ID,Age,Gender,Education,Review
0,1021,30,Female,School,Average
1,1022,68,Female,UG,Poor
2,1023,70,Female,PG,Good
3,1024,72,Female,PG,Good
4,1025,16,Female,UG,Average
5,1026,31,Female,School,Average
6,1027,18,Male,School,Good
7,1028,60,Female,School,Poor
8,1029,65,Female,UG,Average
9,1030,74,Male,UG,Good


In [29]:
x.replace({"Gender":{"Male":0,"Female":1}},inplace=True)
x.replace({"Education":{"School":0,"UG":1,"PG":2}},inplace=True)
x.replace({"Review":{"Poor":0,"Average":1,"Good":2}},inplace=True)
x

Unnamed: 0,Customer ID,Age,Gender,Education,Review
0,1021,30,1,0,1
1,1022,68,1,1,0
2,1023,70,1,2,2
3,1024,72,1,2,2
4,1025,16,1,1,1
5,1026,31,1,0,1
6,1027,18,0,0,2
7,1028,60,1,0,0
8,1029,65,1,1,1
9,1030,74,0,1,2


In [30]:
y = df['Purchased']

In [31]:
y

0      No
1      No
2      No
3      No
4      No
5     Yes
6      No
7     Yes
8      No
9     Yes
10    Yes
11    Yes
12     No
13     No
14    Yes
15     No
16    Yes
17    Yes
18     No
19    Yes
20    Yes
21     No
22    Yes
23     No
24    Yes
25     No
26     No
27     No
28     No
29    Yes
30     No
31    Yes
32    Yes
33    Yes
34     No
35    Yes
36    Yes
37    Yes
38     No
39     No
40     No
41    Yes
42    Yes
43     No
44     No
45    Yes
46     No
47    Yes
48    Yes
49     No
Name: Purchased, dtype: object

In [32]:
y.shape

(50,)

In [33]:
xtr,xte,ytr,yte = train_test_split(x,y,test_size=0.2,random_state=2529)

In [34]:
xtr,xte,ytr,yte

(    Customer ID  Age  Gender  Education  Review
 6          1027   18       0          0       2
 1          1022   68       1          1       0
 12         1033   51       0          0       0
 5          1026   31       1          0       1
 26         1047   53       1          2       0
 29         1050   83       1          1       1
 16         1037   59       0          1       0
 34         1055   86       0          0       1
 44         1065   77       1          1       1
 14         1035   15       0          2       0
 17         1038   22       1          1       0
 9          1030   74       0          1       2
 31         1052   22       1          0       0
 30         1051   73       0          1       1
 4          1025   16       1          1       1
 3          1024   72       1          2       2
 18         1039   19       0          0       2
 33         1054   89       1          2       2
 10         1031   98       1          1       2
 39         1060   7

In [35]:
model = LogisticRegression(max_iter=10000)

In [36]:
model.fit(xtr,ytr)

In [37]:
ypred = model.predict(xte)

In [38]:
ypred

array(['No', 'No', 'No', 'No', 'No', 'No', 'No', 'Yes', 'No', 'No'],
      dtype=object)

In [39]:
accuracy_score(yte,ypred)

0.2

In [40]:
confusion_matrix(yte,ypred)

array([[2, 1],
       [7, 0]], dtype=int64)

In [41]:
confusion_matrix(yte,ypred)

array([[2, 1],
       [7, 0]], dtype=int64)

In [42]:
print(classification_report(yte,ypred))

              precision    recall  f1-score   support

          No       0.22      0.67      0.33         3
         Yes       0.00      0.00      0.00         7

    accuracy                           0.20        10
   macro avg       0.11      0.33      0.17        10
weighted avg       0.07      0.20      0.10        10

