### Step A: Data Preprocessing

#### 1.Import the libarries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### 2.import the data

In [2]:
df=pd.read_csv('C:/Users/SUBHAM/Desktop/projects/ML/Data/Data.csv')
print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [3]:
#Check missing data
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

#### 3.Divide the data set into input and output, i.e, Feature Matrix X and column Y

In [4]:
X=df.iloc[:,0:-1].values
Y=df.iloc[:,-1].values

In [5]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [6]:
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


#### 4.Take care of missing data

In [7]:
from sklearn.impute import SimpleImputer

In [8]:
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')

In [9]:
imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

In [10]:
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


#### 5.Converting categorical data to numerical values

##### 5.a.Output Column Vector

In [11]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
Y=np.array(le.fit_transform(Y))

In [12]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


##### 5.b.Feature Matrix

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [14]:
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder="passthrough")
X=np.array(ct.fit_transform(X))

In [15]:
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [16]:
print(Y)

[0 1 0 0 1 1 0 1 0 1]


### Step B: Building a classifier model using Logistic Regression

#### 6. Spliting the data set into training and testing data

In [17]:
from sklearn.model_selection import train_test_split
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X,Y,test_size=0.2)
print(Xtrain)

[[1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 27.0 48000.0]]


In [18]:
from sklearn.linear_model import LogisticRegression
LoR=LogisticRegression()
LoR.fit(Xtrain,Ytrain)

In [19]:
Yestimated=LoR.predict(Xtest)

In [20]:
print(Yestimated)

[0 0]


In [21]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,d2_absolute_error_score
a=mean_squared_error(Ytest,Yestimated)
b=mean_absolute_percentage_error(Ytest,Yestimated)
c=d2_absolute_error_score(Ytest,Yestimated)
d=mean_absolute_error(Ytest,Yestimated)
print(f"Mean Squared Error:{a}")
print(f"Mean Absolute Percentage Error:{b}")
print(f"Absolute Error Score:{c}")
print(f"Mean Absolute Error:{d}")

Mean Squared Error:0.5
Mean Absolute Percentage Error:0.5
Absolute Error Score:0.0
Mean Absolute Error:0.5
