In [32]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
# lets load data from github raw url 
data_url = "https://raw.githubusercontent.com/redashu/Datasets/refs/heads/master/data_preprocess1.csv"

In [4]:
df=pd.read_csv(data_url)

In [5]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
# we can check basic meta data info about file 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    30 non-null     object 
 1   Age        29 non-null     float64
 2   Salary     28 non-null     float64
 3   Purchased  30 non-null     object 
dtypes: float64(2), object(2)
memory usage: 1.1+ KB


In [7]:
# accessing particular column 
df['Country']

0      France
1       Spain
2     Germany
3       Spain
4     Germany
5      France
6       Spain
7      France
8     Germany
9      France
10    Germany
11     France
12      Spain
13    Germany
14     France
15      Spain
16    Germany
17     France
18      Spain
19    Germany
20     France
21      Spain
22    Germany
23     France
24      Spain
25    Germany
26     France
27      Spain
28    Germany
29     France
Name: Country, dtype: object

In [11]:
# selecting first 3 all columns data 
inputs = df.iloc[:,0:3].values

In [14]:
 output = df.iloc[:,-1].values

In [17]:
# now for handling missing values in age and sal column
ashu_impute = SimpleImputer(strategy='mean',missing_values=np.nan)

In [19]:
ashu_impute.fit(inputs[:,1:3]) # calculating missing values 

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [21]:
inputs[:,1:3]=ashu_impute.transform(inputs[:,1:3]) # making changes and storing it 
print(inputs)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 66071.42857142857]
 ['France' 35.0 58000.0]
 ['Spain' 39.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]
 ['Germany' 28.0 49000.0]
 ['France' 42.0 75000.0]
 ['Spain' 33.0 56000.0]
 ['Germany' 45.0 77000.0]
 ['France' 29.0 51000.0]
 ['Spain' 51.0 85000.0]
 ['Germany' 36.0 66071.42857142857]
 ['France' 31.0 55000.0]
 ['Spain' 39.0 63000.0]
 ['Germany' 47.0 80000.0]
 ['France' 26.0 47000.0]
 ['Spain' 32.0 53000.0]
 ['Germany' 41.0 71000.0]
 ['France' 49.0 82000.0]
 ['Spain' 34.0 57000.0]
 ['Germany' 43.0 74000.0]
 ['France' 52.0 88000.0]
 ['Spain' 46.0 78000.0]
 ['Germany' 25.0 45000.0]
 ['France' 53.0 90000.0]]


In [22]:
# output values in string 
print(output)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes'
 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No'
 'Yes' 'No' 'Yes' 'No']


In [24]:
# implemnting label encoding to convert String (2 category) to numbers
le = LabelEncoder()
output1 = le.fit_transform(output)
print(output1)

[0 1 0 0 1 1 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1 0 1 0 1 0 1 0]


In [26]:
# data preprocessing using single column having multi category
# categorical values handing in data preprocessing 
ct=ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
#     transforms[encoding,method_ofecn,columnlocaiton], keep remaining data column values unchanged

In [30]:
# apply it
inputs= ct.fit_transform(inputs)

In [28]:
# finally our input data and output data in in numeric format which can be used by ML algo's
output1

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0])

In [31]:
inputs

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 66071.42857142857],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 39.0, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 28.0, 49000.0],
       [1.0, 0.0, 0.0, 42.0, 75000.0],
       [0.0, 0.0, 1.0, 33.0, 56000.0],
       [0.0, 1.0, 0.0, 45.0, 77000.0],
       [1.0, 0.0, 0.0, 29.0, 51000.0],
       [0.0, 0.0, 1.0, 51.0, 85000.0],
       [0.0, 1.0, 0.0, 36.0, 66071.42857142857],
       [1.0, 0.0, 0.0, 31.0, 55000.0],
       [0.0, 0.0, 1.0, 39.0, 63000.0],
       [0.0, 1.0, 0.0, 47.0, 80000.0],
       [1.0, 0.0, 0.0, 26.0, 47000.0],
       [0.0, 0.0, 1.0, 32.0, 53000.0],
       [0.0, 1.0, 0.0, 41.0, 71000.0],
       [1.0, 0.0, 0.0, 49.0, 82000.0],
       [0.0, 0.0, 1.0, 34.0, 57000.0],
     

In [33]:
# lets split data into training and testing phase 
(input_90,input_10,output_90,output_10)=train_test_split(inputs,output1,test_size=0.1)
# size range 0 to 1 -- example if 0.1 means 10%

In [34]:
input_10

array([[1.0, 0.0, 0.0, 49.0, 82000.0],
       [0.0, 0.0, 1.0, 32.0, 53000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0]], dtype=object)

In [35]:
input_90

array([[0.0, 1.0, 0.0, 25.0, 45000.0],
       [1.0, 0.0, 0.0, 42.0, 75000.0],
       [0.0, 0.0, 1.0, 33.0, 56000.0],
       [0.0, 1.0, 0.0, 40.0, 66071.42857142857],
       [0.0, 1.0, 0.0, 41.0, 71000.0],
       [1.0, 0.0, 0.0, 53.0, 90000.0],
       [1.0, 0.0, 0.0, 31.0, 55000.0],
       [0.0, 0.0, 1.0, 39.0, 52000.0],
       [0.0, 0.0, 1.0, 39.0, 63000.0],
       [0.0, 0.0, 1.0, 34.0, 57000.0],
       [0.0, 1.0, 0.0, 43.0, 74000.0],
       [1.0, 0.0, 0.0, 52.0, 88000.0],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 1.0, 0.0, 45.0, 77000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 0.0, 1.0, 46.0, 78000.0],
       [0.0, 1.0, 0.0, 28.0, 49000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 26.0, 47000.0],
       [1.0, 0.0, 0.0, 29.0, 51000.0],
       [0.0, 0.0, 1.0, 51.0, 85000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 1.