In [1]:
#DATA PREPROCESSING

#importing required libraries
import numpy as np
import pandas as pd


In [2]:
#creating dataframe
df=pd.DataFrame({
    'age':[25,30,np.nan,35,40],
    'salary':[50000,60000,np.nan,70000,55000],
    'city':['New York','San Francisco','Los Angeles','Chicago',np.nan],
    'gender':['M','F','M','F','M']
})
print(df)

    age   salary           city gender
0  25.0  50000.0       New York      M
1  30.0  60000.0  San Francisco      F
2   NaN      NaN    Los Angeles      M
3  35.0  70000.0        Chicago      F
4  40.0  55000.0            NaN      M


In [3]:
#handling missing age and salary
from sklearn.impute import SimpleImputer
i=SimpleImputer(missing_values=np.nan,strategy='mean')
df[['age','salary']]=i.fit_transform(df[['age','salary']])
print(df)

    age   salary           city gender
0  25.0  50000.0       New York      M
1  30.0  60000.0  San Francisco      F
2  32.5  58750.0    Los Angeles      M
3  35.0  70000.0        Chicago      F
4  40.0  55000.0            NaN      M


In [4]:
#handling missing city values
ci=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
df['city']=ci.fit_transform(df[['city']]).ravel()
print(df)

    age   salary           city gender
0  25.0  50000.0       New York      M
1  30.0  60000.0  San Francisco      F
2  32.5  58750.0    Los Angeles      M
3  35.0  70000.0        Chicago      F
4  40.0  55000.0        Chicago      M


In [5]:
#encoding categorical data
from sklearn.preprocessing import (LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder)
le=LabelEncoder()
df['gender']=le.fit_transform(df['gender'])
print(df)

    age   salary           city  gender
0  25.0  50000.0       New York       1
1  30.0  60000.0  San Francisco       0
2  32.5  58750.0    Los Angeles       1
3  35.0  70000.0        Chicago       0
4  40.0  55000.0        Chicago       1


In [6]:
#one hot encoding
ohe=OneHotEncoder(sparse_output=False)
city_encoded=ohe.fit_transform(df[['city']])
city_df=pd.DataFrame(city_encoded,columns=ohe.get_feature_names_out(['city']))
df=pd.concat([df,city_df],axis=1)
print(df)

    age   salary           city  gender  city_Chicago  city_Los Angeles  \
0  25.0  50000.0       New York       1           0.0               0.0   
1  30.0  60000.0  San Francisco       0           0.0               0.0   
2  32.5  58750.0    Los Angeles       1           0.0               1.0   
3  35.0  70000.0        Chicago       0           1.0               0.0   
4  40.0  55000.0        Chicago       1           1.0               0.0   

   city_New York  city_San Francisco  
0            1.0                 0.0  
1            0.0                 1.0  
2            0.0                 0.0  
3            0.0                 0.0  
4            0.0                 0.0  


In [7]:
#feature scaling: normalization
scaler=MinMaxScaler()
df[['age','salary']]=scaler.fit_transform(df[['age','salary']])

In [8]:
#feature scaling: standardization
scaler=StandardScaler()
df[['age','salary']]=scaler.fit_transform(df[['age','salary']])

In [9]:
#train test and split
from sklearn.model_selection import train_test_split
x=df.drop('gender',axis=1)
y=df['gender']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
print(x_train)
print(x_test)
print(y_train)
print(y_test)

            age    salary         city  city_Chicago  city_Los Angeles  \
4  1.500000e+00 -0.566947      Chicago           1.0               0.0   
2 -3.330669e-16  0.000000  Los Angeles           0.0               1.0   
0 -1.500000e+00 -1.322876     New York           0.0               0.0   
3  5.000000e-01  1.700840      Chicago           1.0               0.0   

   city_New York  city_San Francisco  
4            0.0                 0.0  
2            0.0                 0.0  
0            1.0                 0.0  
3            0.0                 0.0  
   age    salary           city  city_Chicago  city_Los Angeles  \
1 -0.5  0.188982  San Francisco           0.0               0.0   

   city_New York  city_San Francisco  
1            0.0                 1.0  
4    1
2    1
0    1
3    0
Name: gender, dtype: int64
1    0
Name: gender, dtype: int64


In [10]:
#feature engineering
df['salperage']=df['salary']/df['age']
print(df)

            age    salary           city  gender  city_Chicago  \
0 -1.500000e+00 -1.322876       New York       1           0.0   
1 -5.000000e-01  0.188982  San Francisco       0           0.0   
2 -3.330669e-16  0.000000    Los Angeles       1           0.0   
3  5.000000e-01  1.700840        Chicago       0           1.0   
4  1.500000e+00 -0.566947        Chicago       1           1.0   

   city_Los Angeles  city_New York  city_San Francisco  salperage  
0               0.0            1.0                 0.0   0.881917  
1               0.0            0.0                 1.0  -0.377964  
2               1.0            0.0                 0.0  -0.000000  
3               0.0            0.0                 0.0   3.401680  
4               0.0            0.0                 0.0  -0.377964  
