In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Load the raw table; the relative path is resolved against the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer='gender.csv')

In [4]:
# Preview the first five rows to sanity-check the parse.
df.head(n=5)

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color,Unnamed: 9
0,male,32,175,70,Software Engineer,Master's Degree,Married,75000,Blue,
1,male,25,182,85,Sales Representative,Bachelor's Degree,Single,45000,Green,
2,female,41,160,62,Doctor,Doctorate Degree,Married,120000,Purple,
3,male,38,178,79,Lawyer,Bachelor's Degree,Single,90000,Red,
4,female,29,165,58,Graphic Designer,Associate's Degree,Single,35000,Yellow,


In [5]:
# (rows, columns) of the raw frame.
df.shape

(131, 10)

In [6]:
# List the header names exactly as they were parsed from the CSV.
df.columns

Index([' Gender', ' Age', ' Height (cm)', ' Weight (kg)', ' Occupation',
       ' Education Level', ' Marital Status', ' Income (USD)',
       ' Favorite Color', 'Unnamed: 9'],
      dtype='object')

In [7]:
# Normalise the header: trim surrounding whitespace from every column name.
df.columns = df.columns.str.strip()

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Height (cm),Weight (kg),Income (USD),Unnamed: 9
count,131.0,131.0,131.0,131.0,0.0
mean,34.564885,173.198473,71.458015,93206.10687,
std,5.984723,8.045467,12.648052,74045.382919,
min,24.0,160.0,50.0,30000.0,
25%,29.0,166.0,60.0,55000.0,
50%,34.0,175.0,75.0,75000.0,
75%,39.0,180.5,83.0,100000.0,
max,52.0,190.0,94.0,500000.0,


In [9]:
# Missing-value count per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

Gender               0
Age                  0
Height (cm)          0
Weight (kg)          0
Occupation           0
Education Level      0
Marital Status       0
Income (USD)         0
Favorite Color       0
Unnamed: 9         131
dtype: int64

In [10]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Gender           131 non-null    object 
 1   Age              131 non-null    int64  
 2   Height (cm)      131 non-null    int64  
 3   Weight (kg)      131 non-null    int64  
 4   Occupation       131 non-null    object 
 5   Education Level  131 non-null    object 
 6   Marital Status   131 non-null    object 
 7   Income (USD)     131 non-null    int64  
 8   Favorite Color   131 non-null    object 
 9   Unnamed: 9       0 non-null      float64
dtypes: float64(1), int64(4), object(5)
memory usage: 10.4+ KB


In [11]:
# Split the columns by dtype.  `select_dtypes` recognises every numeric width,
# not only the literal 'int64' / 'float64' names.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
cat_col = [col for col in df.columns if col not in numeric_cols]

# Trim surrounding whitespace from the text values (presumably they carry the
# same leading space as the raw header names — verify against the CSV).
# The vectorised `.str.strip()` passes missing values through as NaN, whereas
# calling `.strip()` on each element raises AttributeError on the first NaN.
for col in cat_col:
    df[col] = df[col].str.strip()

In [12]:
# Work on an independent deep copy so later edits to `data` never touch `df`.
data = df.copy(deep=True)

In [13]:
# 'Unnamed: 9' has no non-null values, so it carries no information.
# Build `data` from `df` with a non-mutating drop: the cell can then be re-run
# safely, whereas an in-place drop raises KeyError on the second run because
# the column is already gone.
data = df.drop(columns=['Unnamed: 9'])

In [14]:
# Confirm the empty trailing column is gone.
data.head(n=5)

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color
0,male,32,175,70,Software Engineer,Master's Degree,Married,75000,Blue
1,male,25,182,85,Sales Representative,Bachelor's Degree,Single,45000,Green
2,female,41,160,62,Doctor,Doctorate Degree,Married,120000,Purple
3,male,38,178,79,Lawyer,Bachelor's Degree,Single,90000,Red
4,female,29,165,58,Graphic Designer,Associate's Degree,Single,35000,Yellow


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
# One independent encoder per categorical column, so each can be fitted on
# (and later queried for) its own set of classes.
(
    le_Gender,
    le_Occupation,
    le_Education,
    le_MaritalStatus,
    le_Color,
) = (LabelEncoder() for _ in range(5))

In [17]:
# Integer-encode each text column into a new lower-case column.
# Each entry is (new column, encoder, source column); the list order fixes the
# order in which the new columns are appended to `data`.
# NOTE(review): LabelEncoder numbers classes in sorted order, which imposes an
# arbitrary ordering on nominal features such as occupation or favourite
# colour — confirm this is acceptable for the downstream model.
encoding_plan = [
    ('gender', le_Gender, 'Gender'),
    ('occupation', le_Occupation, 'Occupation'),
    ('education level', le_Education, 'Education Level'),
    ('marital status', le_MaritalStatus, 'Marital Status'),
    ('favorite color', le_Color, 'Favorite Color'),
]
for target_col, encoder, source_col in encoding_plan:
    data[target_col] = encoder.fit_transform(data[source_col])


In [18]:
# The integer-encoded columns replace the original text columns.
# Rebind the result instead of using `inplace=True`: the non-mutating form is
# the one pandas recommends, and `columns=` states the axis explicitly.
data = data.drop(
    columns=['Gender', 'Occupation', 'Education Level', 'Marital Status', 'Favorite Color']
)

In [19]:
# Check the fully numeric frame that will feed the model.
data.head(n=5)

Unnamed: 0,Age,Height (cm),Weight (kg),Income (USD),gender,occupation,education level,marital status,favorite color
0,32,175,70,75000,1,16,3,1,1
1,25,182,85,45000,1,14,1,2,2
2,41,160,62,120000,0,6,2,1,6
3,38,178,79,90000,1,10,1,2,7
4,29,165,58,35000,0,8,0,2,8


In [20]:
# Feature matrix: every column except the encoded target.
X = data.drop(columns='gender')

In [21]:
# Target vector: the integer-encoded gender label.
y = data.loc[:, 'gender']

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Seed the tree so the fit is reproducible: scikit-learn randomly permutes the
# candidate features at each split, so an unseeded tree can differ between
# runs when several splits are equally good.  The seed matches the one used
# for the train/test split.
dc = DecisionTreeClassifier(random_state=42)
dc.fit(X_train, y_train)

In [24]:
# Predict on the held-out rows and report the fraction classified correctly.
# NOTE(review): a perfect score on a hold-out of only 20% of 131 rows is worth
# double-checking — e.g. for duplicated rows shared between train and test.
y_pred = dc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [25]:
# Spot check: feature values of the first held-out sample.
X_test.iloc[0]

Age                   30
Height (cm)          182
Weight (kg)           83
Income (USD)       95000
occupation             9
education level        1
marital status         2
favorite color         6
Name: 55, dtype: int64

In [26]:
# True encoded label of that same first held-out sample.
y_test.iloc[0]

1

In [27]:
# Model's prediction for the first held-out sample, to compare with the true label.
y_pred[0]

1