In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [3]:
# Location of the raw survey file, relative to the notebook's working directory.
DATA_PATH = 'gender.csv'

# Parse the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first rows to sanity-check how the CSV columns were parsed.
df.head()

Unnamed: 0,Gender,Age,Height (cm),Weight (kg),Occupation,Education Level,Marital Status,Income (USD),Favorite Color,Unnamed: 9
0,male,32,175,70,Software Engineer,Master's Degree,Married,75000,Blue,
1,male,25,182,85,Sales Representative,Bachelor's Degree,Single,45000,Green,
2,female,41,160,62,Doctor,Doctorate Degree,Married,120000,Purple,
3,male,38,178,79,Lawyer,Bachelor's Degree,Single,90000,Red,
4,female,29,165,58,Graphic Designer,Associate's Degree,Single,35000,Yellow,


In [5]:
# (rows, columns) of the raw frame.
df.shape

(131, 10)

In [6]:
# Per-column count of missing values; a column that is entirely NaN carries
# no information and is a candidate for removal.
df.isna().sum()

 Gender               0
 Age                  0
 Height (cm)          0
 Weight (kg)          0
 Occupation           0
 Education Level      0
 Marital Status       0
 Income (USD)         0
 Favorite Color       0
Unnamed: 9          131
dtype: int64

In [7]:
# The CSV header carries padding around the column names (e.g. ' Gender');
# trim it so columns can be addressed by their clean names.
df.columns = df.columns.str.strip()

In [8]:
# 'Unnamed: 9' is entirely NaN (see the missing-value counts above in the
# notebook output), so remove it.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 9'], errors='ignore')

In [9]:
# Columns holding any numeric dtype (not only int64/float64, so narrower or
# nullable numeric columns are classified correctly too).
numeric_cols = df.select_dtypes(include='number').columns.tolist()

# Every remaining column is treated as categorical text.
cat_col = [col for col in df.columns if col not in numeric_cols]

# Trim surrounding whitespace from the category labels (presumably padded the
# same way as the header) so that ' male' and 'male' do not become separate
# levels when one-hot encoded. The vectorised .str accessor passes missing
# values through as NaN instead of raising like a per-element str.strip() would.
for col in cat_col:
    df[col] = df[col].str.strip()

In [10]:
# One-hot encode every categorical column. drop_first=True drops the first
# level of each category to avoid redundant columns, which is why Gender
# collapses to the single indicator column 'Gender_male'.
# dtype is pinned to uint8 (the pre-pandas-2.0 default) so the encoded frame,
# and the 0/1 outputs recorded below, are the same on every pandas version;
# newer releases would otherwise produce boolean columns.
data = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [11]:
# Inspect the encoded frame: numeric columns are passed through unchanged and
# each categorical column is expanded into indicator columns.
data.head()

Unnamed: 0,Age,Height (cm),Weight (kg),Income (USD),Gender_male,Occupation_Analyst,Occupation_Architect,Occupation_Business Analyst,Occupation_Business Consultant,Occupation_CEO,...,Marital Status_Single,Marital Status_Widowed,Favorite Color_Blue,Favorite Color_Green,Favorite Color_Grey,Favorite Color_Orange,Favorite Color_Pink,Favorite Color_Purple,Favorite Color_Red,Favorite Color_Yellow
0,32,175,70,75000,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,25,182,85,45000,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,41,160,62,120000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,38,178,79,90000,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,29,165,58,35000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [12]:
# Feature matrix: every encoded column except the target indicator.
X = data.drop(columns=['Gender_male'])

In [13]:
# Target vector: 1 for male, 0 otherwise (the indicator produced by get_dummies).
y = data['Gender_male']

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): with only 131 rows, consider stratify=y so both classes keep
# their proportions in the test set (this would change the split shown below).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [15]:
# Gaussian Naive Bayes classifier with default hyperparameters.
model = GaussianNB()

In [16]:
# Fit the classifier on the training portion only.
model.fit(X_train, y_train)

In [17]:
# Accuracy on the held-out test set: fraction of test rows whose predicted
# label matches the true label.
accuracy_score(y_test, model.predict(X_test))

1.0

In [18]:
# First five test rows, selected by position (the index labels are shuffled
# by the split, so positional access is made explicit).
X_test.iloc[:5]

Unnamed: 0,Age,Height (cm),Weight (kg),Income (USD),Occupation_Analyst,Occupation_Architect,Occupation_Business Analyst,Occupation_Business Consultant,Occupation_CEO,Occupation_Doctor,...,Marital Status_Single,Marital Status_Widowed,Favorite Color_Blue,Favorite Color_Green,Favorite Color_Grey,Favorite Color_Orange,Favorite Color_Pink,Favorite Color_Purple,Favorite Color_Red,Favorite Color_Yellow
55,30,182,83,95000,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
40,45,187,92,500000,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
19,44,160,58,150000,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
31,38,172,68,75000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
115,41,184,85,110000,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [19]:
# True labels for the same first five test rows, selected by position.
y_test.iloc[:5]

55     1
40     1
19     0
31     0
115    1
Name: Gender_male, dtype: uint8

In [20]:
# Predicted labels for the first five test rows, to compare by eye against
# the true labels.
model.predict(X_test.iloc[:5])

array([1, 1, 0, 0, 1], dtype=uint8)

In [21]:
# Class probabilities for the first ten test rows; columns follow the order
# of model.classes_ (class 0 first, then class 1).
model.predict_proba(X_test.iloc[:10])

array([[1.55151879e-010, 1.00000000e+000],
       [1.44473566e-136, 1.00000000e+000],
       [9.56626065e-001, 4.33739346e-002],
       [8.29301310e-001, 1.70698690e-001],
       [1.11521707e-015, 1.00000000e+000],
       [9.99012953e-001, 9.87047230e-004],
       [9.39873368e-016, 1.00000000e+000],
       [8.42678689e-006, 9.99991573e-001],
       [1.00000000e+000, 1.67067640e-011],
       [8.78187386e-009, 9.99999991e-001]])

In [22]:
# 5-fold cross-validated accuracy of a fresh GaussianNB on the training data,
# as a check that the single hold-out score is not a lucky split.
cross_val_score(
    estimator=GaussianNB(),
    X=X_train,
    y=y_train,
    cv=5,
)

array([1.        , 0.95238095, 1.        , 1.        , 1.        ])