## Library use

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

## Load dataset

In [2]:
# Load the raw name/gender table; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='nama.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,nama,jenis_kelamin
0,ERWIN TJAHJONO,Laki-Laki
1,DAVIANDRIE ANDIKA BAHROENY,Laki-Laki
2,ELAN KURNIAWAN,Laki-Laki
3,AYU DWI CAHYANING MUKTI,Perempuan
4,WAHYOEDIN,Laki-Laki


In [4]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(13137, 2)

In [5]:
# Preview the last five rows to confirm the file was read to the end.
df.tail(n=5)

Unnamed: 0,nama,jenis_kelamin
13132,HERMANSYAH,Laki-Laki
13133,SITA.HJ,Perempuan
13134,MASNI TAMBUNAN,Perempuan
13135,MARJANEDI,Laki-Laki
13136,NGALIMAN,Laki-Laki


## Checking dataset

In [6]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

True

In [7]:
# Count the rows that contain at least one missing value.
rows_with_nulls = df[df.isna().any(axis=1)]
len(rows_with_nulls)

187

In [8]:
# Drop every row that is missing a name or a label.
# how='all' would only remove rows where *both* columns are empty; a row with just one
# missing field would survive and later break the stratified split and the vectorizer.
df = df.dropna(how='any')

# Sanity check: number of rows that still contain a missing value (expected to be 0).
len(df[df.isnull().any(axis=1)])

0

In [9]:
# (rows, columns) after the rows with missing values were dropped.
df.shape

(12950, 2)

## Encoding

In [10]:
# Numeric class codes for the gender labels: 1 = "Laki-Laki", 0 = "Perempuan".
gen_enc = {"Laki-Laki": 1, "Perempuan": 0}

# Series.map(dict) turns every value that is not a dictionary key into NaN, so running
# this cell a second time would wipe the already-encoded 1/0 column. Falling back to the
# current value keeps the cell idempotent while producing the same result on the first run.
df["jenis_kelamin"] = df["jenis_kelamin"].map(lambda label: gen_enc.get(label, label))

In [11]:
# Show the first five rows again to confirm the label column now holds numeric codes.
df.iloc[:5]

Unnamed: 0,nama,jenis_kelamin
0,ERWIN TJAHJONO,1
1,DAVIANDRIE ANDIKA BAHROENY,1
2,ELAN KURNIAWAN,1
3,AYU DWI CAHYANING MUKTI,0
4,WAHYOEDIN,1


## Check class imbalance and split the data

In [12]:
# Class balance of the full dataset (1 = male, 0 = female).
gender_codes = df['jenis_kelamin']
num_df = len(df)
num_male = len(df.loc[gender_codes == 1])
num_female = len(df.loc[gender_codes == 0])

print(f"Number of males:  {num_male} ({num_male / num_df * 100:2.2f}%)")
print(f"Number of females: {num_female} ({num_female / num_df * 100:2.2f}%)")

Number of males:  6162 (47.58%)
Number of females: 6788 (52.42%)


In [13]:
feature_col = ["nama"]
predicted_col = ["jenis_kelamin"]
split_test_size = 0.30

# Selecting with a list of column names yields 2-D arrays with a single column,
# which is why later cells call .ravel() before handing them to scikit-learn.
X = df[feature_col].to_numpy()
y = df[predicted_col].to_numpy()

# Stratify on the label so both partitions keep the male/female ratio; the fixed seed
# makes the split repeatable.
text_train, text_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_test_size,
    stratify=y,
    random_state=42,
)

In [14]:
# Report the male/female share of the full dataset and of each partition to verify
# that the stratified split preserved the class ratio.
label_column = df['jenis_kelamin']
report_groups = [
    [
        ("Original Dataset Male       : ", len(df.loc[label_column == 1]), len(df.index)),
        ("Original Dataset Female     : ", len(df.loc[label_column == 0]), len(df.index)),
    ],
    [
        ("Training Dataset Male   : ", len(y_train[y_train == 1]), len(y_train)),
        ("Training Dataset Female : ", len(y_train[y_train == 0]), len(y_train)),
    ],
    [
        ("Test Dataset Male       : ", len(y_test[y_test == 1]), len(y_test)),
        ("Test Dataset Female     : ", len(y_test[y_test == 0]), len(y_test)),
    ],
]

for group_index, group in enumerate(report_groups):
    if group_index > 0:
        # Blank line between the dataset / training / test groups.
        print("")
    for caption, count, total in group:
        print(f"{caption}{count} ({count / total * 100.0:0.2f}%)")

Original Dataset Male       : 6162 (47.58%)
Original Dataset Female     : 6788 (52.42%)

Training Dataset Male   : 4313 (47.58%)
Training Dataset Female : 4752 (52.42%)

Test Dataset Male       : 1849 (47.59%)
Test Dataset Female     : 2036 (52.41%)


## Feature extraction

In [15]:
# Character n-grams (2 to 6 characters, bounded by word edges) turn each name into a
# sparse count vector. The vocabulary is learned from the training names only.
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))

train_names = text_train.ravel()
test_names = text_test.ravel()

X_train = vectorizer.fit_transform(train_names)
X_test = vectorizer.transform(test_names)

## Logistic Regression

In [16]:
# Fit a logistic regression with default settings on the n-gram counts.
# The labels are flattened to 1-D before fitting.
train_labels = y_train.ravel()

lg = LogisticRegression()
lg.fit(X_train, train_labels)

In [17]:
# Mean accuracy on the training partition, then on the held-out test partition.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(lg.score(features, labels))

0.9965802537231109
0.9366795366795366


In [18]:
# Evaluate the logistic regression on the test partition.
lg_predict = lg.predict(X_test)

# Report the positive class (1 = male) first in the matrix and in the report.
class_order = [1, 0]
test_accuracy = metrics.accuracy_score(y_test, lg_predict)

print(f"Accuracy: {test_accuracy:.4f}")
print(metrics.confusion_matrix(y_test, lg_predict, labels=class_order))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lg_predict, labels=class_order))

Accuracy: 0.9367
[[1727  122]
 [ 124 1912]]

Classification Report
              precision    recall  f1-score   support

           1       0.93      0.93      0.93      1849
           0       0.94      0.94      0.94      2036

    accuracy                           0.94      3885
   macro avg       0.94      0.94      0.94      3885
weighted avg       0.94      0.94      0.94      3885



In [19]:
# Map the numeric class codes back to the label strings used in the dataset.
gen_label = {1: "Laki-Laki", 0: "Perempuan"}

# transform() expects an iterable of documents, so the single name is wrapped in a list.
test_predict = vectorizer.transform(["Teguh Kurniadi"])
res = lg.predict(test_predict)

# predict() returns a one-element array. Index it explicitly instead of calling int(res):
# converting an array with ndim > 0 to a Python scalar is deprecated since NumPy 1.25.
print(gen_label[res[0]])

Laki-Laki


## Logistic Regression with a pipeline

In [20]:
# Same vectorizer + logistic regression as above, bundled into one estimator so that
# raw name strings can be passed straight to fit/predict.
lg_steps = [
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))),
    ('clf', LogisticRegression()),
]
pipe_lg = Pipeline(steps=lg_steps).fit(text_train.ravel(), y_train.ravel())

predicted = pipe_lg.predict(text_test.ravel())

# Fraction of test names classified correctly.
(predicted == y_test.ravel()).mean()

0.9366795366795366

In [21]:
# Classify one unseen name with the logistic-regression pipeline.
sample_names = ["Slamet Widodo"]
result = pipe_lg.predict(sample_names)

predicted_code = result[0]
print(gen_label[predicted_code])

Laki-Laki


## Naive Bayes 

In [22]:
# Character n-gram counts feeding a multinomial Naive Bayes classifier.
nb_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))
nb_classifier = MultinomialNB()
pipe_nb = Pipeline([('vect', nb_vectorizer), ('clf', nb_classifier)])

pipe_nb = pipe_nb.fit(text_train.ravel(), y_train.ravel())
predicted = pipe_nb.predict(text_test.ravel())

# Fraction of test names classified correctly.
expected = y_test.ravel()
np.mean(expected == predicted)

0.9330759330759331

In [23]:
# Classify one unseen name with the Naive Bayes pipeline.
sample_names = ["Alifah Rahmah"]
result = pipe_nb.predict(sample_names)

predicted_code = result[0]
print(gen_label[predicted_code])

Perempuan


## Random Forest

In [24]:
# Character n-gram counts feeding a random forest of 90 trees (trained on all CPU cores).
pipe_rf = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 6))),
    # A random forest is stochastic (bootstrap samples, random feature subsets). Without a
    # fixed random_state the accuracy below changes on every run, even though the
    # train/test split itself is seeded with 42.
    ('clf', RandomForestClassifier(n_estimators=90, n_jobs=-1, random_state=42)),
])

pipe_rf = pipe_rf.fit(text_train.ravel(), y_train.ravel())
predicted = pipe_rf.predict(text_test.ravel())

# Fraction of test names classified correctly.
np.mean(predicted == y_test.ravel())

0.9317889317889317

In [25]:
# Classify one unseen name with the random-forest pipeline.
sample_names = ["Yuni ahmad"]
result = pipe_rf.predict(sample_names)

predicted_code = result[0]
print(gen_label[predicted_code])

Laki-Laki
