In [None]:
import pandas as pd

After annotating 100 men and 100 women by hand, we use those annotations as training data for gender prediction.

In [None]:
# Load the user table; only part of it carries a hand-made gender annotation.
gender_csv_path = "/content/drive/MyDrive/Data Science Project/Social Graph PPKM/master_data_user_gender_annotated.csv"
df_gender = pd.read_csv(gender_csv_path)
df_gender

Unnamed: 0,user_screen_name,user_name,gender
0,RustamIbrahim,Voter Education,m
1,Anjaswisesa_K,ANJAS WISESA,m
2,mhsgunadarma,IG : Mahasiswagunadarma,
3,KATADATAcoid,Katadata.co.id,
4,Beritasatu,BeritaSatu,
...,...,...,...
18788,dykookieluv,dy⁷ kookie 🍪💜,
18789,yntsmntr,unyun 🍒,
18790,onyurmrk,fan ◡̈ ia,
18791,KwanKhong,dimitri,


In [None]:
# Rows that already carry a gender annotation form the training set.
is_annotated = df_gender['gender'].notna()
df_gender_training = df_gender[is_annotated]
# Show how many annotated rows fall into each class.
df_gender_training['gender'].value_counts()

m    100
w    100
Name: gender, dtype: int64

In [None]:
# Rows without a gender annotation are the ones the model has to label.
is_unannotated = df_gender['gender'].isna()
df_gender_testing = df_gender[is_unannotated]
df_gender_testing

Unnamed: 0,user_screen_name,user_name,gender
2,mhsgunadarma,IG : Mahasiswagunadarma,
3,KATADATAcoid,Katadata.co.id,
4,Beritasatu,BeritaSatu,
5,zwirasakti,🐙Wira🐙,
12,tempodotco,TEMPO.CO,
...,...,...,...
18788,dykookieluv,dy⁷ kookie 🍪💜,
18789,yntsmntr,unyun 🍒,
18790,onyurmrk,fan ◡̈ ia,
18791,KwanKhong,dimitri,


## Preprocessing
Gender can often be inferred from a user's real name. Because real names are frequently noisy (digits, punctuation, emoji), we normalize them first; if an account's real name is empty after normalization, we normalize its username instead.

In [None]:
import re

In [None]:
def normalize(text):
  """Normalize a raw name into lowercase, single-space-separated tokens.

  Digits are removed, every character that is neither a word character nor
  whitespace (punctuation, emoji, symbols) is replaced by a space, the text
  is lowercased and stripped, and runs of whitespace collapse to one space.
  Underscores count as word characters and are therefore kept.

  Args:
    text: The raw name. Non-string values (for example None or a float NaN
      standing in for a missing cell) are accepted.

  Returns:
    The normalized string, or '' when `text` is not a string.
  """
  # Map non-string input to '' explicitly. The previous bare `except` gave the
  # same result but also hid every other error, including KeyboardInterrupt.
  if not isinstance(text, str):
    return ''
  # Raw strings keep the regex escapes from being read as (invalid) string escapes.
  result = re.sub(r"\d", "", text)
  result = re.sub(r"[^\w\s]", " ", result)  # Remove punctuation
  result = result.lower().strip()
  return re.sub(r"\s+", " ", result)

In [None]:
def preprocessing(data):
  """Clean one raw name value by passing it through `normalize`."""
  return normalize(data)

In [None]:
def name_or_username(name_username):
  """Produce one cleaned name string per row, preferring the display name.

  Args:
    name_username: DataFrame with 'user_name' and 'user_screen_name' columns.

  Returns:
    A list with one cleaned string per row, in row order. The cleaned
    'user_name' is used unless it is empty, in which case the cleaned
    'user_screen_name' is used instead.
  """
  # `or` falls through to the screen name exactly when the cleaned display
  # name is the empty string (preprocessing returns a str).
  return [
    preprocessing(row['user_name']) or preprocessing(row['user_screen_name'])
    for _, row in name_username.iterrows()
  ]

In [None]:
# Clean the names of both splits using the same two source columns.
name_columns = ['user_name', 'user_screen_name']
clean_labeled_train_normalize = name_or_username(df_gender_training[name_columns])
clean_labeled_test_normalize = name_or_username(df_gender_testing[name_columns])

## Char-gram
We use character n-grams (here, trigrams) to split each name into smaller chunks.

In [None]:
#Change data to char-gram
from sklearn.feature_extraction.text import CountVectorizer
# Count character trigrams; the vocabulary is learned from the training names only.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
)

# Fit on the training names, then reuse the learned vocabulary for the test names.
train_counts = vectorizer.fit_transform(clean_labeled_train_normalize)
test_counts = vectorizer.transform(clean_labeled_test_normalize)

# Convert both count matrices to dense arrays.
train_vector = train_counts.toarray()
test_vector = test_counts.toarray()

## Modelling

In [None]:
# Training targets: the annotated 'gender' value of each training row.
train_label = df_gender_training['gender']

In [None]:
from sklearn.linear_model import LogisticRegression
import sklearn.metrics

# Train a logistic-regression classifier on the annotated names.
# fit() returns the estimator itself, so construction and training can be chained.
classifier = LogisticRegression(C=0.3).fit(train_vector, train_label)

# Predict a gender label for every unannotated name.
predictions = classifier.predict(test_vector)

In [None]:
# Re-display the full user table (annotated and unannotated rows) for reference.
df_gender

Unnamed: 0,user_screen_name,user_name,gender
0,RustamIbrahim,Voter Education,m
1,Anjaswisesa_K,ANJAS WISESA,m
2,mhsgunadarma,IG : Mahasiswagunadarma,
3,KATADATAcoid,Katadata.co.id,
4,Beritasatu,BeritaSatu,
...,...,...,...
18788,dykookieluv,dy⁷ kookie 🍪💜,
18789,yntsmntr,unyun 🍒,
18790,onyurmrk,fan ◡̈ ia,
18791,KwanKhong,dimitri,


In [None]:
# Sanity check: number of predicted labels (predict() yields one per row of test_vector).
len(predictions)

18593

In [None]:
# Write the predicted labels to a local CSV with no header row; the default
# integer index of the new frame is written as the first column.
# NOTE(review): rows are identified by position only, not by user_screen_name.
pd.DataFrame(predictions).to_csv('gender_prediction.csv', header=False)
# Shell escape: copy the file to the Drive project folder (path is relative to the working directory).
!cp gender_prediction.csv "drive/My Drive/Data Science Project"

In [None]:
# Attach the predicted labels to the unannotated rows. df_gender_testing was
# obtained by filtering df_gender, so setting a column on it in place triggers
# pandas' SettingWithCopyWarning; assign() returns a new frame instead.
# predictions is positional: one label per row, in row order.
df_gender_testing = df_gender_testing.assign(gender=predictions)
df_gender_testing

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,user_screen_name,user_name,gender
2,mhsgunadarma,IG : Mahasiswagunadarma,m
3,KATADATAcoid,Katadata.co.id,m
4,Beritasatu,BeritaSatu,w
5,zwirasakti,🐙Wira🐙,m
12,tempodotco,TEMPO.CO,w
...,...,...,...
18788,dykookieluv,dy⁷ kookie 🍪💜,w
18789,yntsmntr,unyun 🍒,w
18790,onyurmrk,fan ◡̈ ia,m
18791,KwanKhong,dimitri,w


In [None]:
# Display the unannotated subset again to inspect its gender column.
df_gender_testing

Unnamed: 0,user_screen_name,user_name,gender
2,mhsgunadarma,IG : Mahasiswagunadarma,m
3,KATADATAcoid,Katadata.co.id,m
4,Beritasatu,BeritaSatu,w
5,zwirasakti,🐙Wira🐙,m
12,tempodotco,TEMPO.CO,w
...,...,...,...
18788,dykookieluv,dy⁷ kookie 🍪💜,w
18789,yntsmntr,unyun 🍒,w
18790,onyurmrk,fan ◡̈ ia,m
18791,KwanKhong,dimitri,w


In [None]:
# Stack the annotated rows and the rows of df_gender_testing into one table;
# ignore_index=True renumbers the combined rows from 0.
user_gender_all = pd.concat([df_gender_training, df_gender_testing], ignore_index=True)
# NOTE(review): to_csv writes the index by default, so the file gains an extra
# unnamed leading column; confirm downstream readers expect that.
user_gender_all.to_csv('user_with_gender.csv')
# Shell escape: copy the file to the Drive project folder (path is relative to the working directory).
!cp user_with_gender.csv "drive/My Drive/Data Science Project"