In [34]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

dataset\gender-classifier-DFE-791531.csv


In [35]:
df = pd.read_csv(r"dataset/gender-classifier-DFE-791531.csv", encoding = "latin1")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20050 entries, 0 to 20049
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _unit_id               20050 non-null  int64  
 1   _golden                20050 non-null  bool   
 2   _unit_state            20050 non-null  object 
 3   _trusted_judgments     20050 non-null  int64  
 4   _last_judgment_at      20000 non-null  object 
 5   gender                 19953 non-null  object 
 6   gender:confidence      20024 non-null  float64
 7   profile_yn             20050 non-null  object 
 8   profile_yn:confidence  20050 non-null  float64
 9   created                20050 non-null  object 
 10  description            16306 non-null  object 
 11  fav_number             20050 non-null  int64  
 12  gender_gold            50 non-null     object 
 13  link_color             20050 non-null  object 
 14  name                   20050 non-null  object 
 15  pr

In [36]:
df = pd.concat([df.gender, df.description], axis = 1)
df.gender = [1 if each == "female" else 0 for each in df.gender]
df.head()

Unnamed: 0,gender,description
0,0,i sing my own rhythm.
1,0,I'm the author of novels filled with family dr...
2,0,louis whining and squealing and all
3,0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe..."
4,1,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...


In [37]:
#For cleaning DATA
import re #regular expression
df.dropna(axis = 0, inplace = True)

first_description = df.description[4]
description = re.sub("[^a-zA-Z]", " ", first_description) # replacing non-alphabet characters with space character.
description = description.lower() # making all letters lower.
description

'ricky wilson the best frontman kaiser chiefs the best band xxxx thank you kaiser chiefs for an incredible year of gigs and memories to cherish always    xxxxxxx'

In [38]:
import nltk # natural language tool kit
nltk.download("stopwords") # downloading the stopwords into corpus file.
nltk.download('punkt')
from nltk.corpus import stopwords # importing stopwords from corpus file.

# splitting with tokenizer
description = nltk.word_tokenize(description) # we could have use description.split() but it doesn't split words like this: "shouldn't = should not"
description = [word for word in description if not word in set(stopwords.words("english"))] # I am going to show another alternative way instead of this way.
description

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['ricky',
 'wilson',
 'best',
 'frontman',
 'kaiser',
 'chiefs',
 'best',
 'band',
 'xxxx',
 'thank',
 'kaiser',
 'chiefs',
 'incredible',
 'year',
 'gigs',
 'memories',
 'cherish',
 'always',
 'xxxxxxx']

In [41]:
#Lemmatization
import nltk as nlp
nltk.download('wordnet')
nltk.download('omw-1.4')
lemma = nlp.WordNetLemmatizer()
description = [lemma.lemmatize(word) for word in description]
description = " ".join(description)
description

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


'ricky wilson best frontman kaiser chief best band xxxx thank kaiser chief incredible year gig memory cherish always xxxxxxx'

In [42]:
description_list = []

for description in df.description:
    description = re.sub("[^a-zA-Z]", " ", description)
    description = description.lower()
    description = nltk.word_tokenize(description)
    lemma = nlp.WordNetLemmatizer()
    description = [lemma.lemmatize(word) for word in description]
    description = " ".join(description)
    description_list.append(description)

In [44]:
from sklearn.feature_extraction.text import CountVectorizer # to create bag of words.
max_features = 5000 # using just 5000 words to make the process faster.

count_vectorizer = CountVectorizer(max_features = max_features, stop_words = "english") # preparing 5000 words to create sparse_matrix

sparse_matrix = count_vectorizer.fit_transform(description_list).toarray() # create sparse matrix. There is an example for sparse matrix in the picture above.
#print("{} common used words: {}".format(max_features, count_vectorizer.get_feature_names()))

In [45]:
y = df.iloc[:,0].values # male or female classes
x = sparse_matrix

# train test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.1, random_state = 42)

# naive bayes classification method
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

#prediction
print("Accuracy:", nb.score(x_test, y_test))

Accuracy: 0.5419987737584304
