# Preprocessing Data
1. Import data from three different datasets
2. Concatenate the data
3. Remove duplicate rows and rows containing nulls from the dataset

In [None]:
import polars as pl

In [None]:
# Read the three source CSV files, in order, into separate polars DataFrames.
dataset1, dataset2, dataset3 = (
    pl.read_csv(csv_path)
    for csv_path in (
        "./datasets/names.csv",
        "./datasets/name_gender.csv",
        "./datasets/NationalNames.csv",
    )
)

In [None]:
# Stack the three frames with a "diagonal" concat, then keep only the two
# columns the model needs.
frames = [dataset1, dataset2, dataset3]
dataset = pl.concat(frames, how="diagonal").select(["Name", "Gender"])

# Row count before any cleaning, reported for reference.
raw_dataset_size, _ = dataset.shape
print(f"The raw dataset contains {raw_dataset_size} names")

In [None]:
# Clean the combined frame: discard rows with a missing Name or Gender, then
# collapse exact duplicate (Name, Gender) rows.
#
# maintain_order=True keeps the surviving rows in their original order. Without
# it the row order after unique() is unspecified, which would make the seeded
# train/test split further down produce a different partition on each run.
dataset = dataset.drop_nulls().unique(maintain_order=True)

# NOTE: the count below is of distinct (Name, Gender) rows, so a name that
# occurs with more than one gender is counted once per gender.
print(f"There are {dataset.shape[0]} unique names")

# Using LSTM

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

In [None]:
# Turn the cleaned frame into model inputs: character-id sequences for the
# names and one-hot vectors for the gender labels.
names = list(dataset["Name"])
labels = list(dataset["Gender"])

# Character-level tokenizer: every distinct character gets its own integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(names)
sequence_of_int = tokenizer.texts_to_sequences(names)

# Force every name to exactly 15 ids, padding at the end of shorter names.
# NOTE(review): truncation of names longer than 15 characters uses the
# pad_sequences default side - confirm that is the intended end to cut.
padsequences = pad_sequences(sequence_of_int, maxlen=15, padding='post')

# Map gender strings to integer class ids, then to one-hot rows.
# NOTE(review): `le` is not persisted here; inference code will need the same
# class ordering to decode predictions - confirm how that is handled.
le = LabelEncoder()
labels = le.fit_transform(labels)
labels = to_categorical(labels)

# Persist the fitted tokenizer so the same character ids can be reused later.
# The context manager guarantees the file is flushed and closed; plain 'wb'
# is enough because the file is only written, never read back here.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Hold out 10% of the samples for evaluation; the fixed seed pins the shuffle.
feature_train, feature_test, label_train, label_test = train_test_split(
    padsequences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Build the character-level LSTM classifier.
#
# The layer sizes that depend on the data are derived from the data instead of
# being hard-coded:
#   * vocab_size      - number of character ids the tokenizer can emit, plus
#                       one for id 0 used as padding. A fixed 27 breaks as soon
#                       as any name contains a character beyond 26 letters
#                       (hyphen, apostrophe, space, accented letters, ...).
#   * sequence_length - width of the padded input matrix.
#   * num_classes     - width of the one-hot label matrix.
vocab_size = len(tokenizer.word_index) + 1
sequence_length = padsequences.shape[1]
num_classes = label_train.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, 64, input_length=sequence_length))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(2048, return_sequences=True))
model.add(LSTM(256, return_sequences=False))
model.add(Dropout(0.2))
# The labels are one-hot and mutually exclusive, so the output is a softmax
# distribution trained with categorical cross-entropy. This also makes the
# 'accuracy' metric a per-sample (argmax) accuracy rather than a per-element one.
model.add(Dense(num_classes, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for a single epoch in batches of 500, reporting metrics on the
# held-out split after the epoch, then write the trained model to disk.
model.fit(
    feature_train,
    label_train,
    batch_size=500,
    epochs=1,
    validation_data=(feature_test, label_test),
)
model.save('model.h5')