# Import Dataset

In [1]:
import pandas as pd

# Read the raw names CSV from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='datasets/MalaysianNames.csv')

In [2]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

  gender country_code                    name
0      M           MY  Wan Hasszry Wan Hassan
1      M           MY              Abu Sufian
2      M           MY            Benjamin Foo
3      F           MY             Nor Hazlina
4      M           MY                 Mat Kat


# Data Preprocessing

In [3]:
# Drop every row that contains at least one missing value, then print the
# per-column null counts to confirm none remain.
df = df.dropna()
print(df.isnull().sum())

# 'country_code' is not referenced by the later cells shown in this notebook.
# errors='ignore' keeps this cell re-runnable: once the column is already gone
# (e.g. after df has been reloaded from the cleaned CSV) a second run no longer
# raises KeyError.
df = df.drop(columns=['country_code'], errors='ignore')

# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv('datasets/MalaysianNames_clean.csv', index=False)

gender          0
country_code    0
name            0
dtype: int64


In [4]:
# Reload the cleaned data from disk and show its first five rows as text.
df = pd.read_csv(filepath_or_buffer='datasets/MalaysianNames_clean.csv')
print(df.head(n=5))

  gender                    name
0      M  Wan Hasszry Wan Hassan
1      M              Abu Sufian
2      M            Benjamin Foo
3      F             Nor Hazlina
4      M                 Mat Kat


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import warnings

# No category or message filter is given, so every warning raised from here on
# is suppressed.
warnings.filterwarnings(action="ignore")

# Replace the gender column with the integer codes produced by the encoder;
# the fitted encoder is kept so the codes can be mapped back to labels.
label_encoder = LabelEncoder()
df["gender"] = label_encoder.fit_transform(df["gender"])

# Turn each full name into counts of its character 2-grams and 3-grams.
name_vectorizer = CountVectorizer(ngram_range=(2, 3), analyzer='char')
name_features = name_vectorizer.fit_transform(df['name'])


# Step 4: Split the Data
# Split the data into training and testing sets


In [6]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# One seed shared by the split and the classifier so that a fresh
# Restart & Run All reproduces the same accuracy figure.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    name_features, df["gender"], test_size=0.2, random_state=RANDOM_STATE
)

# Step 5: Choose a Machine Learning Algorithm
# Initialize the model (Decision Tree Classifier).
# random_state is set explicitly: the tree permutes candidate features at each
# split, so without a seed tied splits can resolve differently between runs.
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Step 6: Model Training
# Train the model on the training portion only
model.fit(X_train, y_train)

# Step 7: Make Predictions and Evaluate the Model
# Make predictions on the held-out test data
y_pred = model.predict(X_test)

# Evaluate the model's accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")



               Date  Order # N. Revenue (formatted)     Status  \
0  14/09/2023 20:37    15072              RM117.80   completed   
1  14/09/2023 19:38    15069               RM32.90   completed   
2  14/09/2023 19:09    15068               RM57.90   completed   
3  14/09/2023 17:25    15066               RM32.90   completed   
4  14/09/2023 17:17    15065              RM167.90   completed   

                Customer Customer type  \
0        narimah mokhtar     returning   
1  sudiana mohamad rasib           new   
2          Haizum Hasnan           new   
3        Hanifizah Wahid     returning   
4     Zuhaidah Abd razak           new   

                                          Product(s)  Items sold  Coupon(s)  \
0  1× COLLAGEN SERUM FOUNDATION ALHA ALFA - LIGHT...           2        NaN   
1                        1× LIP BOOSTER - STRAWBERRY           1        NaN   
2     1× COLLAGEN SERUM FOUNDATION ALHA ALFA - LIGHT           1        NaN   
3                        1× LIP BO