Import Data Set

In [3]:
import pandas as pd
base_path = "/content/random_names_gender.csv"
ngd = pd.read_csv(base_path)

 Manually Feature Extraction

In [4]:
# Function to check if a name has an odd character count
def is_odd_length(name):
    return 1 if len(name) % 2 != 0 else 0

# Function to check if the second last letter is a vowel
def second_last_letter_is_vowel(name):
    if len(name) > 1:
        second_last_letter = name[-2].lower()
        return 1 if second_last_letter in 'aeiou' else 0
    return 0

# Apply the functions to create the new columns
ngd['alphabet_count'] = ngd['Name'].apply(is_odd_length)  # 1 if odd, 0 if even
ngd['second_last_letter_vowel'] = ngd['Name'].apply(second_last_letter_is_vowel)  # 1 if vowel, 0 otherwise

# Display the DataFrame with the new columns
print(ngd)

      Name  Gender  alphabet_count  second_last_letter_vowel
0   Marcus    Male               0                         1
1    Grace  Female               1                         0
2    Damon    Male               1                         1
3   Rachel  Female               0                         1
4    James    Male               1                         1
5     Faye  Female               0                         0
6    Clark    Male               1                         0
7     Dana  Female               0                         0
8    Aaron    Male               1                         1
9     Lynn  Female               0                         0
10   Ethan    Male               1                         1
11    Jill  Female               0                         0
12   Jonah    Male               1                         1
13    Emma  Female               0                         0
14  Elijah    Male               0                         1
15   Alice  Female      

Split the Data:

In [8]:
from sklearn.model_selection import train_test_split

# Features and target variable
X = ngd[['alphabet_count', 'second_last_letter_vowel']]
y = ngd['Gender']

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Applying Classification Algorithm

Logistic Regression:

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred_logreg = logreg.predict(X_test)

# Calculate accuracy and classification report
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print(classification_report(y_test, y_pred_logreg))


Logistic Regression Accuracy: 1.0
              precision    recall  f1-score   support

      Female       1.00      1.00      1.00         5
        Male       1.00      1.00      1.00         1

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



Automated Feature Extraction

Polynomial Feature Expansion:

In [10]:
from sklearn.preprocessing import PolynomialFeatures

# Apply PolynomialFeatures to increase the feature space
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)  # X is the feature set from manual extraction

print(f"Shape after Polynomial Expansion: {X_poly.shape}")

Shape after Polynomial Expansion: (30, 6)


In [11]:
X_poly

array([[1., 0., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 1., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0., 0.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0., 0.],
       [1., 1., 1., 1., 1., 1.],
       [1., 1., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 1.],
       [1., 1., 1., 1., 1., 1.]])

In [12]:
y = ngd['Gender']

# Split the data into train and test sets (80% train, 20% test)
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_poly_train, y_poly_train)

# Make predictions on the test set
y_pred_logreg = logreg.predict(X_poly_test)

# Calculate accuracy and classification report
accuracy_logreg = accuracy_score(y_poly_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print(classification_report(y_poly_test, y_pred_logreg))


Logistic Regression Accuracy: 1.0
              precision    recall  f1-score   support

      Female       1.00      1.00      1.00         5
        Male       1.00      1.00      1.00         1

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

