<a href="https://colab.research.google.com/github/sanskarmalviya7/ARANYA-THE-DIGITAL-FORESTRY-SVVV/blob/main/acuracy_of_gender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

# Load the names dataset; later cells read its 'name' and 'sex' columns.
df = pd.read_csv('/content/universalnames_dataset.csv')
# Encode the label column as integers (F -> 0, M -> 1) and assign the result
# back to the frame. Calling an inplace method on `df.sex` operates on a
# temporary Series: pandas 2.x warns about it and, with Copy-on-Write enabled,
# `df` is left unchanged.
df['sex'] = df['sex'].map({'F': 0, 'M': 1})

# Feature extraction function
def extract_features(name):
    """Build the prefix/suffix feature dict used to classify a name.

    Parameters
    ----------
    name : object
        The name to featurize. Only ``str`` values produce features.

    Returns
    -------
    dict
        Keys ``'first-letter'``, ``'first2-letters'``, ``'first3-letters'``,
        ``'last-letter'``, ``'last2-letters'`` and ``'last3-letters'``, each
        holding the matching slice of the stripped, lower-cased name. An empty
        dict is returned for non-string input (e.g. NaN) and for empty or
        whitespace-only strings.
    """
    if not isinstance(name, str):
        return {}  # NaN / missing values carry no features

    # Surrounding whitespace is not part of the name and would otherwise end up
    # in the first/last-letter features.
    name = name.strip().lower()
    if not name:
        # Indexing name[0] on an empty string would raise IndexError.
        return {}

    return {
        'first-letter': name[0],
        'first2-letters': name[:2],
        'first3-letters': name[:3],
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }


# Build one feature dict per dataset row (rows whose name is not a string get {}).
df['features'] = df['name'].apply(extract_features)

# Model inputs are the feature dicts; targets are the encoded 'sex' column.
X = df['features'].tolist()
y = df['sex']

# Hold out a third of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# One-hot encode the string-valued features. The vocabulary is learned from the
# training split only; feature values seen only at transform time are ignored.
dv = DictVectorizer()
X_train_vectorized = dv.fit_transform(X_train)
X_test_vectorized = dv.transform(X_test)

# Seed the tree as well: scikit-learn permutes candidate features at every split,
# so an unseeded tree can differ between runs and the reported accuracies would
# not be reproducible.
dclf = DecisionTreeClassifier(random_state=42)
dclf.fit(X_train_vectorized, y_train)

# Function to predict gender for a list of names
def classify_gender(names):
    """Predict the encoded gender for each name with the trained decision tree.

    Parameters
    ----------
    names : str or iterable
        Names to classify. A single string is treated as one name rather than
        being iterated character by character.

    Returns
    -------
    array-like
        The output of ``dclf.predict``: one label per input name, in input
        order. The dataset-loading cell encodes 'F' as 0 and 'M' as 1.
    """
    if isinstance(names, str):
        # Iterating a bare string would classify each character separately.
        names = [names]
    features = [extract_features(name) for name in names]
    # Reuse the vectorizer fitted on the training split so columns line up
    # with what the tree was trained on.
    return dclf.predict(dv.transform(features))

# Sanity-check the classifier on two sample names
names_to_predict = ["saroj", "banti"]
gender_predictions = classify_gender(names_to_predict)
for name, prediction in zip(names_to_predict, gender_predictions):
    # Label 0 is reported as Female; anything else is reported as Male.
    label = 'Male' if prediction != 0 else 'Female'
    print(f"{name}: {label}")

# Score the tree on the split it was fitted on and on the held-out split
train_accuracy = dclf.score(X_train_vectorized, y_train)
test_accuracy = dclf.score(X_test_vectorized, y_test)
print("Accuracy on training set:", train_accuracy)
print("Accuracy on test set:", test_accuracy)
# Show which columns the dataset frame now carries
print(df.columns)

saroj: Female
banti: Male
Accuracy on training set: 0.9525775078324716
Accuracy on test set: 0.8926351938029873
Index(['name', 'sex', 'features'], dtype='object')


In [None]:
# @title Print the predicted gender for a single name
from sklearn.metrics import confusion_matrix
def predict_gender_with_confusion_matrix(name):
    """Print the predicted gender ('Female' or 'Male') for a single name.

    Despite its name, this version produces no confusion matrix; it only
    prints the label and returns ``None``.

    Parameters
    ----------
    name : object
        The name to classify; it is passed to ``classify_gender`` as a
        one-element list.
    """
    # Only the single-name prediction is needed here. Re-scoring the whole
    # dataset on every call would make a loop over the dataset quadratic.
    gender_prediction = classify_gender([name])[0]

    # Label 0 is reported as Female; anything else is reported as Male.
    predicted_gender = 'Female' if gender_prediction == 0 else 'Male'
    print(predicted_gender)


In [None]:
# Run the single-name predictor over every name in the loaded dataset.
dataset_names = df['name']
for name in dataset_names:
    predict_gender_with_confusion_matrix(name)


In [None]:
# @title Predict the gender for a name and return the dataset-wide accuracy
def predict_gender_with_confusion_matrix(name):
    """Predict the gender for one name and report the model's dataset-wide accuracy.

    Parameters
    ----------
    name : object
        The name to classify; it is passed to ``classify_gender`` as a
        one-element list.

    Returns
    -------
    tuple
        ``(predicted_gender, accuracy)``. ``predicted_gender`` is ``'Female'``
        when the model predicts 0 and ``'Male'`` otherwise. ``accuracy`` is the
        fraction of rows in ``df`` whose ``df['sex']`` equals the model's
        prediction, taken from the confusion matrix. It covers every row of
        ``df`` and does not depend on ``name``.

    Notes
    -----
    The whole dataset is re-featurized and re-predicted on every call, so
    calling this once per row of another table is expensive.
    """
    gender_prediction = classify_gender([name])[0]
    predicted_gender = 'Female' if gender_prediction == 0 else 'Male'

    # Score the model on every name in the dataset. The loop variable has its
    # own name so it cannot be confused with the `name` parameter.
    dataset_features = [extract_features(dataset_name) for dataset_name in df['name']]
    y_true = df['sex']
    y_pred = dclf.predict(dv.transform(dataset_features))
    cm = confusion_matrix(y_true, y_pred)

    # Correct predictions sit on the diagonal. Using the trace instead of
    # cm[0, 0] + cm[1, 1] also works when only one class is present (1x1 matrix).
    accuracy = cm.trace() / cm.sum()

    return predicted_gender, accuracy


In [None]:
# Column names in the farmer CSV: recorded gender and full name.
Gendercl = 'Farmer Gender'
Namecl ='Farmer Name'
def classify_gender_from_csv(file_path, name_column):
    """Predict gender from the first name of each CSV row and flag mismatches.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    name_column : str
        Column holding the names; rows where it is missing are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``name_column``, ``Gendercl``, ``'predicted_gender'`` and
        ``'prediction_correctness'``, with the rows marked ``'Wrong'`` first and
        all remaining rows after them, each group in file order.
    """
    data = pd.read_csv(file_path)
    data.dropna(subset=[name_column], inplace=True)

    output_columns = [name_column, Gendercl, 'predicted_gender', 'prediction_correctness']
    if data.empty:
        # Nothing to classify: return an empty frame with the expected columns.
        return data.reindex(columns=output_columns)

    # The first whitespace-separated token is used as the first name.
    data['First Name'] = data[name_column].str.split().str[0]

    # Classify all first names in one batch. Going through the per-name helper
    # would re-score the whole training dataset once per row only to produce an
    # accuracy value that is not part of the returned columns.
    predictions = classify_gender(data['First Name'].tolist())
    data['predicted_gender'] = ['Female' if pred == 0 else 'Male' for pred in predictions]

    # The recorded gender only has to start with 'Male' / 'Female'. na=False
    # keeps the masks strictly boolean when the gender cell is empty.
    actual_male = data[Gendercl].str.startswith('Male', na=False)
    actual_female = data[Gendercl].str.startswith('Female', na=False)
    predicted_male = data['predicted_gender'] == 'Male'

    # A row is wrong when the recorded and predicted genders disagree.
    # NOTE(review): rows whose recorded gender starts with neither prefix stay
    # 'Correct', as before - confirm that this is intended.
    is_wrong = (actual_male & ~predicted_male) | (actual_female & predicted_male)
    data['prediction_correctness'] = is_wrong.map({True: 'Wrong', False: 'Correct'})

    # Wrong predictions first, everything else after.
    sorted_data = pd.concat([data[is_wrong], data[~is_wrong]])

    return sorted_data[output_columns]

# Run the classifier against the farmer CSV (point csv_file_path at your own file)
csv_file_path = '/content/FPONext.csv'
# The second argument names the column that holds the names
predicted_data = classify_gender_from_csv(csv_file_path, Namecl)

# Render the returned columns as a psql-style text table and print it
table_text = tabulate(predicted_data, headers='keys', tablefmt='psql')
print(table_text)


+------+------------------------------------------+-----------------+--------------------+--------------------------+
|      | Farmer Name                              | Farmer Gender   | predicted_gender   | prediction_correctness   |
|------+------------------------------------------+-----------------+--------------------+--------------------------|
|    1 | Sanju                                    | Male (पुरुष)      | Female             | Wrong                    |
|    2 | Sourabh joshi                            | Female (महिला)    | Male               | Wrong                    |
|   30 | Aa                                       | Male (पुरुष)      | Female             | Wrong                    |
|   47 | Surendra Kumar Pal                       | Female (महिला)    | Male               | Wrong                    |
|   85 | Hasim bee                                | Female (महिला)    | Male               | Wrong                    |
|   97 | Attu Yadav                           

In [None]:
def classify_gender_from_csv(file_path, name_column):
    """Predict gender from the full name in each CSV row and flag mismatches.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    name_column : str
        Column holding the names; rows where it is missing are dropped. The
        whole cell value is passed to the classifier.

    Returns
    -------
    pandas.DataFrame
        Columns ``name_column``, ``Gendercl``, ``'predicted_gender'`` and
        ``'prediction_correctness'``, with the rows marked ``'Wrong'`` first and
        all remaining rows after them, each group in file order.
    """
    data = pd.read_csv(file_path)
    data.dropna(subset=[name_column], inplace=True)

    output_columns = [name_column, Gendercl, 'predicted_gender', 'prediction_correctness']
    if data.empty:
        # Nothing to classify: return an empty frame with the expected columns.
        return data.reindex(columns=output_columns)

    # Classify every name in a single batch.
    gender_predictions = classify_gender(data[name_column].tolist())
    data['predicted_gender'] = ['Female' if pred == 0 else 'Male' for pred in gender_predictions]

    # Use the shared Gendercl constant rather than a hard-coded column name so
    # the masks and the returned columns always refer to the same column.
    # na=False keeps the masks strictly boolean when the gender cell is empty.
    actual_male = data[Gendercl].str.startswith('Male', na=False)
    actual_female = data[Gendercl].str.startswith('Female', na=False)
    predicted_male = data['predicted_gender'] == 'Male'

    # A row is wrong when the recorded and predicted genders disagree.
    is_wrong = (actual_male & ~predicted_male) | (actual_female & predicted_male)
    data['prediction_correctness'] = is_wrong.map({True: 'Wrong', False: 'Correct'})

    # Rows whose recorded gender is exactly equal to the predicted label are
    # marked 'Same' instead of 'Correct'.
    # NOTE(review): kept as-is - confirm whether a separate 'Same' label is wanted.
    data.loc[data[Gendercl] == data['predicted_gender'], 'prediction_correctness'] = 'Same'

    # Wrong predictions first, everything else after.
    sorted_data = pd.concat([data[is_wrong], data[~is_wrong]])

    return sorted_data[output_columns]

# Run the classifier against the farmer CSV (point csv_file_path at your own file)
csv_file_path = '/content/FPONext.csv'
# The second argument names the column that holds the names
predicted_data = classify_gender_from_csv(csv_file_path, 'Farmer Name')

# Render the returned columns as a psql-style text table and print it
table_text = tabulate(predicted_data, headers='keys', tablefmt='psql')
print(table_text)

+------+------------------------------------------+-----------------+--------------------+--------------------------+
|      | Farmer Name                              | Farmer Gender   | predicted_gender   | prediction_correctness   |
|------+------------------------------------------+-----------------+--------------------+--------------------------|
|    1 | Sanju                                    | Male (पुरुष)      | Female             | Wrong                    |
|    7 | Pritam Singh                             | Male (पुरुष)      | Female             | Wrong                    |
|   13 | Valmik ramesh namaste                    | Male (पुरुष)      | Female             | Wrong                    |
|   15 | JAYPAL NAGWANSHI                         | Male (पुरुष)      | Female             | Wrong                    |
|   20 | Sant chandrawanshi                       | Male (पुरुष)      | Female             | Wrong                    |
|   23 | Momin mansuri                        