In [6]:
!pip install openpyxl



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Shared pipeline state: model() leaves the fitted text vectorizer, numeric scaler
# and classifier in these names, and new_data() reads them back for prediction.
vectorizer, scaler = TfidfVectorizer(), StandardScaler()
knn = None  # None means "no classifier yet"; new_data() checks for this before predicting

def model(file_path="Sample_data.xlsx", test_size=0.2, random_state=42, n_neighbors=1):
    """Train the name-correction KNN classifier and print its hold-out accuracy.

    Reads the workbook at ``file_path`` and builds one feature row per record
    from the TF-IDF vector of ``Full_name`` plus the standardised
    ``matricule_ocr_value``. A ``KNeighborsClassifier`` is fitted to predict
    ``Corrected_Name``, and the accuracy on the held-out split is printed.

    The fitted vectorizer, scaler and classifier are stored in the module-level
    ``vectorizer``, ``scaler`` and ``knn`` names only after every step has
    succeeded, so a failed run never leaves a half-updated pipeline behind.

    Args:
        file_path: Excel workbook to read.
        test_size: Share (or count) of rows held out for scoring; forwarded to
            ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.
        n_neighbors: Number of neighbours used by the KNN classifier.

    Returns:
        None. The accuracy is printed.

    Raises:
        KeyError: If a required column is missing from the workbook.
        ValueError: If no usable rows remain after cleaning.
    """
    global vectorizer, scaler, knn

    # Local import: `clone` is not among the file-level imports.
    from sklearn.base import clone

    required_columns = ["Full_name", "matricule_ocr_value", "Corrected_Name"]

    # Read the data
    df = pd.read_excel(file_path)
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Missing required column(s) in {file_path!r}: {missing_columns}")

    # Ensure the numeric column is clean; unparseable values become NaN
    df["matricule_ocr_value"] = pd.to_numeric(df["matricule_ocr_value"], errors='coerce')

    # Drop rows that lack any of the three values: a row without a name, a
    # numeric matricule or a label cannot be vectorized, scaled or trained on.
    df = df.dropna(subset=required_columns)
    if df.empty:
        raise ValueError(f"No usable rows left in {file_path!r} after cleaning.")

    # Extract features and target
    X_text = df["Full_name"].astype(str)  # TfidfVectorizer only accepts strings
    X_numeric = df["matricule_ocr_value"].values.reshape(-1, 1)
    y = df["Corrected_Name"]

    # Split the data into training and testing sets
    X_text_train, X_text_test, X_numeric_train, X_numeric_test, y_train, y_test = train_test_split(
        X_text, X_numeric, y, test_size=test_size, random_state=random_state
    )

    # Fit unfitted copies that keep the configuration of the module-level
    # objects; the globals stay untouched until training has fully succeeded.
    fitted_vectorizer = clone(vectorizer)
    fitted_scaler = clone(scaler)

    # Text feature vectorization
    X_text_train_vectors = fitted_vectorizer.fit_transform(X_text_train)
    X_text_test_vectors = fitted_vectorizer.transform(X_text_test)

    # Numeric feature scaling
    X_numeric_train_scaled = fitted_scaler.fit_transform(X_numeric_train)
    X_numeric_test_scaled = fitted_scaler.transform(X_numeric_test)

    # Combine text and numeric features (dense, matching what new_data() builds)
    X_train_combined = np.hstack([X_text_train_vectors.toarray(), X_numeric_train_scaled])
    X_test_combined = np.hstack([X_text_test_vectors.toarray(), X_numeric_test_scaled])

    # Train KNN model
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train_combined, y_train)

    # Score on the held-out split
    accuracy = accuracy_score(y_test, classifier.predict(X_test_combined))

    # Everything succeeded: publish the fitted pipeline in one step.
    vectorizer, scaler, knn = fitted_vectorizer, fitted_scaler, classifier

    print("Accuracy:", accuracy)

def new_data(matricule_number=None, full_name=None):
    """Predict the corrected name for one record and print the result.

    Uses the module-level ``vectorizer``, ``scaler`` and ``knn`` objects. If
    ``knn`` is still ``None`` a hint to run ``model`` first is printed and the
    function returns.

    Args:
        matricule_number: Matricule value (number or numeric string). When
            omitted, the user is prompted for it.
        full_name: Name text to classify. When omitted, the user is prompted
            for it.

    Returns:
        None. The prediction, or a message explaining why none was made, is
        printed.
    """
    if knn is None:
        print("Model is not trained yet. Please run the 'model' function first.")
        return

    # Ask interactively only for values the caller did not supply
    if matricule_number is None:
        matricule_number = input("Enter the Matricule number: ")

    # Validate before touching the scaler: it rejects NaN/inf, and float()
    # raises on non-numeric text.
    try:
        matricule_value = float(matricule_number)
    except (TypeError, ValueError):
        matricule_value = None
    if matricule_value is None or not np.isfinite(matricule_value):
        print(f"Invalid Matricule number: {matricule_number!r}. Please enter a finite numeric value.")
        return

    if full_name is None:
        full_name = input("Enter the Full name: ")
    full_name = str(full_name)  # the vectorizer only accepts strings

    # Transform new text data using the existing vectorizer
    new_text_vectors = vectorizer.transform([full_name])

    # Scale new numeric data using the existing scaler (expects a 2-D array)
    new_numeric_scaled = scaler.transform(np.array([[matricule_value]]))

    # Combine new features in the same column order used for training
    new_combined = np.hstack([new_text_vectors.toarray(), new_numeric_scaled])

    # Predict using the trained KNN model
    prediction = knn.predict(new_combined)

    print("Prediction for new data:")
    print(f"Matricule: {matricule_value}, Combined Text: '{full_name}', Predicted Name: {prediction[0]}")


In [8]:
# Train the classifier; model() prints the accuracy measured on its held-out split.
model()

Accuracy: 0.968944099378882


In [9]:
# Prompts for a matricule number and a full name, then prints the predicted name.
new_data()

Prediction for new data:
Matricule: 904817.0, Combined Text: 'Allard A', Predicted Name: Allard Arthur
