In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# Load the pre-cleaned train and test sets.
df_train = pd.read_csv('cleaned_train.csv')
df_test = pd.read_csv('cleaned_test.csv')

# Fill missing text with "" *before* casting to str: astype(str) turns NaN
# into the literal string "nan", after which fillna has nothing left to fill
# and "nan" would be vectorized as if it were a real token.
df_train['text'] = df_train['text'].fillna("").astype(str)
df_test['text'] = df_test['text'].fillna("").astype(str)


# Label encoders for categories and sub-categories.
# NOTE: the per-category training loop later in this file rebinds
# `sub_category_encoder` to a fresh encoder for every category.
category_encoder = LabelEncoder()
sub_category_encoder = LabelEncoder()

# Custom function for safe encoding
def safe_transform(encoder, data):
    """Encode labels with a fitted encoder, mapping unseen labels to -1.

    Args:
        encoder: A fitted encoder exposing ``classes_``, where the position
            of a label in ``classes_`` is its integer code (as with
            scikit-learn's ``LabelEncoder``).
        data: Iterable of labels to encode.

    Returns:
        A list with one integer per input label: the label's position in
        ``encoder.classes_``, or -1 when the label is not among the classes.
    """
    # Build the label -> code table once instead of calling
    # encoder.transform() separately for every element of `data`.
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    return [code_by_label.get(label, -1) for label in data]

# Fit the encoder on the training categories and keep the encoded labels from
# the same pass (previously the fit_transform result was thrown away and the
# column was transformed a second time).
df_train['category_label'] = category_encoder.fit_transform(df_train['category'])

# Encode the test categories; labels never seen in training become -1
df_test['category_label'] = safe_transform(category_encoder, df_test['category'])




# TF-IDF features: the vocabulary (capped at 5000 terms) is learned from the
# training text only and then reused to project the test text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])


# Shorter aliases for the feature matrices and the encoded category targets.
# No splitting happens here; train and test come from separate files.
X_train = X_train_tfidf
X_test = X_test_tfidf
y_train_category = df_train['category_label']
y_test_category = df_test['category_label']


# Category classifier; max_iter is raised to 1000 to give the solver room
# to converge.
category_model = LogisticRegression(max_iter=1000).fit(X_train, y_train_category)

# Score the category predictions on the test set
y_pred_category = category_model.predict(X_test)
accuracy = accuracy_score(y_test_category, y_pred_category)
print(f"Category Model Accuracy (Logistic Regression): {accuracy:.2f}")


# Spot check: show the first test text next to its decoded predicted category
prediction = category_model.predict(X_test_tfidf[0])
print(df_test['text'][0])
print(category_encoder.inverse_transform(prediction))


# Maps each encoded category label to a dict holding that category's
# sub-category classifier ("model") and its label encoder ("encoder").
# Categories with a single sub-category store model=None plus the lone label
# under "constant_label".
sub_category_models = {}

category_groups = df_train.groupby('category_label')

for category, group in category_groups:
    # Training rows (TF-IDF features and raw sub-category labels) of this category
    X_category = X_train_tfidf[group.index]
    y_sub_category = group['sub_category']

    # Each category gets its own encoder over its own sub-category labels
    sub_category_encoder = LabelEncoder()
    y_sub_category_encoded = sub_category_encoder.fit_transform(y_sub_category)

    if len(set(y_sub_category_encoded)) != 1:
        # Several sub-categories: fit a dedicated classifier for this category
        sub_category_model = LogisticRegression(max_iter=1000)
        sub_category_model.fit(X_category, y_sub_category_encoded)

        sub_category_models[category] = {
            "model": sub_category_model,
            "encoder": sub_category_encoder
        }
    else:
        # Exactly one sub-category: nothing to learn, remember the label itself
        sub_category_models[category] = {
            "model": None,
            "encoder": sub_category_encoder,
            "constant_label": y_sub_category.iloc[0]
        }


def predict_single_sample(X_sample):
    """Predict the category and sub-category for one feature vector.

    Uses the module-level ``category_model`` to pick a category, then the
    matching entry of ``sub_category_models`` to pick a sub-category.

    Args:
        X_sample: Feature vector for one sample. A 1-D input is reshaped to
            a single row before prediction.

    Returns:
        A ``(category, sub_category)`` tuple: the category exactly as
        predicted by ``category_model`` (the encoded label) and the
        sub-category decoded through that category's encoder.

    Raises:
        ValueError: If ``sub_category_models`` has no entry for the
            predicted category.
    """
    # predict() wants a 2-D input, so lift a flat vector to a single row
    features = X_sample.reshape(1, -1) if len(X_sample.shape) == 1 else X_sample

    predicted_category = category_model.predict(features)[0]

    entry = sub_category_models.get(predicted_category)
    if entry is None:
        raise ValueError(f"No sub-category model found for category {predicted_category}.")

    classifier = entry.get('model')
    encoder = entry.get('encoder')

    if classifier is not None:
        # Trained classifier available: predict the code, then decode it
        encoded_label = classifier.predict(features)[0]
        return predicted_category, encoder.inverse_transform([encoded_label])[0]

    # No classifier was trained because the category has a single
    # sub-category, which the encoder stores under code 0
    return predicted_category, encoder.inverse_transform([0])[0]


def evaluate_combined_model(X_test_tfidf, y_test_category, df_test):
    """Score the two-stage (category, then sub-category) pipeline.

    Relies on the module-level ``category_model`` and ``sub_category_models``.
    A sample counts towards the combined accuracy only when its category is
    predicted correctly AND the sub-category predicted by that category's
    model (or its constant label) matches the true sub-category.

    Args:
        X_test_tfidf: Feature matrix with one row per test sample. It is
            passed to the models as-is (no densification), one batch per
            model.
        y_test_category: pandas Series of true encoded category labels,
            aligned positionally with the rows of ``X_test_tfidf``.
        df_test: DataFrame whose ``'sub_category'`` column holds the true
            sub-category of each row, in the same order.

    Returns:
        dict with ``"category_accuracy"`` and ``"combined_accuracy"``, each a
        fraction of the total number of samples (both 0.0 for an empty set).
    """
    total_samples = X_test_tfidf.shape[0]
    if total_samples == 0:
        # Nothing to score; avoid dividing by zero below.
        return {"category_accuracy": 0.0, "combined_accuracy": 0.0}

    true_categories = y_test_category.to_numpy()
    true_sub_categories = df_test['sub_category'].to_numpy()

    # Step 1: predict every category in a single batched call instead of
    # densifying the matrix and calling predict() once per row.
    predicted_categories = category_model.predict(X_test_tfidf)
    category_is_correct = predicted_categories == true_categories
    correct_category_predictions = int(category_is_correct.sum())

    # Step 2: for each correctly predicted category, score its samples'
    # sub-categories in one batch.
    correct_combined_predictions = 0
    for category in set(predicted_categories[category_is_correct].tolist()):
        sub_category_model_info = sub_category_models.get(category)
        if not sub_category_model_info:
            # No entry for this category: none of its samples can match.
            continue

        rows = (category_is_correct & (predicted_categories == category)).nonzero()[0]
        sub_category_model = sub_category_model_info.get("model")
        if sub_category_model is not None:
            encoded = sub_category_model.predict(X_test_tfidf[rows])
            predicted_sub_categories = sub_category_model_info['encoder'].inverse_transform(encoded)
        else:
            # No model (only one sub-category): every sample gets that label
            predicted_sub_categories = sub_category_model_info['constant_label']

        matches = true_sub_categories[rows] == predicted_sub_categories
        correct_combined_predictions += int(matches.sum())

    return {
        "category_accuracy": correct_category_predictions / total_samples,
        "combined_accuracy": correct_combined_predictions / total_samples
    }



# Evaluate the two-stage (category, then sub-category) pipeline on the test set
results = evaluate_combined_model(X_test_tfidf, y_test_category, df_test)

# Display the results
print(f"Category Accuracy: {results['category_accuracy']:.2f}")
print(f"Combined Accuracy: {results['combined_accuracy']:.2f}")

# Category Classification Report
# True labels and predictions for categories
y_true_category = df_test['category_label']
y_pred_category = category_model.predict(X_test_tfidf)

# Pin `labels` to the encoder's full list of class codes. Without it the
# report infers the label set from the data and raises ValueError when that
# set's size differs from target_names, e.g. when a training category is
# absent from the test set or when the test labels contain the -1 code that
# safe_transform assigns to unseen categories.
category_report = classification_report(
    y_true_category,
    y_pred_category,
    labels=list(range(len(category_encoder.classes_))),
    target_names=category_encoder.classes_,
    zero_division=0
)
print("Category Classification Report:\n")
print(category_report)

# Sub-Category Classification Report
# True sub-categories, in test-set row order
y_true_sub_category = df_test['sub_category'].tolist()

# Predict every category in one batched call on the feature matrix as-is,
# rather than densifying it and calling predict() once per row.
predicted_categories = category_model.predict(X_test_tfidf)

# One slot per test sample; None stays in place for samples whose predicted
# category has no entry in sub_category_models.
y_pred_sub_category = [None] * len(y_true_sub_category)

# Handle all samples that share a predicted category in a single batch
for predicted_category in set(predicted_categories.tolist()):
    sub_category_model_info = sub_category_models.get(predicted_category, None)
    if not sub_category_model_info:
        continue

    rows = (predicted_categories == predicted_category).nonzero()[0]
    sub_category_model = sub_category_model_info.get("model", None)
    sub_category_encoder = sub_category_model_info.get("encoder", None)

    if sub_category_model is not None:
        # Predict the encoded sub-categories, then decode them back to labels
        predicted_sub_category_encoded = sub_category_model.predict(X_test_tfidf[rows])
        group_predictions = sub_category_encoder.inverse_transform(predicted_sub_category_encoded).tolist()
    else:
        # Use constant label if no model is available
        group_predictions = [sub_category_model_info['constant_label']] * len(rows)

    # Write the batch results back to their original row positions
    for row, predicted_sub_category in zip(rows, group_predictions):
        y_pred_sub_category[row] = predicted_sub_category

# Generate classification report for sub-categories
sub_category_report = classification_report(
    y_true_sub_category,
    y_pred_sub_category,
    zero_division=0
)
print("\nSub-Category Classification Report:\n")
print(sub_category_report)

Category Model Accuracy (Logistic Regression): 0.76
sir namaskar mein ranjit kumar patrapais nehi tho sir kuch din pehel onlin loan aap credit pearl loan aap se money loan kiya thalekin sir loan bolk jub loan diy tho mein turant return kar diya thalekin din baad what app pe messag aya payment karomein bola diy aap mein wo de diyawo gali diy tho v return kar diyafir v messag kark bolt hai full payment karo half payment nehi chalegarap case mein daldeng etcfak illig se contact number v hack kar dete haibol rahehai sab ko messag kareng ye rapist hai bolk sirpl sir small ammount ke liy goggl play store se loan appli kiya thafak loan aap v hai socha nehi thapl sir request kar rahahun action lo sir mera number hai jo v proof chahiy dunga sir
['Online Financial Fraud']
Category Accuracy: 0.76
Combined Accuracy: 0.51
Category Classification Report:

                                                      precision    recall  f1-score   support

                               Any Other Cyber Crim