In [1]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

df = pd.read_csv("cleaned_train.csv")
df['text'] = df['text'].fillna('')

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the entire dataset
X_vec = vectorizer.fit_transform(df['text'])

# Remove classes with only one sample
counts = df['category'].value_counts()
low_count_classes = counts[counts <= 1].index
df_filtered = df[~df['category'].isin(low_count_classes)]

# Assuming 'df' is your DataFrame and 'category' is the column with categories
df = df_filtered['category'].value_counts()
df = df_filtered


df.to_csv("unique_categories.csv", index=False)


# Define your features and target
X = df['text']
y = df['category']  # Replace with actual category column

# Split the data while keeping the indices
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# After splitting, we will also need to transform the text data
vectorizer = TfidfVectorizer(max_features=8000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Prepare features and target
X = df['text']
y = df['category']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Make predictions
y_pred = rf_model.predict(X_test_vectorized)



# Print accuracy score
print("Accuracy:", accuracy_score(y_test, y_pred))

# Create a DataFrame for the results
results_df = pd.DataFrame({
    'cleaned_text': X_test,  # Use original text from X_test
    'predicted_category': y_pred,
    'actual_category': y_test
})


# Save the results to a CSV file
results_df.to_csv('main_category_classification_results.csv', index=False)

# Save the Random Forest model
# joblib.dump(rf_model_model, 'random_forest_model.pkl')

# Save the TF-IDF vectorizer
# joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


# Load the saved Random Forest model
rf_model = joblib.load('random_forest_model.pkl')

# Load the saved TF-IDF vectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Print classification report
print("Main Category Classification Report:\n", classification_report(y_test, y_pred,zero_division=0))

Accuracy: 0.7525396644218697
Main Category Classification Report:
                                                       precision    recall  f1-score   support

                               Any Other Cyber Crime       0.66      0.09      0.16      2116
Child Pornography CPChild Sexual Abuse Material CSAM       0.87      0.21      0.33        63
                                Cryptocurrency Crime       0.89      0.07      0.13       113
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       741
                                     Cyber Terrorism       0.00      0.00      0.00        33
      Hacking  Damage to computercomputer system etc       0.85      0.09      0.16       322
                            Online Cyber Trafficking       0.00      0.00      0.00        31
                              Online Financial Fraud       0.76      0.98      0.86     10440
                            Online Gambling  Betting       0.00      0.00      0.00   