In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix  # Optional, for evaluation


In [2]:
# Load data
data = pd.read_csv("data/train.csv") 
text = data["content"] 
labels = data["type"] 

In [3]:
# import nltk
# nltk.download('stopwords')

In [4]:
from nltk.corpus import stopwords  # Stop words removal
from nltk.stem import PorterStemmer  # Stemming (optional)

# Ensure all text data are strings and handle missing values
text = text.fillna("").astype(str)

stop_words = stopwords.words('english')  # Load English stop words
stemmer = PorterStemmer()  # Create a stemmer object

# Data pre-processing (replace with your preferred NLP pipeline)
def preprocess_text(text):
    tokens = [stemmer.stem(word.lower()) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)
    
text = [preprocess_text(t) for t in text]

In [5]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(text)

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train the model (replace with your chosen model)
model = LogisticRegression(multi_class="ovr", solver="lbfgs")  # Multi-class classification
model.fit(X_train, y_train)

In [7]:
# Make predictions on testing data
# Assuming new_text is a list of new texts you want to predict
new_text = ["Sample complaint text 1", "Sample complaint text 2"]  # Replace with your testing data

# Preprocess and transform new text data
new_text_preprocessed = [preprocess_text(t) for t in new_text]
X_new = vectorizer.transform(new_text_preprocessed)
predictions = model.predict(X_new)

# Save predictions (replace with your desired format)
submission_df = pd.DataFrame({"complaint_id": range(len(new_text)), "class": predictions})
submission_df.to_csv("predictions.csv", index=False)

# Optional: Evaluate model performance
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[ 450   91    9   13   53]
 [  51 3390  122   76   12]
 [  15  246  620   31    6]
 [  10  100   32  624   17]
 [  32   16    5   11  465]]


In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')  # Weighted average for multi-class
recall = recall_score(y_test, y_pred, average='weighted')  # Weighted average for multi-class
f1 = f1_score(y_test, y_pred, average='weighted')  # Weighted average for multi-class

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.8540865014622133
Precision: 0.8514162946031788
Recall: 0.8540865014622133
F1 Score: 0.8515729100355377


In [13]:
# Example of making predictions on new text data
# Load data
data = pd.read_csv("data/test.csv") 
new_text = data["content"] 
new_text = new_text.fillna("").astype(str)
# new_text = ["how dare they use my trust this way and fraud me on the basis of my trust, these card sacms by banks these days are just extreme.", "i took a loan and now these rascals are demanding money form me even knowing that my financial condition is not good, please give me some time to pay"]
new_text_preprocessed = [preprocess_text(t) for t in new_text]
X_new = vectorizer.transform(new_text_preprocessed)
predictions = model.predict(X_new)

# Save predictions
submission_df = pd.DataFrame({"id": range(len(new_text)), "class": predictions})
submission_df.to_csv("predictions.csv", index=False)