In [1]:
# Step 1: Import libraries
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 2: Load datasets
train_df = pd.read_csv('/content/legal_sentiment_train.csv')
test_df = pd.read_csv('/content/legal_sentiment_test.csv')

# Step 3: View dataset structure
print("Training Set Sample:")
print(train_df.head())

# Step 4: Preprocessing function
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    return text

# Step 5: Apply cleaning
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)


Training Set Sample:
                                                text     label
0  The law firm's efforts were acknowledged in th...  positive
1  All stakeholders agreed to the settlement term...  positive
2  The client complied with all regulatory requir...  positive
3  The company was awarded compensation for the d...  positive
4  The appeal was dismissed due to procedural err...  negative


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Step 6: Import TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 7: Initialize vectorizer
tfidf = TfidfVectorizer(max_features=500)

# Step 8: Fit and transform training and test data
X_train = tfidf.fit_transform(train_df['clean_text']).toarray()
X_test = tfidf.transform(test_df['clean_text']).toarray()

# Step 9: Extract labels
y_train = train_df['label']
y_test = test_df['label']


In [3]:
# Step 10: Import model and evaluation tools
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 11: Initialize and train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 12: Make predictions
y_pred = model.predict(X_test)

# Step 13: Evaluate model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred))


Confusion Matrix:
[[33  0  0]
 [ 0 30  0]
 [ 0  0 27]]

Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        33
     neutral       1.00      1.00      1.00        30
    positive       1.00      1.00      1.00        27

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90


Accuracy Score:
1.0


In [4]:
# Step 14: Combine predictions with test data
test_df['predicted_sentiment'] = y_pred

# Step 15: Display representative examples from each class
def show_summary_by_sentiment(sentiment):
    subset = test_df[test_df['predicted_sentiment'] == sentiment]
    print(f"\n🔹 Example {sentiment.upper()} Sentiment Documents:\n")
    for i, row in subset.head(3).iterrows():
        print(f"Original: {row['text']}")
        print(f"Cleaned: {row['clean_text']}")
        print("-" * 80)

# Step 16: Show 3 examples for each sentiment
for sentiment in ['positive', 'negative', 'neutral']:
    show_summary_by_sentiment(sentiment)



🔹 Example POSITIVE Sentiment Documents:

Original: The contract negotiation concluded successfully with mutual agreement.
Cleaned: contract negotiation concluded successfully mutual agreement
--------------------------------------------------------------------------------
Original: All stakeholders agreed to the settlement terms in record time.
Cleaned: stakeholders agreed settlement terms record time
--------------------------------------------------------------------------------
Original: The client complied with all regulatory requirements and was praised for diligence.
Cleaned: client complied regulatory requirements praised diligence
--------------------------------------------------------------------------------

🔹 Example NEGATIVE Sentiment Documents:

Original: The defendant failed to provide any supporting documents despite multiple requests.
Cleaned: defendant failed provide supporting documents despite multiple requests
------------------------------------------------------