In [1]:
!pip install scikit-learn nltk spacy networkx
!python -m spacy download en_core_web_sm


Defaulting to user installation because normal site-packages is not writeable
Collecting requests<3.0.0,>=2.13.0 (from spacy)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Downloading requests-2.32.5-py3-none-any.whl (64 kB)
   ---------------------------------------- 64.7/64.7 kB 1.2 MB/s eta 0:00:00
Installing collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 1.2.3
    Uninstalling requests-1.2.3:
      Successfully uninstalled requests-1.2.3
Successfully installed requests-2.32.5


DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.
textteaser 0.3 requires requests==1.2.3, but you have requests 2.32.5 which is incompatible.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires nbformat==5.4.0, but you have nbformat 5.5.0 which is incompatible.
conda-repo-cli 1.0.20 requires requests==2.28.1, but you have requests 2.32.5 which is incompati

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 10.9 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


2️⃣ Import Libraries

In [2]:
import pandas as pd
import numpy as np
import nltk
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize


Preprocessing

In [3]:
# Download NLTK stopwords
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove special chars & numbers
    text = text.lower()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and not token.is_punct]
    return " ".join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
data = {
    'text': [
        "This contract agreement is between two parties and is legally binding.",
        "The court ruling states that the defendant is guilty.",
        "This agreement outlines the partnership between companies.",
        "The legal document specifies terms and conditions of employment."
    ],
    'label': ['Contract', 'Ruling', 'Agreement', 'Contract']
}

df = pd.DataFrame(data)

# Preprocess text
df['clean_text'] = df['text'].apply(preprocess_text)

print(df.head())


                                                text      label  \
0  This contract agreement is between two parties...   Contract   
1  The court ruling states that the defendant is ...     Ruling   
2  This agreement outlines the partnership betwee...  Agreement   
3  The legal document specifies terms and conditi...   Contract   

                                         clean_text  
0         contract agreement two party legally bind  
1               court ruling state defendant guilty  
2             agreement outline partnership company  
3  legal document specify term condition employment  


5️⃣ Categorization Model (Naive Bayes + TF-IDF)

In [5]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predictions
y_pred = nb_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

    Contract       0.00      0.00      0.00       0.0
      Ruling       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Accuracy: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


6️⃣ Text Summarization (TextRank)

In [6]:
def textrank_summarizer(text, top_n=3):
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return text  # return full if too short
    
    # TF-IDF similarity matrix
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    sim_matrix = (X * X.T).toarray()
    
    # Build graph
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)
    
    # Rank sentences
    ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    
    # Extract top_n sentences
    summary = " ".join([sent for _, sent in ranked[:top_n]])
    return summary


7️⃣ Integration – Classify & Summarize

In [8]:
sample_doc = """This contract agreement is made between the two parties, 
detailing the responsibilities, liabilities, and obligations of each side. 
It ensures compliance with all applicable laws and regulations."""

# Preprocess & classify
processed = preprocess_text(sample_doc)
vectorized = tfidf.transform([processed])
pred_label = nb_model.predict(vectorized)[0]

# Summarize
summary = textrank_summarizer(sample_doc, top_n=2)

print("Predicted Category:", pred_label)
print("\nSummary:\n", summary)


Predicted Category: Contract

Summary:
 This contract agreement is made between the two parties, 
detailing the responsibilities, liabilities, and obligations of each side. 
It ensures compliance with all applicable laws and regulations.
