In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Read the news CSV into a DataFrame; the trailing expression renders a preview of its first rows.
df = pd.read_csv(filepath_or_buffer='news.csv')
df.head(n=5)

In [None]:
# 1. Category distribution – Bar Plot
# Fix one explicit category order and hand it to seaborn so every bar is
# guaranteed to sit above its own tick label. Overwriting the tick labels with
# df['target'].unique() (order of first appearance) can disagree with the order
# in which countplot draws the bars (numeric categories are drawn sorted).
target_names = sorted(df['target'].dropna().unique())
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='target', order=target_names)  # one bar per category, in target_names order
plt.title("Documents per Category")
plt.xticks(rotation=90)  # only rotate the labels seaborn already placed; do not relabel
plt.tight_layout()
plt.show()


In [None]:
# 2. Document length distribution – Histogram
# One entry per document: the number of whitespace-separated tokens it contains.
doc_lengths = list(map(lambda document: len(document.split()), df['document']))
sns.histplot(data=doc_lengths, bins=50)
plt.xlabel("Words per document")
plt.ylabel("Frequency")
plt.title("Document Length Distribution")
plt.show()

In [None]:
# 3. Average document length per category – Horizontal Bar Plot
# DataFrame.rename returns a new frame, so its result must be assigned back;
# otherwise the 'text' / 'category' columns used below (and by later cells)
# never exist and the next line raises KeyError.
# Re-running this cell is safe: rename ignores labels that are no longer present.
df = df.rename(columns={'document': 'text', 'target': 'category'})
df['doc_len'] = df['text'].apply(lambda x: len(x.split()))  # whitespace-separated word count
avg_len = df.groupby('category')['doc_len'].mean().sort_values()  # ascending by mean length
plt.figure(figsize=(12, 6))
avg_len.plot(kind='barh')
plt.title("Average Document Length per Category")
plt.xlabel("Average Word Count")
plt.show()

In [None]:
# 4. Shortest and longest documents – Text output
def _word_count(document):
    """Return the number of whitespace-separated tokens in `document`."""
    return len(document.split())


shortest_doc = min(df['text'], key=_word_count)
longest_doc = max(df['text'], key=_word_count)
# Only the first 300 characters of each document are printed.
print("\nShortest Document:\n", shortest_doc[:300], "...")
print("\nLongest Document:\n", longest_doc[:300], "...")


In [None]:
# 5. Ten longest documents in each category – Box Plot of their word counts
top_docs = (
    df.groupby('category')['doc_len']
    .nlargest(10)
    .reset_index()  # turn the index levels (category included) back into columns
)
plt.figure(figsize=(12, 6))
sns.boxplot(data=top_docs, x='category', y='doc_len')
plt.title("Top 10 Longest Documents per Category")
plt.ylabel("Word Count")
plt.xticks(rotation=90)
plt.show()


In [None]:
# 6. Median document length per category – Horizontal Bar Plot
median_len = (
    df.groupby('category')['doc_len']
    .median()
    .sort_values()  # ascending by median length
)
plt.figure(figsize=(12, 6))
median_len.plot.barh()
plt.xlabel("Median Word Count")
plt.title("Median Document Length per Category")
plt.show()

In [None]:
# 7. Spread of document lengths within each category – Box Plot
plt.figure(figsize=(14, 6))
sns.boxplot(data=df, x='category', y='doc_len')
plt.title("Document Length Distribution by Category")
plt.ylabel("Word Count")
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# 8. How many documents have fewer than five words (empty ones included) – Text output
short_docs = df.loc[df['doc_len'].lt(5)]
print(f"\nNumber of documents with less than 5 words: {len(short_docs)}")

In [None]:
# 9. Total characters per category – Horizontal Bar Plot
df['char_len'] = df['text'].map(len)  # character count of each document
total_chars = (
    df.groupby('category')['char_len']
    .sum()
    .sort_values()  # ascending by total size
)
plt.figure(figsize=(12, 6))
total_chars.plot.barh()
plt.xlabel("Total Characters")
plt.title("Total Characters per Category")
plt.show()


In [None]:
# 10. Distribution of word lengths – Histogram
# `data` is not defined anywhere in the visible notebook, so iterating
# `data.data` raised NameError on a fresh kernel. The documents live in `df`;
# pick the text column under whichever name it currently has (it is called
# 'document' in the CSV and 'text' once the rename cell has run).
text_column = 'text' if 'text' in df.columns else 'document'
word_lengths = [
    len(word)
    for text in df[text_column].dropna()  # skip missing documents, if any
    for word in text.split()
]

plt.figure(figsize=(8, 5))
sns.histplot(word_lengths, bins=30)
plt.title("Distribution of Word Lengths")
plt.xlabel("Word Length")
plt.ylabel("Frequency")
plt.show()

In [None]:
import numpy as np
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt

In [None]:
# Load the questions dataset and keep only its leading rows for a quick demo fit.
df = pd.read_csv("questions.csv")

N_SAMPLES = 400  # number of leading rows used for training

# A missing question would be read as NaN, which TfidfVectorizer rejects;
# substitute an empty string so vectorization cannot fail on it.
# .iloc makes the positional (first-N-rows) slice explicit.
texts = df["question1"].iloc[:N_SAMPLES].fillna('')
labels = df["is_duplicate"].iloc[:N_SAMPLES]

In [None]:
# Turn the texts into TF-IDF features, then fit a logistic-regression model on them.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=texts)
classifier = LogisticRegression().fit(X, labels)
classifier  # display the fitted estimator as the cell output

In [None]:
# Chain the already-fitted vectorizer and classifier so that LIME can hand raw
# strings straight to pipeline.predict_proba.
pipeline = make_pipeline(vectorizer, classifier)

# LIME Explainer
# class_names are matched to the classifier's classes by position. The model is
# fit on `is_duplicate`, so the names describe duplicate status rather than
# sentiment (assumes is_duplicate is coded 0/1 — confirm against the data).
# random_state pins LIME's random perturbation sampling so explanations are reproducible.
explainer = LimeTextExplainer(
    class_names=["Not Duplicate", "Duplicate"],
    random_state=42,
)

def explain_text(text, num_features=5, output_path='lime_explanation.html'):
    """Explain the pipeline's prediction for a single text with LIME.

    Renders the explanation inline in the notebook, writes it to an HTML file,
    and shows the explanation as a matplotlib figure.

    Relies on the module-level ``explainer`` and ``pipeline`` objects.

    Args:
        text: Raw string to explain.
        num_features: Number of features requested from LIME for the explanation.
        output_path: Path of the HTML file the explanation is saved to.

    Returns:
        The explanation object produced by ``explainer.explain_instance``.
    """
    exp = explainer.explain_instance(
        text, pipeline.predict_proba, num_features=num_features
    )
    exp.show_in_notebook(text=True)
    exp.save_to_file(output_path)

    # as_pyplot_figure builds the figure; plt.show() renders it. The returned
    # figure handle is not needed afterwards, so it is not kept.
    exp.as_pyplot_figure()
    plt.show()

    return exp

# Test explanation
# The classifier was fit on questions (df["question1"]), so probe it with a
# question rather than a film-review sentence, whose words the TF-IDF
# vocabulary is unlikely to contain.
sample_text = "How can I improve my coding skills?"
explanation = explain_text(sample_text)

In [None]:
import pandas as pd
import numpy as np
import lime
import lime.lime_text
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from lime.lime_text import LimeTextExplainer
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv("train.csv")

# Preprocess the data: join both questions of a row into one string,
# treating a missing question as an empty string.
texts = df["question1"].fillna('') + " " + df["question2"].fillna('')
labels = df["is_duplicate"]

# Vectorize the text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train a classifier
# NOTE(review): max_iter=100 equals scikit-learn's default; raise it if fit
# emits a ConvergenceWarning on this data.
classifier = LogisticRegression(max_iter=100)
classifier.fit(X, labels)

# Create a pipeline from the already-fitted steps so LIME can pass raw strings
# to pipeline.predict_proba.
pipeline = make_pipeline(vectorizer, classifier)

# Initialize LIME Explainer
# random_state pins LIME's random perturbation sampling so that the same text
# yields the same explanation on every run.
explainer = LimeTextExplainer(
    class_names=["Not Duplicate", "Duplicate"],
    random_state=42,
)

def explain_text(text, num_features=5, output_path='lime_explanation.html'):
    """Explain the pipeline's prediction for a single text with LIME.

    Renders the explanation inline in the notebook, writes it to an HTML file,
    and shows the explanation as a matplotlib figure.

    Relies on the module-level ``explainer`` and ``pipeline`` objects.

    Args:
        text: Raw string to explain.
        num_features: Number of features requested from LIME for the explanation.
        output_path: Path of the HTML file the explanation is saved to.

    Returns:
        The explanation object produced by ``explainer.explain_instance``.
    """
    exp = explainer.explain_instance(
        text, pipeline.predict_proba, num_features=num_features
    )
    exp.show_in_notebook(text=True)
    exp.save_to_file(output_path)

    # as_pyplot_figure builds the figure; plt.show() renders it. The returned
    # figure handle is not needed afterwards, so it is not kept.
    exp.as_pyplot_figure()
    plt.show()

    return exp

# Smoke-test the explainer on one hand-written question and keep the result.
sample_text = "How can I improve my coding skills?"  # Replace with any question pair
explanation = explain_text(text=sample_text)
