In [3]:
import pandas as pd
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import zipfile
import os

# Ensure NLTK resources are downloaded.
# 'stopwords' backs the stop-word filter; 'punkt' is the tokenizer model used
# by older NLTK releases. NLTK >= 3.8.2 loads the tokenizer data from
# 'punkt_tab' instead, so it is fetched too to keep nltk.word_tokenize working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load and extract the dataset
def load_and_extract_data(zip_path, extract_to):
    """Extract a ZIP archive and return the path of a CSV file inside it.

    Args:
        zip_path: Filesystem path of a ZIP archive, or an ``http://`` /
            ``https://`` URL from which the archive is downloaded first.
        extract_to: Directory that receives the extracted files.

    Returns:
        Path of the first CSV file found under ``extract_to`` (files in the
        top-level directory first, then sub-directories, each in sorted
        order), or ``None`` when extraction fails or no CSV file exists.
        Failures are reported with ``print`` rather than raised.
    """
    # Function-scope imports keep this helper self-contained.
    import io
    import urllib.request

    try:
        archive_source = zip_path
        # zipfile cannot open URLs, so remote archives are fetched into
        # memory and handed to ZipFile as a file-like object.
        if isinstance(zip_path, str) and zip_path.lower().startswith(
            ('http://', 'https://')
        ):
            with urllib.request.urlopen(zip_path) as response:
                archive_source = io.BytesIO(response.read())

        with zipfile.ZipFile(archive_source, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("Data extracted successfully.")

        # Walk recursively: archives frequently wrap their files in a folder.
        for root, dir_names, file_names in os.walk(extract_to):
            # Sort for a deterministic result and skip the metadata folder
            # that macOS adds to archives (it holds "._*.csv" shadow files).
            dir_names[:] = sorted(d for d in dir_names if d != '__MACOSX')
            for file_name in sorted(file_names):
                if file_name.lower().endswith('.csv'):
                    return os.path.join(root, file_name)
        print("No CSV file found in the extracted data.")
        return None
    except Exception as e:
        # Best-effort contract: callers check for None instead of catching.
        print(f"Error extracting data: {e}")
        return None

# Load the dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` into a pandas DataFrame.

    Returns:
        The loaded DataFrame, or ``None`` when reading fails (the error is
        printed rather than raised).
    """
    try:
        frame = pd.read_csv(file_path)
        print("Data loaded successfully.")
    except Exception as err:
        print(f"Error loading data: {err}")
        frame = None
    return frame

# Exploratory Data Analysis (EDA)
def perform_eda(data, text_column, label_column):
    """Print an overview of ``data`` and plot the label distribution.

    Args:
        data: DataFrame to inspect.
        text_column: Name of the column whose first rows are printed as
            sample comments.
        label_column: Name of the column whose value counts are printed and
            drawn as a count plot.

    Returns:
        None. All output goes to stdout and to a matplotlib figure.
    """
    print("Performing EDA...")
    print("\nDataset Information:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    data.info()

    print("\nDataset Description:")
    print(data.describe())

    print("\nChecking for Missing Values:")
    print(data.isnull().sum())

    print("\nLabel Distribution:")
    label_counts = data[label_column].value_counts()
    print(label_counts)

    sns.countplot(x=label_column, data=data)
    plt.title("Label Distribution")
    plt.show()

    print("\nSample Comments:")
    print(data[text_column].head())

# Data pre-processing
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() re-reads the corpus on every call; caching the
        # result as a set also makes each membership test O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize one comment for vectorization.

    The text is lower-cased, every non-word character is replaced by a
    space, whitespace runs are collapsed, the result is tokenized with
    ``nltk.word_tokenize`` and English stop words are dropped.

    Args:
        text: The raw comment. Missing values (``None`` or a float NaN, as
            pandas produces for empty cells) yield an empty string; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = _english_stopwords()
    tokens = [word for word in nltk.word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Preprocess the dataset
def preprocess_dataset(data, text_column):
    """Run ``preprocess_text`` over every entry of ``data[text_column]``.

    The column is overwritten in place on the DataFrame that was passed in,
    and that same DataFrame is returned.
    """
    cleaned_column = data[text_column].apply(preprocess_text)
    data[text_column] = cleaned_column
    return data

# Split the dataset
def split_data(data, text_column, label_column, test_size=0.2, random_state=42):
    """Split the text and label columns of ``data`` into train and test parts.

    Args:
        data: DataFrame holding both columns.
        text_column: Name of the feature (text) column.
        label_column: Name of the target column.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features = data[text_column]
    labels = data[label_column]
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    return train_texts, test_texts, train_labels, test_labels

# Feature extraction
def vectorize_text(X_train, X_test):
    """Turn the train and test texts into TF-IDF feature matrices.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``.

    Returns:
        The tuple ``(train_matrix, test_matrix, fitted_vectorizer)``.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    return train_matrix, test_matrix, tfidf

# Train the model
def train_model(X_train_tfidf, y_train):
    """Fit a ``RandomForestClassifier`` (``random_state=42``) and return it."""
    classifier = RandomForestClassifier(random_state=42)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return classifier.fit(X_train_tfidf, y_train)

# Evaluate the model
def evaluate_model(model, X_test_tfidf, y_test):
    """Score ``model`` on the test features.

    Returns:
        The tuple ``(accuracy, report)``: the ``accuracy_score`` result and
        the ``classification_report`` text for the model's predictions.
    """
    y_pred = model.predict(X_test_tfidf)
    return (
        accuracy_score(y_test, y_pred),
        classification_report(y_test, y_pred),
    )

# Main pipeline
def main(zip_path, extract_to, text_column, label_column):
    """Run the full pipeline: extract, load, explore, preprocess, train, evaluate.

    Args:
        zip_path: Location of the dataset ZIP archive, passed to
            ``load_and_extract_data``.
        extract_to: Directory that receives the extracted files.
        text_column: Name of the column holding the comment text.
        label_column: Name of the column holding the class labels.

    Returns:
        None. Progress, accuracy and the classification report are printed.
        The function returns early when the archive cannot be extracted, the
        CSV cannot be loaded, or a requested column is missing.
    """
    file_path = load_and_extract_data(zip_path, extract_to)
    if file_path is None:
        return

    data = load_data(file_path)
    if data is None:
        return

    # Fail early with a readable message instead of a KeyError raised from
    # deep inside the EDA or splitting steps.
    missing_columns = [
        column for column in (text_column, label_column)
        if column not in data.columns
    ]
    if missing_columns:
        print(
            f"Missing required column(s): {missing_columns}. "
            f"Available columns: {list(data.columns)}"
        )
        return

    perform_eda(data, text_column, label_column)

    print("Preprocessing data...")
    data = preprocess_dataset(data, text_column)

    print("Splitting data...")
    X_train, X_test, y_train, y_test = split_data(data, text_column, label_column)

    print("Vectorizing data...")
    # The fitted vectorizer is returned as well but is not needed here.
    X_train_tfidf, X_test_tfidf, _ = vectorize_text(X_train, X_test)

    print("Training model...")
    model = train_model(X_train_tfidf, y_train)

    print("Evaluating model...")
    acc, report = evaluate_model(model, X_test_tfidf, y_test)

    print(f"Accuracy: {acc}")
    print(f"Classification Report:\n{report}")

# Run the script
if __name__ == "__main__":
    # Location of the dataset ZIP archive -- replace with your own.
    ZIP_PATH = "https://github.com/priya-roy/unhealthy-comments-Dataset/raw/refs/heads/main/commentClassification.zip"
    # Directory to extract the dataset into.
    EXTRACT_TO = "dataset"
    # Column holding the comment text -- replace to match your dataset.
    TEXT_COLUMN = "comment_text"
    # Column holding the labels -- replace to match your dataset.
    LABEL_COLUMN = "label"

    main(
        zip_path=ZIP_PATH,
        extract_to=EXTRACT_TO,
        text_column=TEXT_COLUMN,
        label_column=LABEL_COLUMN,
    )


ModuleNotFoundError: No module named 'seaborn'