In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from fpdf import FPDF
from docx import Document
import os

# Output locations
base_path = "C:\\Users\\sbond\\Desktop\\SPAM Sms detection"
results_path = os.path.join(base_path, "resultat et evaluation detection")
os.makedirs(results_path, exist_ok=True)

# Load the data: tab-separated, no header row; the two columns are named
# 'label' and 'message'.
# The dataset sits in the project folder, so its path is derived from base_path
# instead of repeating the absolute path a second time.
# NOTE(review): pandas' default quoting is in effect here. If messages can
# contain unbalanced double quotes, neighbouring rows may be merged -- consider
# quoting=csv.QUOTE_NONE after confirming the file's format.
data_path = os.path.join(base_path, "SMSSpamCollection.csv")
data = pd.read_csv(data_path, sep='\t', header=None, names=['label', 'message'])

# Data cleaning
# Built once at import time so clean_message() does not rebuild the set for
# every row it is applied to.
_STOPWORDS = frozenset({
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
    "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
    "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
    "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
    "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
    "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now",
})


def clean_message(message):
    """Normalize a message for vectorization.

    The text is lower-cased, every character that is neither alphanumeric nor
    whitespace is dropped, the result is split on whitespace and the words
    found in ``_STOPWORDS`` are removed.

    Args:
        message: Raw message text (``str``).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # Punctuation is removed, not replaced by a space, so "don't" becomes
    # "dont" (which is not in the stopword list and is therefore kept).
    kept_chars = ''.join(ch for ch in message.lower() if ch.isalnum() or ch.isspace())
    return ' '.join(word for word in kept_chars.split() if word not in _STOPWORDS)

data['cleaned_message'] = data['message'].apply(clean_message)

# Target vector: 1 for 'spam', 0 for every other label.
y = (data['label'] == 'spam').astype(int)

# Split the cleaned text BEFORE vectorizing so that the TF-IDF vocabulary and
# IDF weights are learned from the training messages only. Fitting the
# vectorizer on all rows would leak test-set statistics into the features.
# test_size / random_state / stratify are unchanged, so the same rows end up
# in each split as before.
text_train, text_test, y_train, y_test = train_test_split(
    data['cleaned_message'], y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Kept so that any later code expecting the full feature matrix under the
# name X still finds it (now built with the train-fitted vectorizer).
X = vectorizer.transform(data['cleaned_message'])

# Training and evaluation
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# output_dict=True returns a nested dict instead of a formatted string.
classification_rep = classification_report(y_test, y_pred, target_names=["ham", "spam"], output_dict=True)
conf_matrix = confusion_matrix(y_test, y_pred)

# Saving the figures
def _annotate_bars(ax, bars, fmt):
    """Write each bar's height just above it, rendered through the *fmt* callable."""
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2.0, height, fmt(height), ha='center', va='bottom')


def _save_figure(fig, filename):
    """Save *fig* as *filename* inside results_path, then close the figure."""
    fig.savefig(os.path.join(results_path, filename))
    plt.close(fig)


def save_graphs():
    """Render the four report figures and save them as PNG files in results_path.

    Reads the module-level ``data`` frame and ``conf_matrix`` and writes
    ``distribution_classes.png``, ``matrice_confusion.png``,
    ``message_length.png`` and ``frequent_words_spam.png``.

    Side effect: adds a ``message_length`` column to ``data``.
    """
    # Class distribution
    class_distribution = data['label'].value_counts()
    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(class_distribution.index, class_distribution.values, color=['blue', 'orange'])
    ax.set_title('Distribution des classes (HAM vs SPAM)')
    ax.tick_params(axis='x', labelrotation=0)
    ax.set_ylabel('Nombre de messages')
    ax.set_xlabel('Label')
    _annotate_bars(ax, bars, lambda h: f'{int(h)}')
    _save_figure(fig, "distribution_classes.png")

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.matshow(conf_matrix, cmap="Blues")
    fig.colorbar(image)
    ax.set_title('Matrice de confusion')
    ax.set_xlabel('Prédictions')
    ax.set_ylabel('Classe réelle')
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['HAM', 'SPAM'])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['HAM', 'SPAM'])
    # Print every cell's count on top of the heat map (x = column, y = row).
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(j, i, str(conf_matrix[i, j]), ha='center', va='center', color='red')
    _save_figure(fig, "matrice_confusion.png")

    # Average message length per class (measured on the raw, uncleaned text).
    # The column is stored on the shared frame, as the original code did.
    data['message_length'] = data['message'].apply(len)
    avg_length = data.groupby('label')['message_length'].mean()
    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(avg_length.index, avg_length.values, color=['blue', 'orange'])
    ax.set_title('Longueur moyenne des messages par classe')
    ax.tick_params(axis='x', labelrotation=0)
    ax.set_ylabel('Longueur moyenne')
    ax.set_xlabel('Classe')
    _annotate_bars(ax, bars, lambda h: f'{h:.2f}')
    _save_figure(fig, "message_length.png")

    # Most frequent words in spam: max_features=10 restricts the vocabulary to
    # ten terms; summing the document-term matrix column-wise gives each term's
    # total count. Bars follow the order of get_feature_names_out().
    spam_messages = data[data['label'] == 'spam']['cleaned_message']
    vectorizer_count = CountVectorizer(max_features=10)
    frequent_words = vectorizer_count.fit_transform(spam_messages)
    word_counts = np.asarray(frequent_words.sum(axis=0)).flatten()
    words = vectorizer_count.get_feature_names_out()
    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(words, word_counts, color='orange')
    ax.set_title('Mots les plus fréquents dans les SPAM')
    ax.set_ylabel('Fréquence')
    ax.set_xlabel('Mots')
    _annotate_bars(ax, bars, lambda h: f'{int(h)}')
    _save_figure(fig, "frequent_words_spam.png")

# Write the PNG figures into results_path.
save_graphs()

# Generating the PDF and Word reports
# Figure files embedded in both reports, in display order.
_REPORT_GRAPHS = (
    "distribution_classes.png",
    "matrice_confusion.png",
    "message_length.png",
    "frequent_words_spam.png",
)


def _metric_rows():
    """Return the (label, metrics) pairs of classification_rep whose value is a dict.

    Entries with a non-dict value (such as the scalar 'accuracy' entry that
    classification_report adds) have no precision / f1-score fields and are
    left out.
    """
    return [
        (label, metrics)
        for label, metrics in classification_rep.items()
        if isinstance(metrics, dict)
    ]


def _write_pdf_report(metric_rows):
    """Build rapport_detection_spam.pdf: title, figures, confusion matrix, metrics table."""
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Rapport de Détection de SPAM SMS", ln=True, align="C")
    pdf.ln(10)

    # Figures
    pdf.set_font("Arial", size=10)
    pdf.cell(0, 10, "Graphiques :", ln=True)
    pdf.ln(5)
    for graph in _REPORT_GRAPHS:
        pdf.image(os.path.join(results_path, graph), x=10, w=180)
        pdf.ln(5)

    # Confusion matrix as a table: one header row, then one row per true class.
    pdf.cell(0, 10, "Matrice de Confusion :", ln=True)
    for header in ("Classe Réelle", "Prédit HAM", "Prédit SPAM"):
        pdf.cell(40, 10, header, border=1, align='C')
    pdf.ln()
    for i, row in enumerate(conf_matrix):
        pdf.cell(40, 10, "HAM" if i == 0 else "SPAM", border=1, align='C')
        pdf.cell(40, 10, str(row[0]), border=1, align='C')
        pdf.cell(40, 10, str(row[1]), border=1, align='C')
        pdf.ln()

    pdf.ln(10)

    # Classification metrics as a table
    pdf.cell(0, 10, "Métriques de Classification :", ln=True)
    for header in ("Classe", "Précision", "F1-Score"):
        pdf.cell(50, 10, header, border=1, align='C')
    pdf.ln()
    for label, metrics in metric_rows:
        pdf.cell(50, 10, label, border=1, align='C')
        pdf.cell(50, 10, f"{metrics['precision']:.2f}", border=1, align='C')
        pdf.cell(50, 10, f"{metrics['f1-score']:.2f}", border=1, align='C')
        pdf.ln()

    pdf.output(os.path.join(results_path, "rapport_detection_spam.pdf"))


def _write_docx_report(metric_rows):
    """Build rapport_detection_spam.docx with the same content as the PDF report."""
    doc = Document()
    doc.add_heading("Rapport de Détection de SPAM SMS", level=1)

    # Figures, each scaled to 80 % of the page width.
    doc.add_heading("Graphiques :", level=2)
    for graph in _REPORT_GRAPHS:
        doc.add_picture(os.path.join(results_path, graph), width=doc.sections[0].page_width * 0.8)

    # Confusion matrix: header row plus one row per true class.
    doc.add_heading("Matrice de Confusion", level=2)
    table = doc.add_table(rows=len(conf_matrix) + 1, cols=3)
    table.style = 'Table Grid'
    table.cell(0, 0).text = "Classe Réelle"
    table.cell(0, 1).text = "Prédit HAM"
    table.cell(0, 2).text = "Prédit SPAM"
    for i, row in enumerate(conf_matrix):
        table.cell(i + 1, 0).text = "HAM" if i == 0 else "SPAM"
        table.cell(i + 1, 1).text = str(row[0])
        table.cell(i + 1, 2).text = str(row[1])

    # Classification metrics. The table is sized from the already-filtered
    # rows and indexed by their position, so a skipped scalar entry can no
    # longer leave a blank row in the middle or a spare row at the end.
    doc.add_heading("Métriques de Classification", level=2)
    table = doc.add_table(rows=len(metric_rows) + 1, cols=3)
    table.style = 'Table Grid'
    table.cell(0, 0).text = "Classe"
    table.cell(0, 1).text = "Précision"
    table.cell(0, 2).text = "F1-Score"
    for row_idx, (label, metrics) in enumerate(metric_rows, start=1):
        table.cell(row_idx, 0).text = label
        table.cell(row_idx, 1).text = f"{metrics['precision']:.2f}"
        table.cell(row_idx, 2).text = f"{metrics['f1-score']:.2f}"

    doc.save(os.path.join(results_path, "rapport_detection_spam.docx"))


def save_reports():
    """Write the PDF and Word evaluation reports into results_path.

    Both reports contain the PNG figures listed in ``_REPORT_GRAPHS`` (which
    must already exist in results_path), the confusion matrix and a
    precision / F1-score table built from the module-level
    ``classification_rep`` and ``conf_matrix``.
    """
    metric_rows = _metric_rows()
    _write_pdf_report(metric_rows)
    _write_docx_report(metric_rows)

# Build the PDF and Word reports; they embed the PNG files written by save_graphs().
save_reports()

# Confirmation message printed once the steps above have completed.
print("Graphiques et rapports générés avec succès.")


Graphiques et rapports générés avec succès.
