<a href="https://colab.research.google.com/github/rouakhadhraoui/Text-Mining-Labs-/blob/main/2_Text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load each dataset and attach its class label in the same step
df_complaint = pd.read_csv("complaint1700.csv").assign(label=1)        # 1 = complaint
df_noncomplaint = pd.read_csv("noncomplaint1700.csv").assign(label=0)  # 0 = non-complaint

# Stack the complaint rows on top of the non-complaint rows,
# renumbering the index from 0 so it stays unique
df = pd.concat(
    [df_complaint, df_noncomplaint],
    ignore_index=True,
)

# Text cleaning
def clean_text(text):
    """Normalize a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase, drop URLs, drop @mentions and the ``#``
    symbol (the hashtag word itself is kept), drop every character that is
    not an ASCII letter or whitespace, then collapse whitespace runs.

    Args:
        text: The raw tweet. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned string; ``""`` when nothing usable remains.
    """
    # Missing cells are not strings; return an empty document instead of
    # failing on ``.lower()``.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # "http\S+" already covers "https...", so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)      # remove URLs
    text = re.sub(r"@\w+|#", "", text)              # remove mentions and '#' signs
    text = re.sub(r"[^a-zA-Z\s]", "", text)         # remove punctuation and digits
    return re.sub(r"\s+", " ", text).strip()        # collapse repeated whitespace

# Clean every tweet in place
df['tweet'] = df['tweet'].apply(clean_text)

# Shuffle the rows: the two classes were concatenated one after the other,
# so this interleaves them. The fixed seed keeps the order reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity check
print("Combined and cleaned dataset:")
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so it must
# not be wrapped in print() (that emitted a stray "None" line).
df.info()

# Features (cleaned tweet text) and target (0/1 label)
X, y = df['tweet'], df['label']

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorization: the vocabulary and IDF weights are learned from the
# training tweets only, then reused unchanged on the test tweets
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Naive Bayes model (fit() returns the fitted estimator itself)
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out tweets
y_pred = model.predict(X_test_vec)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
)
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_test, y_pred),
)


Combined and cleaned dataset:
       id   airline                                              tweet  label
0   95181  American           no excuse for lost luggage youhaveonejob      1
1   20845  American  i thought airport wifi was ridiculous until i ...      1
2   32473    United  i hope you guys go surfing if youre going to l...      0
3  165082    United  my flight to la had no electricity for passeng...      1
4   37552   JetBlue  poop announces new bag fees and squeezing out ...      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3400 entries, 0 to 3399
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       3400 non-null   int64 
 1   airline  3400 non-null   object
 2   tweet    3400 non-null   object
 3   label    3400 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 106.4+ KB
None
Accuracy: 0.7161764705882353

Classification Report:
               precision    recall  f1-score   support

        