In [38]:
import pandas as pd
import numpy as np
import warnings 
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [69]:
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Fetch the NLTK data packages needed by the tokenizer, the stop-word filter
# and the WordNet lemmatizer used further down.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's last expression so that its return value is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\utilisateur\AppData\Roaming\nltk_data...


True

In [254]:
from bs4 import BeautifulSoup

def remove_html_tags_code(text):
    """Return the plain text of an HTML fragment, without any <code> content.

    Parameters
    ----------
    text : str
        HTML markup to clean.

    Returns
    -------
    str
        The text of the fragment with all tags stripped and every <code>
        element (tag and contents) removed. Text pieces that were separated
        only by markup are joined with a single space.
    """
    soup = BeautifulSoup(text, 'html.parser')
    for code_tag in soup.find_all('code'):
        # decompose() removes the element and everything inside it.
        code_tag.decompose()
    # Without a separator, text on either side of a tag or of a removed <code>
    # element is concatenated directly ("<p>foo</p><p>bar</p>" -> "foobar"),
    # which fuses unrelated words before tokenization.
    return soup.get_text(separator=' ')

In [135]:
def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of `text` with WordNet.

    No part-of-speech tag is passed, so the lemmatizer's default is used.
    The lemmas are returned re-joined with single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, text.split())
    return " ".join(lemmas)

In [84]:
def remove_stop_short_words(text):
    """Drop English stop words and words of one or two characters from `text`.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces. Stop-word matching is case-sensitive.
    """
    english_stops = frozenset(stopwords.words("english"))
    kept = []
    for token in text.split():
        if len(token) <= 2 or token in english_stops:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
def remove_code_blocks(text):
    """Strip fenced code blocks (text between two triple-backtick fences) from `text`.

    The match is non-greedy and spans newlines, so each fenced block is
    removed on its own; an opening fence without a closing one is left as is.
    """
    fenced_block = re.compile(r'```[\s\S]*?```')
    return fenced_block.sub('', text)


In [22]:
# Load the query export. Malformed lines are skipped silently, and the first
# CSV column is used as the index.
data = pd.read_csv(
    'QueryResults.csv',
    engine='python',
    on_bad_lines='skip',
    index_col=[0],
)

In [23]:
# Inspect the column labels right after loading; the first CSV column is the
# index at this point, so it is not listed.
col = data.columns
col

Index(['Body', 'Tags', 'Id', 'Score', 'ViewCount', 'FavoriteCount',
       'AnswerCount'],
      dtype='object')

In [24]:
# The CSV was loaded with its first column ('Title') as the index; turn it back
# into a regular column. The guard makes the cell safe to re-run: an unguarded
# second reset would insert a spurious 'index' column.
if 'Title' not in data.columns:
    data = data.reset_index()

In [25]:
# Check the column labels again now that the index has been reset.
col = data.columns
col

Index(['Title', 'Body', 'Tags', 'Id', 'Score', 'ViewCount', 'FavoriteCount',
       'AnswerCount'],
      dtype='object')

In [21]:
# Left disabled on purpose: 'Id' has to stay a regular column because section 1.1 selects it.
# data.set_index('Id', inplace=True)

In [13]:
# Same column inspection once more.
# NOTE(review): the saved output of this cell has no 'Id' column, so it
# presumably comes from a run where 'Id' had been set as the index — re-run to refresh.
col = data.columns
col

Index(['Title', 'Body', 'Tags', 'Score', 'ViewCount', 'FavoriteCount',
       'AnswerCount'],
      dtype='object')

In [26]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 8)

In [27]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Title,Body,Tags,Id,Score,ViewCount,FavoriteCount,AnswerCount
0,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,<windows><powershell><server><core><remoting>,54569222,2,2151,,2
1,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,<python><django><templates><preprocessor><shpaml>,2131029,3,585,0.0,2
2,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,<django><eclipse><templates><eclipse-plugin><e...,2131039,7,554,0.0,1
3,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,<eclipse><svn><build-process><performance><sub...,2131045,2,1262,,1
4,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...",<.net><com><vb6><asp-classic><wrapper>,2131111,3,1548,0.0,3


# 1. Traitement des données

## 1.1 Extraction des données initiales

In [279]:
# Work on an independent copy holding only the identifier and the two text columns.
corpus = data.loc[:, ['Id', 'Title', 'Body']].copy()

## 1.2 Traitement des titres (Title)

In [280]:
# Column dtypes of the working frame.
corpus.dtypes

Id        int64
Title    object
Body     object
dtype: object

In [281]:
# Case normalization: lower-case every title.
corpus['Title_process'] = corpus['Title'].str.lower()

In [282]:
# Tokenization: split each lower-cased title into a list of word tokens.
# astype(str) ensures the tokenizer always receives a string.
corpus['Title_process'] = corpus['Title_process'].astype(str).apply(nltk.word_tokenize)
corpus.head()

Unnamed: 0,Id,Title,Body,Title_process
0,54569222,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,"[access, denied, when, using, power, shell, re..."
1,2131029,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,"[preprocess, shpaml, in, django, 's, template,..."
2,2131039,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,"[writing, eclipse, templates]"
3,2131045,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,"[auto, update, and, build, with, eclipse, and,..."
4,2131111,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...","[use, .net, in, vb6, or, classical, asp]"


In [283]:
# Remove special characters: every character that is not an ASCII letter, a
# digit or an underscore is replaced by a space.
# The token list is stringified first (str(x)), so its brackets, quotes and
# commas are turned into spaces as well, leaving a space-separated string.
# NOTE(review): hyphens are NOT kept by this pattern (only '_' is) — confirm
# that this is intended, since it also splits terms such as "asp-classic".
corpus['Title_process'] = corpus['Title_process'].apply(lambda x: re.sub('[^a-zA-Z0-9_]', ' ', str(x)))
corpus.head()

Unnamed: 0,Id,Title,Body,Title_process
0,54569222,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,access denied when using power ...
1,2131029,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,preprocess shpaml in django s ...
2,2131039,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,writing eclipse templates
3,2131045,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,auto update and build with ec...
4,2131111,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...",use net in vb6 or classical ...


In [284]:
# Drop uninformative words: English stop words and words of 1-2 characters.
corpus['Title_process'] = corpus['Title_process'].astype(str).apply(remove_stop_short_words)
corpus.head()

Unnamed: 0,Id,Title,Body,Title_process
0,54569222,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,access denied using power shell remoting local...
1,2131029,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,preprocess shpaml django template loader
2,2131039,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,writing eclipse templates
3,2131045,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,auto update build eclipse svn
4,2131111,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...",use net vb6 classical asp


In [285]:
# Lemmatization with NLTK's WordNet lemmatizer. It is preferred over a stemmer
# here because the words it produces stay readable.
corpus['Title_process'] = corpus['Title_process'].astype(str).apply(lemmatize_text)
corpus.head()

Unnamed: 0,Id,Title,Body,Title_process
0,54569222,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,access denied using power shell remoting local...
1,2131029,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,preprocess shpaml django template loader
2,2131039,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,writing eclipse template
3,2131045,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,auto update build eclipse svn
4,2131111,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...",use net vb6 classical asp


In [286]:
# Build a gensim vocabulary over the tokenized titles and prune it: drop tokens
# that occur in fewer than 20 titles or in more than 50% of all titles.
from gensim.corpora import Dictionary

# The cleaned titles are plain strings at this point; turn them back into token lists.
corpus['Title_process'] = corpus['Title_process'].apply(word_tokenize)
dictionary = Dictionary(corpus['Title_process'])
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [287]:
# 'Title_process' now holds token lists again.
corpus.head()

Unnamed: 0,Id,Title,Body,Title_process
0,54569222,Access Denied when using Power Shell Remoting ...,<p>I want to remotely trigger some commands wi...,"[access, denied, using, power, shell, remoting..."
1,2131029,Preprocess SHPAML in Django's template loader?,<p>Is there any way to make Django's template ...,"[preprocess, shpaml, django, template, loader]"
2,2131039,Writing eclipse templates,<p>I am writing django templates in Eclipse->p...,"[writing, eclipse, template]"
3,2131045,Auto update and build with eclipse and SVN,<p>every morning when i come to work i update ...,"[auto, update, build, eclipse, svn]"
4,2131111,Use .NET in VB6 or classical ASP,"<p><strong>Duplicate of</strong> <a href=""http...","[use, net, vb6, classical, asp]"


In [None]:
# Apply the vocabulary pruning to the titles themselves.
def filter_tokens(tokens):
    """Keep only the tokens that are present in the pruned `dictionary`.

    Order and duplicates of the surviving tokens are preserved.
    """
    known = dictionary.token2id
    kept = []
    for tok in tokens:
        if tok in known:
            kept.append(tok)
    return kept

# Store the filtered titles in their own column.
corpus['Title_tokens'] = corpus['Title_process'].apply(filter_tokens)

In [288]:
# Save the raw, processed and vocabulary-filtered titles side by side.
corpus.loc[:, ['Title', 'Title_process', 'Title_tokens']].to_csv('title1.csv', index=False)

## 1.3 Traitement du corps (Body)

In [293]:
# Case normalization: lower-case every question body.
corpus['Body_process'] = corpus['Body'].str.lower()

In [294]:
# Strip the HTML markup and drop the contents of <code> elements.
corpus['Body_process'] = corpus['Body_process'].astype(str).apply(remove_html_tags_code)

In [296]:
# Tokenization: split each body into a list of word tokens.
corpus['Body_process'] = corpus['Body_process'].astype(str).apply(nltk.word_tokenize)

In [297]:
# Remove special characters: every character that is not an ASCII letter, a
# digit or an underscore is replaced by a space.
# The token list is stringified first (str(x)), so its brackets, quotes and
# commas are turned into spaces as well, leaving a space-separated string.
# NOTE(review): hyphens are NOT kept by this pattern (only '_' is) — confirm intent.
corpus['Body_process'] = corpus['Body_process'].apply(lambda x: re.sub('[^a-zA-Z0-9_]', ' ', str(x)))

In [298]:
# Drop uninformative words: English stop words and words of 1-2 characters.
corpus['Body_process'] = corpus['Body_process'].astype(str).apply(remove_stop_short_words)

In [299]:
# Lemmatization of the cleaned bodies (same helper as for the titles).
corpus['Body_process'] = corpus['Body_process'].astype(str).apply(lemmatize_text)

In [300]:
# Preview the fully processed bodies.
corpus['Body_process'].head()

0    want remotely trigger command power shell wind...
1    way make django template loader run template l...
2    writing django template eclipse prefrences tem...
3    every morning come work update source svn buil...
4    duplicate calling net method vb6 via com visib...
Name: Body_process, dtype: object

In [302]:
# Save the titles and bodies together with their processed versions.
corpus.loc[:, ['Title', 'Title_tokens', 'Body', 'Body_process']].to_csv('title_body1.csv', index=False)

# 2. Extraction des features

## 2.1 Bag of words

In [151]:
from sklearn.feature_extraction.text import CountVectorizer

# Title corpus: one document per question title. The column is named
# 'Title_tokens' and stores token lists, whereas CountVectorizer expects one
# string per document, so the tokens are joined back with spaces.
corpus_title = corpus['Title_tokens'].apply(
    lambda doc: doc if isinstance(doc, str) else ' '.join(doc)
)

# Learn the vocabulary and encode every title as a vector of token counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus_title)

# Dense view of the count vector of the first title.
first_document_vector = X[0].toarray()

print(first_document_vector)

[[0 0 0 ... 0 0 0]]


## 2.2 Word Embedding

In [154]:
from gensim.models import Word2Vec

# Toy corpus of sentences. It gets its own name so that it does not overwrite
# the `corpus` DataFrame that the other sections of the notebook rely on.
example_corpus = [
    "Le chat est mignon",
    "Le chien est fidèle",
    "Le chat et le chien sont amis",
    "Le lion est puissant",
    "Le tigre est sauvage"
]

# Pre-process the sentences into a list of word lists.
sentences = [sentence.split() for sentence in example_corpus]

# Train the Word2Vec model; min_count=1 keeps words that occur only once.
model = Word2Vec(sentences, min_count=1)

# Look up the embedding of the word "chat".
embedding = model.wv['chat']
print(embedding)

[-8.2428139e-03  9.2992587e-03 -1.9804893e-04 -1.9670932e-03
  4.6037803e-03 -4.0953029e-03  2.7429049e-03  6.9401739e-03
  6.0651284e-03 -7.5107305e-03  9.3822703e-03  4.6716495e-03
  3.9663245e-03 -6.2434538e-03  8.4600979e-03 -2.1501011e-03
  8.8254288e-03 -5.3620036e-03 -8.1294794e-03  6.8242415e-03
  1.6710253e-03 -2.1985222e-03  9.5133632e-03  9.4935708e-03
 -9.7736213e-03  2.5049869e-03  6.1567132e-03  3.8728428e-03
  2.0225546e-03  4.3048026e-04  6.7400362e-04 -3.8207199e-03
 -7.1403324e-03 -2.0886923e-03  3.9234436e-03  8.8187810e-03
  9.2591941e-03 -5.9757386e-03 -9.4025871e-03  9.7644711e-03
  3.4297551e-03  5.1663239e-03  6.2822350e-03 -2.8042037e-03
  7.3227971e-03  2.8304595e-03  2.8709546e-03 -2.3802817e-03
 -3.1282553e-03 -2.3699638e-03  4.2765420e-03  7.6203403e-05
 -9.5845042e-03 -9.6655367e-03 -6.1483392e-03 -1.2894146e-04
  1.9972622e-03  9.4317868e-03  5.5843079e-03 -4.2908196e-03
  2.7835433e-04  4.9644676e-03  7.6987804e-03 -1.1442254e-03
  4.3230909e-03 -5.81425

## 2.3 TF-IDF

In [152]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus of titles: one document per question title. 'Title_process' holds
# token lists once the titles have been re-tokenized, but TfidfVectorizer
# expects one string per document, so list entries are joined with spaces
# (entries that are already strings are passed through unchanged).
corpus_title = corpus['Title_process'].apply(
    lambda doc: doc if isinstance(doc, str) else ' '.join(doc)
)

# Create an instance of TfidfVectorizer
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights, and encode every title
X = vectorizer.fit_transform(corpus_title)

# Sparse TF-IDF features of the first document
first_document_features = X[0]

print(first_document_features)

  (0, 209)	0.3645275929721957
  (0, 4335)	0.2538980331530107
  (0, 18163)	0.19744323970388894
  (0, 537)	0.3257924494349014
  (0, 11455)	0.27084775370958164
  (0, 16871)	0.42406655773225366
  (0, 18410)	0.297932094004714
  (0, 15303)	0.369394865602048
  (0, 21866)	0.14826889698401818
  (0, 5253)	0.3216424925957781
  (0, 514)	0.23170295726574017
