In [40]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os
import re
import ast 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Input


nltk.download('stopwords')
stop_words = set(stopwords.words('french'))



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gerar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Chargement des Données

In [41]:
dataPath =  "../data/"
linesFile = os.path.join(dataPath, "movie_lines.txt")
conversationsFile = os.path.join(dataPath, "movie_conversations.txt")

In [42]:
with open(linesFile, 'r') as f:
    lines = f.readlines()

with open(conversationsFile, 'r') as f:
    conversations = f.readlines()

In [43]:
lines[:5]

['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n',
 'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n',
 'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n',
 'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n',
 "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"]

In [44]:
conversations[:5]

["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L204', 'L205', 'L206']\n",
 "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208']\n"]

# Transformation des Données

In [45]:
movieLines  = {}

for lignes in lines:
    ligne = lignes.split(" +++$+++ ")
    if len(ligne) == 5:
        movieLines[ligne[0]] = ligne[4]

In [46]:
for key, value in list(movieLines.items())[:5]:  
    print(f"{key}: {value}")

L1045: They do not!

L1044: They do to!

L985: I hope so.

L984: She okay?

L925: Let's go.



In [47]:
movieConversations = []
for conv in conversations:
    parts = conv.strip().split(" +++$+++ ")
    try:
        conversation_ids = ast.literal_eval(parts[3])
        movieConversations.append(conversation_ids)
    except:
        pass 

In [48]:
movieConversations

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203'],
 ['L204', 'L205', 'L206'],
 ['L207', 'L208'],
 ['L271', 'L272', 'L273', 'L274', 'L275'],
 ['L276', 'L277'],
 ['L280', 'L281'],
 ['L363', 'L364'],
 ['L365', 'L366'],
 ['L367', 'L368'],
 ['L401', 'L402', 'L403'],
 ['L404', 'L405', 'L406', 'L407'],
 ['L575', 'L576'],
 ['L577', 'L578'],
 ['L662', 'L663'],
 ['L693', 'L694', 'L695'],
 ['L696', 'L697', 'L698', 'L699'],
 ['L860', 'L861'],
 ['L862', 'L863', 'L864', 'L865'],
 ['L866', 'L867', 'L868', 'L869'],
 ['L870', 'L871', 'L872'],
 ['L924', 'L925'],
 ['L984', 'L985'],
 ['L1044', 'L1045'],
 ['L49', 'L50', 'L51'],
 ['L571', 'L572', 'L573'],
 ['L579', 'L580'],
 ['L595', 'L596', 'L597'],
 ['L598', 'L599', 'L600'],
 ['L659', 'L660'],
 ['L952', 'L953'],
 ['L394', 'L395'],
 ['L396', 'L397'],
 ['L589', 'L590', 'L591'],
 ['L592', 'L593'],
 ['L756', 'L757', 'L758'],
 ['L759', 'L760'],
 ['L164', 'L165'],
 ['L319', 'L320'],
 ['L441', 'L442', 'L443', 'L444', 'L445']

In [49]:
movieConversationsWithText = []
for conversation in movieConversations:
    conversationWithText = []
    for line in conversation:
        if line in movieLines:
            conversationWithText.append(movieLines[line])
    movieConversationsWithText.append(conversationWithText)

In [50]:
movieConversationsCleaned = [
    [line.strip() for line in conversation] 
    for conversation in movieConversationsWithText
]

In [51]:
movieConversationsCleaned

[['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"],
 ["You're asking me out.  That's so cute. What's your name again?",
  'Forget it.'],
 ["No, no, it's my fault -- we didn't have a proper introduction ---",
  'Cameron.',
  "The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does.",
  'Seems like she could get a date easy enough...'],
 ['Why?',
  'Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something.',
  "That's a shame."],
 ['Gosh, if only we could find Kat a boyfriend...',
  'Let me see what I can do.'],
 ["C'esc ma tete. This is my head"

## Passage en Dataframe

In [52]:
pairs = []
for conversation in movieConversationsCleaned:
    for i in range(len(conversation) - 1):
        pairs.append((conversation[i], conversation[i + 1]))

df = pd.DataFrame(pairs, columns=["intro", "ccl"])

print(df.head(1))
print(df.info())
print(df.describe())
print(df.columns)
df

                                               intro  \
0  Can we make this quick?  Roxanne Korrine and A...   

                                                 ccl  
0  Well, I thought we'd start with pronunciation,...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221616 entries, 0 to 221615
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   intro   221616 non-null  object
 1   ccl     221616 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB
None
         intro     ccl
count   221616  221616
unique  193828  193241
top      What?   What?
freq      1488    1373
Index(['intro', 'ccl'], dtype='object')


Unnamed: 0,intro,ccl
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.
...,...,...
221611,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...
221612,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...
221613,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...
221614,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob..."


# Cleaning

In [53]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9' ]", "", text)
    text = re.sub(r"\d+", "<num>", text)
    text = re.sub(r"(\w)'(\w)", r"\1 ' \2", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    return " ".join(words)

In [54]:
df["questions"] = df["intro"].apply(clean_text)
df["reponce"] = df["ccl"].apply(clean_text)

print(df.head())

                                               intro  \
0  Can we make this quick?  Roxanne Korrine and A...   
1  Well, I thought we'd start with pronunciation,...   
2  Not the hacking and gagging and spitting part....   
3  You're asking me out.  That's so cute. What's ...   
4  No, no, it's my fault -- we didn't have a prop...   

                                                 ccl  \
0  Well, I thought we'd start with pronunciation,...   
1  Not the hacking and gagging and spitting part....   
2  Okay... then how 'bout we try out some French ...   
3                                         Forget it.   
4                                           Cameron.   

                                           questions  \
0  can we make this quick roxanne korrine and and...   
1  well i thought we ' start with pronunciation i...   
2  not the hacking and gagging and spitting part ...   
3  you ' re asking out that ' so cute what ' your...   
4  no no it ' my fault we didn ' have a proper

In [55]:
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")

tokenizer.fit_on_texts(df['questions'].tolist() + df['reponce'].tolist())

X = tokenizer.texts_to_sequences(df['questions'].tolist())
y = tokenizer.texts_to_sequences(df['reponce'].tolist())

print(X[:3])
print(y[:3])


[[30, 15, 102, 18, 953, 1, 1, 10, 3943, 7758, 34, 395, 75, 3792, 1, 930, 494, 48, 5, 1, 180], [56, 4, 136, 15, 2, 304, 32, 1, 43, 9, 2, 106, 32, 3], [26, 5, 8821, 10, 1, 10, 9532, 341, 173]]
[[56, 4, 136, 15, 2, 304, 32, 1, 43, 9, 2, 106, 32, 3], [26, 5, 8821, 10, 1, 10, 9532, 341, 173], [106, 85, 49, 946, 15, 254, 44, 79, 976, 1, 1414, 152]]


In [56]:
max_length = 40

X = pad_sequences(X, maxlen=max_length, padding='post')  
y = pad_sequences(y, maxlen=max_length, padding='post')

print(f"Shape de X: {X.shape}")
print(f"Shape de y: {y.shape}")


Shape de X: (221616, 40)
Shape de y: (221616, 40)


In [57]:
import numpy as np

# Décalage : chaque X doit prédire le mot suivant de y
y = np.array([seq[1:] + [0] for seq in y])  # Décalage + padding
y = np.expand_dims(y, axis=-1)  # Reshape pour le modèle

print(f"Nouvelle shape de y: {y.shape}")  # (nb_exemples, max_length, 1)


Nouvelle shape de y: (221616, 39, 1)


In [58]:
y = np.array([seq[-1] for seq in y])


In [60]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape de X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape de X_test: {X_test.shape}, y_test: {y_test.shape}")

Shape de X_train: (177292, 40), y_train: (177292, 1)
Shape de X_test: (44324, 40), y_test: (44324, 1)


In [None]:
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),  
    Bidirectional(LSTM(128)),  
    Dense(vocab_size, activation='softmax')  
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, y, epochs=50, batch_size=128, validation_split=0.2)


Epoch 1/50




[1m 710/1386[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m1:35[0m 142ms/step - accuracy: 0.9576 - loss: 1.1235

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
plt.plot(history.history['loss'], label='Train Loss', color='blue')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss vs Validation Loss')
plt.legend()
plt.grid()
plt.show()


In [None]:
questions = df['intro'].tolist()
reponses = df['ccl'].tolist()

print(f"Nombre de paires question-réponse : {len(questions)}")
print(f"Exemple de question : {questions[0]}")

Nombre de paires question-réponse : 221616
Exemple de question : can we make this quick roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad again


In [None]:
import re

def clean_text(text):
    text = text.lower()  # Tout en minuscules
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text)  # Supprimer caractères spéciaux
    return text.strip()

# Nettoyer les questions et réponses
questions_clean = [clean_text(q) for q in questions]
reponses_clean = [clean_text(r) for r in reponses]

In [None]:
print(f"Shape de X: {X.shape}")  # (nb_exemples, max_length, embedding_dim)
print(f"Shape de y: {y.shape}")  # (nb_exemples,)

AttributeError: 'list' object has no attribute 'shape'