# STEP 1: Preprocessing the manual subset

**Input :** dataset_cleaned.csv & dataset_token_cleaned.csv

**Main :** Rule-based algorithm for dialogue segmentation

**Output :** Solution.pkl


In [48]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import plot
from plotly.subplots import make_subplots

In [49]:
# Read the cleaned corpus and report the wall-clock time spent loading it.
start_time = time.time()
data = pd.read_csv("dataset_cleaned.csv")
print("---Execution time :  %s seconds ---" % (time.time() - start_time,))

---Execution time :  164.92797899246216 seconds ---


In [None]:
def Verification_(Subset):
    """Tabulate how often each value of the ``Size`` column occurs.

    Parameters
    ----------
    Subset : pandas.DataFrame
        Frame holding a ``Size`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct ``Size`` values in ascending order, with the
        columns ``Size`` (number of rows having that value), ``%`` (share of
        all rows of ``Subset``, in percent) and ``Cumulative %`` (running sum
        of ``%``).
    """
    # Name the counts explicitly: the column produced by
    # ``value_counts().to_frame()`` is called "Size" on pandas < 2.0 but
    # "count" on pandas >= 2.0, so relying on the implicit name raises
    # KeyError on recent versions.
    counts = Subset["Size"].value_counts().sort_index(ascending=True)
    Verification = counts.rename("Size").rename_axis(None).to_frame()
    Verification["%"] = Verification["Size"] / Subset.shape[0] * 100
    Verification["Cumulative %"] = Verification["%"].cumsum()
    return Verification

In [None]:
# 1. Subset            : Keep only the rows where ":" appears
# 2. subset_full       : Get the full dialogues in which ":" appears
#    2.1  Verification : Study of ":"
# 3. Good              : Subset with at least two turns
# 4. Best              : Good with sentences where ":" appears only once
#    4.1  Speaker      : Keep only the speaker
# 5. Best_2            : Keep only dialogues where there are at least two distinct characters
# 6. Best_no_Num       : Delete dialogues with <NUM> in the speaker name
# 7. New_data          : Keep only dialogues with speakers of length 1 (one token)

In [None]:
##################################################################################

# SUBSET
# Keep only the rows whose text contains a ":".
# `na=False` treats missing text as "no match", so the boolean mask never
# contains NaN (which `.loc` rejects); `regex=False` matches ":" literally.
Subset = data.loc[data["Text"].str.contains(':', regex=False, na=False)].copy()
# NOTE(review): `nbrs_dialog_and_sentences` is not defined in the visible part of
# this notebook -- make sure the cell that defines it runs before this one.
nbrs_dialog_and_sentences(Subset)

##################################################################################

# SUBSET_FULL
# Get the full dialogues in which at least one row contains ":".
list_ndia = Subset.Num_dialog.to_list()
subset_full = data.loc[data["Num_dialog"].isin(list_ndia)].copy()
nbrs_dialog_and_sentences(subset_full)

# Study of ":"
# Split the text on ":"              "John : Hola how are you ?"
# Number of pieces after the split   ["John ", " Hola how are you ?"] ----> Size = 2
Subset["test"] = Subset["Text"].str.split(pat=":")
Subset["Size"] = Subset["test"].str.len()
# Only the last expression of a cell is rendered automatically, so the table
# has to be displayed explicitly here.
display(Verification_(Subset).head(10))

##################################################################################

# GOOD: rows of Subset whose dialogue has at least two rows in Subset
has_several_turns = Subset.duplicated(subset=['Num_dialog'], keep=False)
Good = Subset[has_several_turns]

##################################################################################

# BEST: keep the turns that contain exactly one ":" (two pieces after the split)
Best = Good.loc[Good["Size"] == 2].copy()
# The speaker is the piece in front of the ":"
Best["caracter"] = Best["test"].str[0]

##################################################################################

# BEST_2
# Keep only the dialogues in which at least 2 distinct speakers appear
Step1 = Best.groupby("Num_dialog")["caracter"].nunique().to_frame()
Step2 = Step1.query("caracter >= 2").index.tolist()
Best_2 = Best[Best["Num_dialog"].isin(Step2)]

##################################################################################

# BEST_NO_NUM
# Drop the turns whose speaker name contains the <NUM> placeholder
# (matched literally, not as a regular expression).
Best_no_Num = Best_2[~Best_2["caracter"].str.contains('<NUM>', regex=False)].copy()
# Removing turns can leave a dialogue with a single turn: keep only the
# dialogues that still have at least two rows. The explicit copy makes the
# column assignments below act on an independent frame instead of a slice.
Best_no_Num = Best_no_Num[Best_no_Num.duplicated(['Num_dialog'], keep=False)].copy()

# Tokenize the speaker name (nltk is imported in the import cell at the top).
start_time = time.time()
Best_no_Num["Token"] = Best_no_Num["caracter"].apply(nltk.word_tokenize)
print("---Execution time :  %s seconds ---" % (time.time() - start_time))
# Number of tokens in the speaker name
Best_no_Num["Size_caracter"] = Best_no_Num["Token"].apply(len)

##################################################################################

# NEW_DATA: keep only the turns whose speaker name is a single token
New_data = Best_no_Num[Best_no_Num["Size_caracter"] == 1].copy()

In [None]:
# 1. Measure the number of rows between two consecutive speaker lines
# 2. Calculus          : DataFrame used for some calculations
# 3. Determine the maximum number of rows that should be merged
# 4. Take the dialogue rows if and only if there are fewer than 31 rows in between
# 5. Possible_solution : Select the rows of the cleaned data (`data`) with the index list
# 6. Solution          : Merge the text.
# Issue : there are still some consecutive turns by the same character
# Issue : there are still dialogues with only one turn
# 7. Solution2         : Merge consecutive identical speakers
# Keep only dialogues with at least 2 turns.

In [None]:
##################################################################################
#PART1
# Number of rows between one speaker line and the next one.
# For every row of New_data this is (index label of the following row) minus
# (its own index label). The last row has no successor and gets minus its own
# label, but that row is discarded right below.
row_labels = New_data.index.to_numpy()
gaps = np.append(np.diff(row_labels), -row_labels[-1:])
# Attach the gaps, drop the last row, and expose the original index labels as
# an "index" column (read by the following steps).
# NOTE: New_data is rebound here, so this cell is not meant to be run twice.
New_data = (
    New_data
    .assign(index_sub=gaps.tolist())
    .iloc[:-1]
    .reset_index(drop=False)
)

##################################################################################
#PART2
# Working frame for the gap statistics: New_data without its text/speaker columns.
Calculus = New_data.drop(
    columns=["Num_dialog", 'genre', 'Text', 'Size', 'test', 'caracter', 'Token', 'Size_caracter']
).copy()

##################################################################################
#PART3
# Distribution of the gaps: shows how many rows would have to be merged at most.
# The counts are named explicitly because `value_counts().to_frame()` labels its
# column "index_sub" on pandas < 2.0 but "count" on pandas >= 2.0; the former
# `rename(columns={0: '#Dialogs'})` matched neither and therefore did nothing.
Result = (
    Calculus["index_sub"]
    .value_counts()
    .sort_index()
    .rename("#Dialogs")
    .rename_axis("# row")
    .to_frame()
)
Result["%"] = Result["#Dialogs"] / Calculus["index"].nunique() * 100
Result["Cumulative %"] = Result["%"].cumsum()
# Only the last expression of a cell is rendered automatically, so the table
# has to be displayed explicitly here.
display(Result.head(30))

##################################################################################
#PART4
# Collect, for every speaker line whose gap is small enough, the row indices from
# that line up to and including the next speaker line.
MAX_GAP = 30  # largest accepted number of rows between two speaker lines
start_time = time.time()

Fill_index = []
for first_row, gap in zip(Calculus["index"], Calculus["index_sub"]):
    if gap <= MAX_GAP:
        # index_sub was built in PART1 as (next speaker index - own index), so
        # `first_row + gap` is the index of the next speaker line. Computing it
        # this way avoids looking at row i+1, which does not exist for the last
        # row of Calculus (the former lookup raised IndexError there).
        Fill_index.extend(range(first_row, first_row + gap + 1))

print("---Execution time :  %s seconds ---" % (time.time() - start_time))

##################################################################################
#PART5
# Fill_index holds index labels of `data` that are used here as positions.
# NOTE(review): the two coincide as long as `data` keeps the default index it
# gets from read_csv -- confirm nothing re-indexes it before this cell.
Possible_solution = data.iloc[Fill_index].copy()
# Consecutive ranges share their boundary row, so drop the repeated rows.
Possible_solution = Possible_solution.loc[~Possible_solution.index.duplicated(keep='first')].copy()
# A row opens a new turn when it contains exactly one ":" (two pieces after the split).
starts_turn = Possible_solution["Text"].str.split(pat=":").str.len() == 2
# Running count of turn openers: a speaker line and the rows that follow it
# (until the next speaker line) share the same number.
# The former two-step relabelling set the openers to 1 and then reset every
# value different from 2 -- including those 1s -- to 0, so every row ended up
# with the same number.
Possible_solution["Size"] = starts_turn.astype(int).cumsum()

##################################################################################
#PART6
# Merge the rows of each turn into a single text, one row per (Num_dialog, Size).
merged_text = Possible_solution.groupby(['Num_dialog', 'Size'])['Text'].apply(' '.join)
# Split at the first ":" only: speaker name on the left, utterance on the right.
Solution = merged_text.str.split(pat=":", n=1, expand=True)
Solution = Solution.rename(columns={0: 'character', 1: 'Text'})

#---> At this point there are still dialogues of size 1.

##################################################################################
#PART7 : Remove the single-turn dialogues and merge consecutive turns of the same speaker

# NOTE(review): Solution2 was used here without ever being assigned in the
# visible notebook (NameError on a fresh kernel). It is rebuilt from Solution
# with the group keys turned back into columns, which is what the column
# accesses below (Num_dialog, character, Text) require -- confirm.
Solution2 = Solution.reset_index()

# Remove the dialogues made of a single turn
Solution2 = Solution2[Solution2["Num_dialog"].duplicated(keep=False)].copy()

# Merge consecutive turns of the same speaker:
# the counter increases each time the speaker differs from the previous row, so
# consecutive rows with the same speaker share one value.
Solution2["Value2"] = Solution2["character"].ne(Solution2["character"].shift()).cumsum()
# Rows without a text part get a blank so that the join below only sees strings.
# The result is assigned back: calling fillna(inplace=True) on the selected
# column does not update the frame when pandas copy-on-write is active.
Solution2["Text"] = Solution2["Text"].fillna(" ")
# Join the texts of each run of identical speakers inside a dialogue
Solution2 = (
    Solution2
    .groupby(['Value2', 'Num_dialog', 'character'])['Text']
    .apply(' '.join)
    .reset_index()
    .drop(columns=['Value2'])
)
# Keep only the dialogues with 2 turns or more
Solution2 = Solution2[Solution2["Num_dialog"].duplicated(keep=False)]