# STEP 1 : Preprocessing Original OpenSubtitle

**input:** opensubtitles_2018.txt

**Main :** Build an efficient dataframe for further analysis

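**Output :** dataset_cleaned.pkl (the final cell pickles the cleaned dataframe; the dataset_cleaned.csv & dataset_token_cleaned.csv exports are not produced by the cells shown in this notebook)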


In [1]:
import pandas as pd
import numpy as np
import time
import pickle

In [2]:
####### LOADING DATA #############
# Read the corpus one physical line per row.
# The previous `pd.read_csv(..., sep="\n")` call is refused by recent pandas
# releases (a line terminator may not be used as the separator) and it also
# pushed every line through CSV quote / NA-value parsing, which can alter or
# merge free-text subtitle lines. Reading the lines directly avoids both.
start_time = time.time()
with open("opensubtitles_2018.txt", encoding="utf-8") as corpus_file:
    # Empty lines are skipped, as read_csv did by default (skip_blank_lines=True).
    raw_lines = [
        stripped
        for stripped in (line.rstrip("\r\n") for line in corpus_file)
        if stripped
    ]
data       = pd.DataFrame({"Raw_Text": raw_lines})
del raw_lines  # the intermediate list is no longer needed once the frame exists
# Independent copy of the raw lines; it is not used by the cells shown in this
# notebook but is kept so any later consumer of `data_genre` still works.
data_genre = data.copy()
print("---Execution time :  %s seconds ---" % (time.time() - start_time))

---Execution time :  102.74621224403381 seconds ---


In [3]:
####### CLEANING CODE ###########
# NOTE: the three timings printed below are cumulative; they are all measured
# from the same `start_time`.
start_time = time.time()
# Dialog counter: 1 + the number of lines starting with "<->" seen so far
# (the "<->" line itself already carries the incremented value).
data['Num_dialog'] = data["Raw_Text"].str.startswith("<->").astype(int).cumsum() + 1
print("---Execution time :  %s seconds ---" % (time.time() - start_time))
# Genre: second '|'-separated field of the lines starting with "genre".
# Index alignment leaves NaN on every other row for now.
is_genre_line = data["Raw_Text"].str.startswith("genre")
data['genre'] = data.loc[is_genre_line, "Raw_Text"].str.split(pat='|').str[1]
print("---Execution time :  %s seconds ---" % (time.time() - start_time))
# Propagate each genre forward to the rows that follow it, until the next
# "genre" line. `fillna(method='ffill')` is deprecated in pandas, so use
# `.ffill()`; 'genre' is the only column that contains missing values here.
# Rows located before the first "genre" line stay NaN.
data['genre'] = data['genre'].ffill()
print("---Execution time :  %s seconds ---" % (time.time() - start_time))


---Execution time :  35.34038710594177 seconds ---
---Execution time :  73.5492742061615 seconds ---
---Execution time :  99.4453375339508 seconds ---


In [5]:
# Preview the first 30 rows to check the dialog counter and the propagated genre.
data.head(n=30)

Unnamed: 0,Raw_Text,Num_dialog,genre
0,===,1,
1,path | 2013/2502200/4936679.xml,1,
2,confidence | 1.0,1,
3,"genre | Comedy,Family",1,"Comedy,Family"
4,year | 2013,1,"Comedy,Family"
5,sentences | 451,1,"Comedy,Family"
6,tokens | 3947,1,"Comedy,Family"
7,===,1,"Comedy,Family"
8,dialog_id | 1,1,"Comedy,Family"
9,"00:00:02,025 | 00:00:04,823 | So , let 's take...",1,"Comedy,Family"


In [6]:
# Extract the subtitle text (third '|'-separated field) into a new "Text" column.
# `n=2` stops splitting after the second '|', so a '|' that appears inside the
# subtitle text no longer truncates it (the unlimited split kept only the part
# up to the next '|'). Rows with fewer than three fields still yield NaN, so
# the set of rows that carry text is unchanged.
start_time = time.time()
data["Text"] = data["Raw_Text"].str.split(pat='|', n=2).str[2]
print("---Execution time :  %s seconds ---" % (time.time() - start_time))

---Execution time :  2399.478283882141 seconds ---


In [12]:
# Keep only the rows that carry subtitle text (non-null "Text"), then renumber
# the index from 0 without keeping the old index as a column.
start_time = time.time()
data = data.dropna(subset=['Text']).reset_index(drop=True)
print("---Execution time :  %s seconds ---" % (time.time() - start_time))

---Execution time :  2105.5066707134247 seconds ---


In [13]:
# Preview the first 30 rows: only lines with an extracted "Text" value remain.
data.head(n=30)

Unnamed: 0,Raw_Text,Num_dialog,genre,Text
0,"00:00:02,025 | 00:00:04,823 | So , let 's take...",1,"Comedy,Family","So , let 's take a look at what 's going on a..."
1,"00:00:04,857 | 00:00:06,241 | You know what ?",1,"Comedy,Family",You know what ?
2,"00:00:06,275 | 00:00:09,527 | Just bear with m...",1,"Comedy,Family",Just bear with me while I take off these anno...
3,"00:00:09,562 | 00:00:12,263 | Looks like Scotc...",1,"Comedy,Family",Looks like Scotch Wilkinson had a liquid lunc...
4,"00:00:13,971 | 00:00:16,206 | Another hurrican...",1,"Comedy,Family",Another hurricane is headed for the tropics .
5,"00:00:16,224 | 00:00:18,758 | Maybe they 'll n...",1,"Comedy,Family",Maybe they 'll name this one after my ex-wife .
6,"00:00:18,809 | 00:00:21,311 | Call it Hurrican...",1,"Comedy,Family",Call it Hurricane Bi ...
7,"00:00:21,345 | 00:00:27,167 | If you 're gonna...",1,"Comedy,Family",If you 're gonna spend two months ' salary on...
8,"00:00:27,218 | 00:00:30,136 | You know what re...",1,"Comedy,Family",You know what really makes my head explode ?
9,"00:00:30,171 | - | The way society has conditi...",1,"Comedy,Family",The way society has conditioned women to base...


In [None]:
# Drop the raw line column; its text field already lives in "Text".
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: executing it a second time is a no-op rather
# than a KeyError on the already-removed column.
data = data.drop(columns=['Raw_Text'], errors='ignore')

In [None]:
### Tokenize every sentence into an extra "Token" column
# `nltk` is used below but is not imported by the notebook's import cell, so on
# a fresh kernel this cell failed with NameError. Import it where it is used.
# NOTE(review): word_tokenize presumably requires the NLTK "punkt" tokenizer
# data to be installed - confirm before a full run.
import nltk

start_time = time.time()
data["Token"] = data["Text"].apply(nltk.word_tokenize)
print("---Execution time :  %s seconds ---" % (time.time() - start_time))

In [None]:
### Store the number of tokens of each sentence in a "Size" column
start_time = time.time()
data["Size"] = data["Token"].map(len)
print("---Execution time :  %s seconds ---" % (time.time() - start_time))

In [None]:
### The token lists are only needed to compute "Size"; drop them to keep the frame small.
# Rebinding with errors="ignore" (instead of an in-place drop) makes the cell
# safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=['Token'], errors='ignore')

In [None]:
### Persist the cleaned dataframe to disk as a pickle file
data.to_pickle(path="dataset_cleaned.pkl")