In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/data_frame_searchengine.csv')

In [3]:
df.shape

(24749, 4)

In [4]:
df.columns

Index(['num', 'name', 'content', 'file_content'], dtype='object')

In [5]:
df_new = df[['num', 'name', 'file_content']]

In [6]:
df_new

Unnamed: 0,num,name,file_content
0,9504163,mis.juhtus.andres.lapeteusega.(1966).eng.1cd,"ï»¿1\r\n00:00:42,908 --> 00:00:48,933\r\nWHAT ..."
1,9442913,michael.(2023).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open..."
2,9391327,king.of.the.hill.s12.e02.bobby.rae.(2007).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nSuppo..."
3,9247893,doc.martin.s01.e02.gentlemen.prefer.(2004).eng...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an..."
4,9418015,class.s01.e06.episode.1.6.(2023).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\napi.O..."
...,...,...,...
24744,9466890,csi.miami.s06.e20.down.to.the.wire.(2008).eng.1cd,"ï»¿1\r\n00:00:04,071 --> 00:00:10,242\r\n(phon..."
24745,9268430,nova.s46.e18.look.whos.driving.(2019).eng.1cd,"ï»¿1\r\n00:00:01,835 --> 00:00:03,531\r\nA let..."
24746,9323572,entourage.s05.e06.redomption.(2008).eng.1cd,"ï»¿1\r\n00:00:07,441 --> 00:00:09,474\r\n<i>(t..."
24747,9267122,kaidan.semushi.otoko.(1965).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nAdver..."


In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess(raw_text):
    # Remove HTML tags
    text = BeautifulSoup(raw_text, 'html.parser').get_text()
    
    # Removing special characters and digits
    special_char = re.sub("[^a-zA-Z]", " ", text)
    
    # change sentence to lower case
    lowered = special_char.lower()
    
    
    sentence = re.sub('\s+', ' ', lowered)

    tokens = sentence.split()
    
    # Lemmatization
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [10]:
from tqdm import tqdm, tqdm_notebook
import re
from bs4 import BeautifulSoup

In [11]:
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`

tqdm.pandas()

In [12]:
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`

temp_df = df_new['file_content'].progress_apply(lambda x: preprocess(x))

temp_df.head()

  text = BeautifulSoup(raw_text, 'html.parser').get_text()
100%|████████████████████████████████████████████████████████████████████████████| 24749/24749 [14:03<00:00, 29.34it/s]


Unnamed: 0,0,1
0,what happened to andres lapeteus andres lapete...,7420
1,api opensubtitles org is deprecated please imp...,8124
2,support u and become vip member to remove all ...,2791
3,watch any video online with open subtitle free...,4726
4,api opensubtitles org is deprecated please imp...,3947


In [13]:
temp_df.columns = ['clean_text_lemma', 'text_length_lemma']

temp_df.head()

Unnamed: 0,clean_text_lemma,text_length_lemma
0,what happened to andres lapeteus andres lapete...,7420
1,api opensubtitles org is deprecated please imp...,8124
2,support u and become vip member to remove all ...,2791
3,watch any video online with open subtitle free...,4726
4,api opensubtitles org is deprecated please imp...,3947


In [14]:
df_new = pd.concat([df_new, temp_df], axis=1)
df_new.head()

Unnamed: 0,num,name,file_content,clean_text_lemma,text_length_lemma
0,9504163,mis.juhtus.andres.lapeteusega.(1966).eng.1cd,"ï»¿1\r\n00:00:42,908 --> 00:00:48,933\r\nWHAT ...",what happened to andres lapeteus andres lapete...,7420
1,9442913,michael.(2023).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\napi.Open...",api opensubtitles org is deprecated please imp...,8124
2,9391327,king.of.the.hill.s12.e02.bobby.rae.(2007).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nSuppo...",support u and become vip member to remove all ...,2791
3,9247893,doc.martin.s01.e02.gentlemen.prefer.(2004).eng...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...",watch any video online with open subtitle free...,4726
4,9418015,class.s01.e06.episode.1.6.(2023).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\napi.O...",api opensubtitles org is deprecated please imp...,3947


In [18]:
df_10 = df_new.sample(frac=0.1)

In [19]:
df_10

Unnamed: 0,num,name,file_content,clean_text_lemma,text_length_lemma
17759,9293242,military.prosecutor.do.bae.man.s01.e09.the.lea...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nAdver...",advertise your product or brand here contact w...,6809
16487,9500947,alfred.hitchcock.presents.s01.e20.and.so.died....,ï»¿[Script Info]\r\nTitle: Default file\r\nScr...,script info title default file scripttype v wr...,8256
18634,9452413,big.time.rush.s02.e16.big.time.girl.group.(201...,"ï»¿1\r\n00:00:11,500 --> 00:00:13,375\r\n- Gre...",great news boy the news is never great when he...,2816
17531,9406362,quest.for.love.(2022).eng.1cd,"1\r\n00:00:00,760 --> 00:00:04,093\r\n(soft or...",soft orchestral music advertise your product o...,11234
2708,9515069,ultra.city.smiths.s01.e03.hot.clues.(2021).eng...,ï»¿[Script Info]\r\nTitle: Default file\r\nScr...,script info title default file scripttype v wr...,5428
...,...,...,...,...,...
16046,9225836,the.croods.family.tree.s04.e02.ballincup.(2022...,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\napi.O...",api opensubtitles org is deprecated please imp...,3254
13812,9202357,thank.you.(2022).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\napi.O...",api opensubtitles org is deprecated please imp...,9525
19133,9250167,desperate.housewives.s02.e13.theres.something....,"ï»¿1\r\n00:00:01,335 --> 00:00:02,934\r\n<i>Pr...",previously on desperate housewife if you come ...,5249
20320,9509822,hometown.s01.e11.jo.junghyun.(2021).eng.1cd,"ï»¿1\n00:00:13,998 --> 00:00:15,360\n(The char...",the character place institution case and organ...,4233


In [20]:
# df_10.to_csv('data/search_eng_data_21.csv', index=False, escapechar="\\")

In [22]:
df_10.sample(50)

Unnamed: 0,num,name,file_content,clean_text_lemma,text_length_lemma
16568,9386361,trigun.stampede.s01.e01.nomans.land.(2023).eng...,[Script Info]\r\nTitle: English (US)\r\nOrigin...,script info title english u original script to...,3010
17274,9446672,quarantine.for.two.(2021).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nSuppo...",support u and become vip member to remove all ...,9497
21694,9395434,fast.furious.(2009).eng.1cd,ï»¿[Script Info]\r\n; Script generated by Aegi...,script info script generated by aegisub http w...,8422
9570,9400870,buck.and.the.preacher.(1972).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nAdvertis...",advertise your product or brand here contact w...,4373
3075,9512307,love.is.blind.s04.e07.second.times.the.charm.(...,"ï»¿1\r\n00:00:05,964 --> 00:00:07,966\r\n[dram...",dramatic music playing an chelsea sigh well he...,7850
18162,9509838,dimension.20.s12.e06.a.starstruck.odyssey.on.t...,WEBVTT\n\nï»¿1\n00:00:00.144 --> 00:00:02.459\...,webvtt air whoosh softly upbeat groove music a...,21121
11318,9366192,the.following.s01.e07.let.me.go.(2013).eng.1cd,"ï»¿1\r\n00:00:02,135 --> 00:00:06,664\r\nMAN [...",man on tv carroll wa convicted in for the murd...,3313
21580,9425675,final.moments.s01.e08.the.streets.are.talking....,"1\r\n00:00:06,000 --> 00:00:12,074\r\nAdvertis...",advertise your product or brand here contact w...,5743
21493,9272092,the.addams.family.(1991).eng.1cd,"ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...",watch any video online with open subtitle free...,6195
21434,9417455,downtown.precinct.s02.e05.episode.2.5.(2014).e...,"1\r\n00:00:05,520 --> 00:00:07,480\r\nWe know ...",we know who is the partner of the contractor i...,4043


In [23]:
df_20 = df_new.sample(frac=0.2)

In [24]:
# df_20.to_csv('data/search_eng_data_20_percent.csv', index=False, escapechar="\\")

In [25]:
df_20

Unnamed: 0,num,name,file_content,clean_text_lemma,text_length_lemma
14613,9426103,les.bonnes.femmes.(1960).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nSupport ...",support u and become vip member to remove all ...,5266
19429,9432248,dam.s01.e05.the.past.is.not.dead.(2021).eng.1cd,"1\r\n00:00:00,480 --> 00:00:03,480\r\n(water b...",water boiling advertise your product or brand ...,2393
17979,9266391,little.manhattan.(2005).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nAdvertis...",advertise your product or brand here contact w...,10585
1534,9250897,jamtara.sabka.number.ayega.s02.e01.episode.2.1...,"ï»¿1\r\n00:00:11,400 --> 00:00:13,520\r\nThere...",there wa a time when dacoit would kill people ...,4051
14433,9517056,my.sisters.serial.killer.boyfriend.(2023).eng.1cd,"1\r\n00:00:06,000 --> 00:00:12,074\r\nSupport ...",support u and become vip member to remove all ...,8336
...,...,...,...,...,...
8498,9216854,sover.dolly.pa.ryggen.(2012).eng.1cd,"ï»¿1\r\n00:00:09,166 --> 00:00:12,166\r\nAnd y...",and you re sure about this yes you re you ve g...,7767
22555,9237797,young.sheldon.s05.e03.potential.energy.and.hoo...,"ï»¿1\r\n00:00:02,295 --> 00:00:03,689\r\n[adul...",adult sheldon previously on young sheldon did ...,3153
18170,9469768,australian.survivor.s07.e05.episode.7.5.(2022)...,"ï»¿1\r\n00:00:01,000 --> 00:00:07,600\r\nJONAT...",jonathan lapaglia previously on australian sur...,7316
5687,9488726,being.human.s04.e04.the.panic.womb.(2014).eng.1cd,"ï»¿1\r\n00:00:02,836 --> 00:00:04,605\r\n- Pre...",previously travelling through time l il smokie...,6082
