In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
df = pd.read_csv('./dataset/ted_talks_en.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4005 entries, 0 to 4004
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   talk_id         4005 non-null   int64  
 1   title           4005 non-null   object 
 2   speaker_1       4005 non-null   object 
 3   all_speakers    4001 non-null   object 
 4   occupations     3483 non-null   object 
 5   about_speakers  3502 non-null   object 
 6   views           4005 non-null   int64  
 7   recorded_date   4004 non-null   object 
 8   published_date  4005 non-null   object 
 9   event           4005 non-null   object 
 10  native_lang     4005 non-null   object 
 11  available_lang  4005 non-null   object 
 12  comments        3350 non-null   float64
 13  duration        4005 non-null   int64  
 14  topics          4005 non-null   object 
 15  related_talks   4005 non-null   object 
 16  url             4005 non-null   object 
 17  description     4005 non-null   o

In [4]:
df = df[['transcript']]

In [5]:
df.head()

Unnamed: 0,transcript
0,"Thank you so much, Chris. And it's truly a gre..."
1,"About 10 years ago, I took on the task to teac..."
2,"(Music: ""The Sound of Silence,"" Simon & Garfun..."
3,If you're here today — and I'm very happy that...
4,Good morning. How are you? (Audience) Good. It...


In [6]:
def split_transcript_random(transcript, min_words=90, max_words=250):
    words = transcript.split()  # Pisahkan teks menjadi kata-kata
    chunks = []
    i = 0
    while i < len(words):
        # Tentukan ukuran chunk secara acak
        chunk_size = random.randint(min_words, max_words)
        # Ambil chunk kata
        chunk = words[i:i + chunk_size]
        # Gabungkan kembali kata-kata menjadi teks
        chunks.append(' '.join(chunk))
        # Pindah ke indeks berikutnya
        i += chunk_size
    return chunks

In [7]:
split_rows = []
for idx, row in df.iterrows():
    transcript = row['transcript']
    if pd.notnull(transcript):  # Cek jika transkrip tidak kosong
        chunks = split_transcript_random(transcript, min_words=90, max_words=250)  # Potong teks dengan panjang acak
        for chunk in chunks:
            split_rows.append({'original_index': idx, 'transcript': chunk})  # Simpan potongan

# Buat DataFrame baru
split_df = pd.DataFrame(split_rows)

In [8]:
print(split_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44516 entries, 0 to 44515
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   original_index  44516 non-null  int64 
 1   transcript      44516 non-null  object
dtypes: int64(1), object(1)
memory usage: 695.7+ KB
None


In [9]:
print(split_df.head())

   original_index                                         transcript
0               0  Thank you so much, Chris. And it's truly a gre...
1               0  from our home in Nashville to a little farm we...
2               0  to us, and she lowered her voice so much, I ha...
3               0  refueling. I woke up, they opened the door, I ...
4               0  President Al Gore announced in Nigeria yesterd...


In [10]:
split_df.shape

(44516, 2)

In [11]:
split_df = split_df.drop_duplicates(keep='first')

In [12]:
split_df = split_df.dropna()

In [13]:
split_df.shape

(44516, 2)

In [14]:
def clean_text(text):
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # Hapus karakter special
    text = re.sub(r'\s+', ' ', text).strip() # Hapus spasi berlebih
    # lower case text
    text = text.lower()
    return text

split_df['transcript'] = split_df['transcript'].apply(clean_text)

In [15]:
split_df

Unnamed: 0,original_index,transcript
0,0,thank you so much chris and it s truly a great...
1,0,from our home in nashville to a little farm we...
2,0,to us and she lowered her voice so much i had ...
3,0,refueling i woke up they opened the door i wen...
4,0,president al gore announced in nigeria yesterd...
...,...,...
44511,4004,can you guess what you re looking at is it a f...
44512,4004,draw water in and out over an internal lung li...
44513,4004,fibrils to slide them together stiffening the ...
44514,4004,the seabed sea cucumbers are found everywhere ...


In [16]:
filler_words_list = ['um', 'uh']
def insert_fillers(text, max_fillers=12):
    words = text.split()  # Pisahkan teks menjadi kata-kata
    num_fillers = random.randint(0, max_fillers)  # Tentukan jumlah filler secara acak
    
    for _ in range(num_fillers):
        if len(words) > 1:  # Pastikan ada cukup kata untuk menyisipkan filler
            insert_pos = random.randint(0, len(words) - 1)  # Pilih posisi acak
            filler = random.choice(filler_words_list)  # Pilih filler secara acak
            words.insert(insert_pos, filler)  # Sisipkan filler
    
    return ' '.join(words)  # Gabungkan kembali teks

# Terapkan fungsi ke setiap baris
split_df['transcript_with_fillers'] = split_df['transcript'].apply(insert_fillers)

# Tampilkan hasil
print(split_df[['transcript', 'transcript_with_fillers']].head())


                                          transcript  \
0  thank you so much chris and it s truly a great...   
1  from our home in nashville to a little farm we...   
2  to us and she lowered her voice so much i had ...   
3  refueling i woke up they opened the door i wen...   
4  president al gore announced in nigeria yesterd...   

                             transcript_with_fillers  
0  thank you so much chris and it s truly a great...  
1  from our home in nashville to a little farm we...  
2  to us and she lowered her voice so much i had ...  
3  uh refueling i woke up they opened the door i ...  
4  president al gore announced in nigeria yesterd...  


In [17]:
filler_words_list = ['um', 'uh']
df['filler_words'] = df['transcript'].apply(
    lambda text: len(re.findall(r'\b(?:%s)\b' % '|'.join(filler_words_list), str(text), re.IGNORECASE))
)

# Tampilkan beberapa baris pertama
print(df.head())

                                          transcript  filler_words
0  Thank you so much, Chris. And it's truly a gre...             0
1  About 10 years ago, I took on the task to teac...             0
2  (Music: "The Sound of Silence," Simon & Garfun...             2
3  If you're here today — and I'm very happy that...             0
4  Good morning. How are you? (Audience) Good. It...             0


In [18]:
meaningless_connectors = ["so", "like", "and", "but", "because"]

# Fungsi untuk menghitung fitur
def calculate_features(text):
    # Filler words count
    filler_count = sum(1 for word in text.split() if word in filler_words_list)
    
    # Total words in the text
    total_words = len(text.split())
    
    # Meaningless connectors count
    meaningless_connectors_count = sum(1 for word in text.split() if word in meaningless_connectors)
    
    # Connector ratio
    connector_ratio = meaningless_connectors_count / total_words if total_words > 0 else 0
    
    # Return calculated features
    return {
        "filler_words_count": filler_count,
        "meaningless_connectors_count": meaningless_connectors_count,
        "connector_ratio": connector_ratio,
        "word_count": total_words
    }

# Terapkan ke kolom transcript_with_fillers
features = split_df['transcript_with_fillers'].apply(calculate_features)
features_df = pd.DataFrame(list(features))  # Ubah hasil menjadi DataFrame

# Gabungkan ke DataFrame utama
split_df = pd.concat([split_df, features_df], axis=1)

# Tampilkan hasil
print(split_df.head())


   original_index                                         transcript  \
0               0  thank you so much chris and it s truly a great...   
1               0  from our home in nashville to a little farm we...   
2               0  to us and she lowered her voice so much i had ...   
3               0  refueling i woke up they opened the door i wen...   
4               0  president al gore announced in nigeria yesterd...   

                             transcript_with_fillers  filler_words_count  \
0  thank you so much chris and it s truly a great...                   6   
1  from our home in nashville to a little farm we...                   6   
2  to us and she lowered her voice so much i had ...                   0   
3  uh refueling i woke up they opened the door i ...                  10   
4  president al gore announced in nigeria yesterd...                   3   

   meaningless_connectors_count  connector_ratio  word_count  
0                             7         0.05000

In [19]:
# Tambahkan penanganan NaN dalam fungsi
def generate_speak_duration(word_count, stutter_prob=0.2):
    # Jika word_count adalah NaN atau 0, kembalikan durasi minimum
    if pd.isna(word_count) or word_count == 0:
        return 30  # Durasi minimum
    
    # Kecepatan bicara rata-rata (kata per menit)
    avg_speech_speed = random.randint(130, 150)  # Random dalam range
    
    # Estimasi durasi (dalam detik) berdasarkan jumlah kata
    estimated_duration = (word_count / avg_speech_speed) * 60
    
    # Tambahkan probabilitas untuk stuttering
    if random.random() < stutter_prob:
        # Jika stuttering, tambahkan durasi acak
        stutter_duration = random.uniform(estimated_duration + 10, estimated_duration + 30)
        return int(min(max(30, stutter_duration), 90))  # Batas 30–90 detik, dibulatkan ke bilangan bulat
    
    # Durasi normal dengan noise kecil
    normal_duration = random.uniform(estimated_duration - 5, estimated_duration + 5)
    return int(min(max(30, normal_duration), 90))  # Batas 30–90 detik, dibulatkan ke bilangan bulat

# Periksa dan isi nilai NaN di kolom word_count
split_df['word_count'] = split_df['word_count'].fillna(0)

# Terapkan fungsi dengan pembulatan
split_df['speak_duration'] = split_df['word_count'].apply(generate_speak_duration)

# Tampilkan hasil
print(split_df[['word_count', 'speak_duration']].head())


   word_count  speak_duration
0         140              63
1         153              67
2         183              90
3         148              82
4         227              90


In [21]:
def calculate_confidence(row):
    # Toleransi berbasis durasi atau jumlah kata
    filler_words_tolerance = 4 + 0.02 * row['word_count']  # Toleransi filler bertambah seiring panjang kata
    meaningless_tolerance = 7 + 0.03 * row['word_count']  # Toleransi meaningless bertambah
    
    # Kecepatan bicara
    avg_words_per_minute = row['word_count'] / (row['speak_duration'] / 60) if row['speak_duration'] > 0 else 0
    
    # Aturan untuk confidence = 1
    if (
        row['filler_words_count'] <= filler_words_tolerance and
        row['meaningless_connectors_count'] <= meaningless_tolerance and
        100 <= avg_words_per_minute <= 170 and  # Kecepatan bicara wajar
        50 <= row['word_count'] <= 300 and      # Jumlah kata wajar
        30 <= row['speak_duration'] <= 90       # Lama bicara dalam batas
    ):
        return 1  # Percaya diri
    else:
        return 0  # Tidak percaya diri

# Terapkan fungsi ke DataFrame
split_df['confidence'] = split_df.apply(calculate_confidence, axis=1)

# Tampilkan beberapa baris pertama
print(split_df[['transcript_with_fillers', 'filler_words_count', 'meaningless_connectors_count', 'word_count', 'speak_duration', 'confidence']].head())


                             transcript_with_fillers  filler_words_count  \
0  thank you so much chris and it s truly a great...                   6   
1  from our home in nashville to a little farm we...                   6   
2  to us and she lowered her voice so much i had ...                   0   
3  uh refueling i woke up they opened the door i ...                  10   
4  president al gore announced in nigeria yesterd...                   3   

   meaningless_connectors_count  word_count  speak_duration  confidence  
0                             7         140              63           1  
1                             7         153              67           1  
2                             8         183              90           1  
3                             9         148              82           0  
4                            10         227              90           1  


In [22]:
split_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44516 entries, 0 to 44515
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   original_index                44516 non-null  int64  
 1   transcript                    44516 non-null  object 
 2   transcript_with_fillers       44516 non-null  object 
 3   filler_words_count            44516 non-null  int64  
 4   meaningless_connectors_count  44516 non-null  int64  
 5   connector_ratio               44516 non-null  float64
 6   word_count                    44516 non-null  int64  
 7   speak_duration                44516 non-null  int64  
 8   confidence                    44516 non-null  int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 3.1+ MB


In [23]:
split_df.head()

Unnamed: 0,original_index,transcript,transcript_with_fillers,filler_words_count,meaningless_connectors_count,connector_ratio,word_count,speak_duration,confidence
0,0,thank you so much chris and it s truly a great...,thank you so much chris and it s truly a great...,6,7,0.05,140,63,1
1,0,from our home in nashville to a little farm we...,from our home in nashville to a little farm we...,6,7,0.045752,153,67,1
2,0,to us and she lowered her voice so much i had ...,to us and she lowered her voice so much i had ...,0,8,0.043716,183,90,1
3,0,refueling i woke up they opened the door i wen...,uh refueling i woke up they opened the door i ...,10,9,0.060811,148,82,0
4,0,president al gore announced in nigeria yesterd...,president al gore announced in nigeria yesterd...,3,10,0.044053,227,90,1


In [24]:
split_df = split_df[['transcript_with_fillers','filler_words_count', 'meaningless_connectors_count', 'word_count', 'speak_duration', 'confidence']]

In [25]:
split_df.head()

Unnamed: 0,transcript_with_fillers,filler_words_count,meaningless_connectors_count,word_count,speak_duration,confidence
0,thank you so much chris and it s truly a great...,6,7,140,63,1
1,from our home in nashville to a little farm we...,6,7,153,67,1
2,to us and she lowered her voice so much i had ...,0,8,183,90,1
3,uh refueling i woke up they opened the door i ...,10,9,148,82,0
4,president al gore announced in nigeria yesterd...,3,10,227,90,1


In [27]:
split_df.to_csv('./dataset/ted_talks_en_processed.csv', index=False)