MENATA DATA

In [None]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Border, Side

# Read the crawled tweets from the CSV file
file_path = '/content/tweets-data/motor_listrik.csv'  # Adjust to the path of your CSV file
df = pd.read_csv(file_path)

# Keep only the tweets whose text mentions the topic.
# An empty pattern matches every row, so the phrase is spelled out here.
# regex=False treats it as a literal substring; na=False drops rows whose
# full_text is missing instead of raising on a NaN-containing boolean mask.
keyword = 'motor listrik'
tweets_motor_listrik = df[
    df['full_text'].str.contains(keyword, case=False, regex=False, na=False)
]

# Name of the Excel file that receives the filtered tweets
output_excel_path = 'tweets_motor_listrik.xlsx'

# Start a fresh workbook and write into its default sheet
wb = Workbook()
ws = wb.active

# Header row
headers = ['Tanggal', 'Nama Pengguna', 'Isi Tweet']
ws.append(headers)

# Left/right borders only, so the cells read as separate columns
thin_border = Border(left=Side(style='thin'), right=Side(style='thin'))

# Append one sheet row per tweet, in the same column order as the headers
export_columns = ['created_at', 'username', 'full_text']
for record in tweets_motor_listrik[export_columns].itertuples(index=False, name=None):
    ws.append(list(record))

# Apply the border to the header row and every data row (columns A-C)
for row in ws.iter_rows(min_row=1, max_row=len(tweets_motor_listrik)+1, min_col=1, max_col=3):
    for cell in row:
        cell.border = thin_border

# Persist the workbook to disk
wb.save(output_excel_path)

print(f"Data tweet yang berhubungan dengan motor listrik telah disimpan ke dalam file Excel: {output_excel_path}")


Data tweet yang berhubungan dengan motor listrik telah disimpan ke dalam file Excel: tweets_motor_listrik.xlsx


MENGHAPUS DUPLIKAT

In [None]:
import pandas as pd

# Source workbook whose duplicate tweets should be removed
file_path = '/content/sample_data/dupli.xlsx'  # Adjust to the path of your Excel file

# Load the sheet and keep only the first row for each distinct 'Isi Tweet' value
df = pd.read_excel(file_path).drop_duplicates(subset=['Isi Tweet'], keep='first')

# Destination workbook for the de-duplicated rows
output_excel_path = 'tweets_motor_listrik_no_duplicates.xlsx'

# Write the result without the DataFrame index column
df.to_excel(output_excel_path, index=False)

print(f"Data tanpa duplikat telah disimpan ke dalam file Excel: {output_excel_path}")


Data tanpa duplikat telah disimpan ke dalam file Excel: tweets_motor_listrik_no_duplicates.xlsx


MENGHAPUS USERNAME, HASHTAG, DAN LINK

In [None]:
import re
from openpyxl import load_workbook
from openpyxl.styles import Border, Side

# Fungsi untuk membersihkan teks tweet dari username, hashtag, dan link
def clean_tweet_text(text):
    """Strip @mentions, #hashtags and http(s) links from a tweet.

    Args:
        text: The tweet text. Any non-string value (None, a number, a date)
            and the empty string are passed through untouched.

    Returns:
        The text with mentions, hashtags and links removed and surrounding
        whitespace trimmed, or ``text`` itself when there is nothing to clean.
    """
    # Spreadsheet cells may hold None, numbers or dates; re.sub would raise
    # TypeError on those, so only non-empty strings are cleaned.
    if not isinstance(text, str) or not text:
        return text

    # Order matters: mentions, then hashtags, then whatever still starts with "http".
    for pattern in (r'@\w+', r'#\w+', r'http\S+'):
        text = re.sub(pattern, '', text)
    return text.strip()

# Open the workbook that holds the tweets and work on its active sheet
file_path = '/content/sample_data/TB21.xlsx'  # Adjust to the path of your Excel file
wb = load_workbook(file_path)
ws = wb.active

# Left/right borders only, shared by every processed cell
thin_border = Border(left=Side(style='thin'), right=Side(style='thin'))

# Walk the data rows (row 1 is skipped), limited to the first three columns
for cells in ws.iter_rows(min_row=2, max_row=ws.max_row, max_col=3):
    # The third cell holds the tweet text; replace it with the cleaned version
    tweet_cell = cells[2]
    tweet_cell.value = clean_tweet_text(tweet_cell.value)

    # Give every cell of the row the same border
    for current in cells:
        current.border = thin_border

# Save the changes back into the same file that was loaded
wb.save(file_path)

print(f"Data tweet yang berhubungan dengan motor listrik telah dimodifikasi dan disimpan ke dalam file Excel: {file_path}")


Data tweet yang berhubungan dengan motor listrik telah dimodifikasi dan disimpan ke dalam file Excel: /content/sample_data/TB21.xlsx


CRAWLING DATA

In [None]:
# Install pandas
!pip install pandas

# Install Node.js (required for tweet-harvest)
!sudo apt-get update
!sudo apt-get install -y ca-certificates curl gnupg
!sudo mkdir -p /etc/apt/keyrings
!curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg

!NODE_MAJOR=20 && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list

!sudo apt-get update
!sudo apt-get install nodejs -y

# Check Node.js version
!node -v

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://deb.nodesource.com/node_20.x nodistro InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ca-certificates is already the newest version (20230311ubuntu0.22.04.1).
cu

In [None]:
import pandas as pd
import os

# Pastikan direktori tweets-data sudah ada
tweets_data_dir = 'tweets-data'
if not os.path.exists(tweets_data_dir):
    os.makedirs(tweets_data_dir)

# Crawl Data
filename = 'motor_listrik.csv'
search_keyword = 'motor listrik lang:id since:2020-01-01 until:2024-01-01'
limit = 5000

# Jalankan tweet-harvest
!npx -y tweet-harvest@2.6.1 -o "{filename}" -s "{search_keyword}" --tab "LATEST" -l {limit} --token {twitter_auth_token}

# Path file CSV
file_path = f"{tweets_data_dir}/{filename}"

try:
    # Baca file CSV ke DataFrame
    df = pd.read_csv(file_path)

    # Tampilkan DataFrame
    display(df)

    # Lakukan operasi lainnya di sini jika diperlukan

except FileNotFoundError:
    print(f"File CSV '{file_path}' tidak ditemukan. Pastikan proses crawling telah berjalan dengan baik.")


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K[1m[32mTweet Harvest [v2.6.1][39m[22m
[1m[32m[39m[22m
[34mResearch by [39m[1m[34mHelmi Satria[39m[22m[34m[39m
[34mUse it for Educational Purposes only![39m
[34m[39m
[33mThis script uses Chromium Browser to crawl data from Twitter with [1myour Twitter auth token[22m.[39m
[33mPlease enter your Twitter auth token when prompted.[39m
[33m[39m
[31m[1mNote:[22m[39m Keep your access token secret! Don't share it with anyone else.
[31m[1mNote:[22m[39m This script only runs on your local device.

[34m[39m
[34mOpening twitter search page...[39m
[34m[39m
[34m[39m
[34mFound existing file ./tweets-data/motor_listrik.csv, renaming to ./tweets-data/motor_listrik.old.csv[39m
[90m[39m
[90m-- Scrolling... (1)[39m[90m (2)[39m[33m[39m
[33mFilling in keywords: motor listrik lang:id since:2010-01-01 until:2020-01-01[39m
[33m[39m
[90m (3)[39m[90m (4)[39m[90m (5

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1211669764856139776,Mon Dec 30 15:25:58 +0000 2019,0,cuma terompet aja bawa bawa kafir... giliran l...,1211669764856139776,,,in,,0,0,0,https://x.com/alfr3d_marb03n/status/1211669764...,2228499032,alfr3d_marb03n
1,1211661541231095811,Mon Dec 30 14:53:17 +0000 2019,0,Sepeda listrik : gak ada pajaknya Motor listri...,1211661541231095811,,,in,,0,0,0,https://x.com/parttimerant/status/121166154123...,1123371439548129280,parttimerant
2,1211642965891993600,Mon Dec 30 13:39:28 +0000 2019,0,Bupati Artha Langsung Coba Sepeda Motor Listri...,1211642965891993600,https://pbs.twimg.com/media/ENCe2zhXYAAZCqZ.jpg,,in,"Denpasar, Bali, Indonesia.",0,0,0,https://x.com/BaliEkbis/status/121164296589199...,742765756945813504,BaliEkbis
3,1211623639835185152,Mon Dec 30 12:27:23 +0000 2019,1,@abiyyu991223 @elonmusk sejak pertama kali mot...,1211624824520339456,,aliflammeme,in,,0,1,1,https://x.com/Fajar_al_fakir/status/1211624824...,774880456290349056,Fajar_al_fakir
4,1211613684851560448,Mon Dec 30 11:43:07 +0000 2019,1,Sing kowar Bid'ah mbok tko hp motor mobil tko ...,1211613684851560448,,,in,,0,0,0,https://x.com/Rofiq61342136/status/12116136848...,1030249264323018752,Rofiq61342136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2450,1167028748731813889,Thu Aug 29 10:58:30 +0000 2019,0,Benersih kalo gojek dan grab buat akselerasi p...,1167028748731813889,,,in,Tak tunggu ng enggok-enggokan,0,0,0,https://x.com/Dapi__/status/1167028748731813889,610707603,Dapi__
2451,1166948874616885248,Thu Aug 29 10:54:52 +0000 2019,0,@medcom_id Pemerintah harus membuat regulasi a...,1167027831978651648,,medcom_id,in,berau,0,0,0,https://x.com/bangjalipunya73/status/116702783...,816259938439741440,bangjalipunya73
2452,1167017970419163136,Thu Aug 29 10:15:40 +0000 2019,0,#MostPopuler Gojek dan Grab Didapuk Jadi 'Coro...,1167017970419163136,,,in,"ÜT: -6.311655,106.753334",0,0,0,https://x.com/detikoto/status/1167017970419163136,80543284,detikoto
2453,1167017244746031104,Thu Aug 29 10:12:47 +0000 2019,11,Selis merupakan salah satu merek kendaraan lis...,1167017244746031104,,,in,,0,2,3,https://x.com/detikcom/status/1167017244746031104,69183155,detikcom
