# Analisis Dataset Log Nasa July

In [1]:
import pandas as pd
from datetime import timedelta

# --- Ganti 'path/ke/file_log_anda.csv' dengan lokasi file Anda ---
log_file_path = 'NASA_Jul95_cleaned.csv'
# -----------------------------------------------------------------

# Tentukan batas waktu sesi
SESSION_TIMEOUT = timedelta(minutes=30)

# Daftar ekstensi file aset yang akan difilter (dibuang)
ASSET_EXTENSIONS = [
    '.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.ico',
    '.txt', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.zip'
]

def load_data(file_path):
    """
    Memuat data log dari file CSV dan melakukan pengecekan awal.
    """
    print(f"Membaca log dari {file_path}...")

    col_names = [
        'IP', 'Logname', 'User', 'Time', 'Method',
        'URI', 'Protocol', 'Status', 'Size'
    ]

    df = None

    try:
        df = pd.read_csv(
            file_path,
            sep=',', header=None, names=col_names,
            skiprows=1, quotechar='"', on_bad_lines='skip',
            engine='python'
        )
    except FileNotFoundError:
        print(f"Error: File tidak ditemukan di {file_path}")
        return None
    except Exception as e:
        print(f"Error saat membaca file: {e}")
        return None

    if df is None:
        print("Dataframe could not be loaded. Stopping execution.")
        return None

    print("Pratinjau data mentah berhasil dibaca.")
    print("-" * 50)

    print(f"üìä Jumlah baris data mentah (Awal): {len(df)}")

    ip_counts = df['IP'].value_counts()
    print("\nüèÜ Top 10 IP dengan akses terbanyak:")
    print(ip_counts.head(10))

    dupe_ips = ip_counts[ip_counts > 1]
    print(f"\n‚ö†Ô∏è Jumlah Unik IP: {len(ip_counts)}")
    print(f"‚ö†Ô∏è Jumlah IP yang 'Duplikat' (akses > 1 kali): {len(dupe_ips)}")
    print("-" * 50)

    print("\nüîç Mengecek nilai NaN dalam dataset mentah...")
    nan_counts = df.isnull().sum()
    nan_percentages = (df.isnull().sum() / len(df)) * 100
    nan_info = pd.DataFrame({'NaN Count': nan_counts, 'NaN Percentage': nan_percentages})
    print(nan_info[nan_info['NaN Count'] > 0].to_string())
    print("Total NaN values in raw data:", df.isnull().sum().sum())
    print("-" * 50)

    return df

def preprocess_data(df, asset_extensions=ASSET_EXTENSIONS):
    """
    Melakukan pembersihan dan filtering data log.
    """
    if df is None:
        return None

    print("Memulai preprocessing (filter GET, 200, dan aset)... ")

    # 1. Filter: Hanya ambil Request Method 'GET'
    df_filtered = df[df['Method'] == 'GET'].copy()

    # 2. Filter: Hanya ambil Status '200' (OK)
    df_filtered['Status'] = pd.to_numeric(df_filtered['Status'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Status'])
    df_filtered = df_filtered[df_filtered['Status'] == 200].copy()

    # 3. Filter: Hanya ambil yang berakhiran .html
    df_clean = df_filtered[df_filtered['URI'].fillna('').str.lower().str.endswith('.html')].copy()

    # Konversi kolom 'Time' ke format datetime
    print("Mengonversi waktu (dengan format ISO8601)...")
    df_clean['Time'] = pd.to_datetime(df_clean['Time'], format='ISO8601')

    print("Data setelah preprocessing (sebelum pengelompokan sesi):")
    print(df_clean.head().to_string())
    print("-" * 50)

    jumlah_awal = len(df)
    jumlah_akhir = len(df_clean)
    jumlah_dibuang = jumlah_awal - jumlah_akhir

    print(f"‚úÖ Jumlah data setelah diproses (df_clean): {jumlah_akhir}")
    print(f"üóëÔ∏è Jumlah data 'sampah' (aset/error) yang dibuang: {jumlah_dibuang}")
    print(f"üìâ Persentase penyusutan data: {(jumlah_dibuang/jumlah_awal)*100:.2f}%")
    print("-" * 50)

    return df_clean

def sessionize_data(df_clean, session_timeout=SESSION_TIMEOUT):
    """
    Mengelompokkan data menjadi sesi berdasarkan IP dan batas waktu.
    """
    if df_clean is None:
        return None

    print(f"Mengelompokkan sesi (timeout: {session_timeout})...")

    df_clean = df_clean.sort_values(by=['IP', 'Time'])

    # Mengelompokkan berdasarkan IP dan menghitung SessionID secara terpisah untuk setiap IP
    df_clean['SessionID'] = df_clean.groupby('IP').apply(
        lambda x: ((x['Time'].diff() > session_timeout) | (x['Time'].diff().isna())).cumsum()
    ).reset_index(level=0, drop=True)

    print("\n--- PRATINJAU LOG DENGAN SESI ---")
    cols_to_show = ['SessionID', 'IP', 'Time', 'URI', 'Status']
    print(df_clean[cols_to_show].head(15).to_string())
    print("-" * 50)

    try:
        df_clean.to_csv('hasil_log_dengan_sesiNASA.csv', index=False)
        print("\nData log yang sudah dikelompokkan disimpan ke 'hasil_log_dengan_sesiNASA.csv'")
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_log_dengan_sesiNASA.csv': {e}")

    return df_clean

def create_matrix(df_sessionized):
    """
    Membuat matriks Sesi x Halaman dari data yang sudah disesikan.
    """
    if df_sessionized is None:
        return None

    print("\nMembuat matriks Sesi x Halaman...")

    matrix = pd.crosstab(
        index=[df_sessionized['SessionID'], df_sessionized['IP']],
        columns=df_sessionized['URI']
    )
    matrix_binary = (matrix > 0).astype(int)
    final_output = matrix_binary.reset_index().drop('SessionID', axis=1)
    final_output = final_output.sort_values(by='IP') # Tambahkan baris ini untuk mengurutkan berdasarkan IP

    # print("\n--- HASIL AKHIR (MATRIKS) ---")
    # print(final_output.to_string())

    try:
        final_output.to_csv('hasil_matriks_sesiNASA.csv', index=False)
        print("\nMatriks hasil juga disimpan ke 'hasil_matriks_sesiNASA.csv'")
    except Exception as e:
        print(f"\nGagal menyimpan file 'hasil_matriks_sesiNASA.csv': {e}")

    return final_output

# Removed the main() call from here as it will be orchestrated by new cells
# main()


In [2]:
df_raw = load_data(log_file_path)
display(df_raw.head())

Membaca log dari NASA_Jul95_cleaned.csv...
Error: File tidak ditemukan di NASA_Jul95_cleaned.csv


AttributeError: 'NoneType' object has no attribute 'head'

In [8]:
df_cleaned = preprocess_data(df_raw)
display(df_cleaned.head())

Memulai preprocessing (filter GET, 200, dan aset)... 
Mengonversi waktu (dengan format ISO8601)...
Data setelah preprocessing (sebelum pengelompokan sesi):
                           IP Logname User                      Time Method                                           URI  Protocol  Status   Size
2              199.120.110.21       -    - 1995-07-01 00:00:09+00:00    GET  /shuttle/missions/sts-73/mission-sts-73.html  HTTP/1.0     200   4085
7             205.212.115.106       -    - 1995-07-01 00:00:12+00:00    GET             /shuttle/countdown/countdown.html  HTTP/1.0     200   3985
18  ppptky391.asahi-net.or.jp       -    - 1995-07-01 00:00:18+00:00    GET                         /facts/about_ksc.html  HTTP/1.0     200   3977
22   waters-gw.starway.net.au       -    - 1995-07-01 00:00:25+00:00    GET      /shuttle/missions/51-l/mission-51-l.html  HTTP/1.0     200   6723
37     gayle-gaston.tenet.edu       -    - 1995-07-01 00:00:50+00:00    GET  /shuttle/missions/sts-71/mission

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
2,199.120.110.21,-,-,1995-07-01 00:00:09+00:00,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
7,205.212.115.106,-,-,1995-07-01 00:00:12+00:00,GET,/shuttle/countdown/countdown.html,HTTP/1.0,200,3985
18,ppptky391.asahi-net.or.jp,-,-,1995-07-01 00:00:18+00:00,GET,/facts/about_ksc.html,HTTP/1.0,200,3977
22,waters-gw.starway.net.au,-,-,1995-07-01 00:00:25+00:00,GET,/shuttle/missions/51-l/mission-51-l.html,HTTP/1.0,200,6723
37,gayle-gaston.tenet.edu,-,-,1995-07-01 00:00:50+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12040


In [9]:
df_session = sessionize_data(df_cleaned, SESSION_TIMEOUT)
display(df_session.head())

Mengelompokkan sesi (timeout: 0:30:00)...


  df_clean['SessionID'] = df_clean.groupby('IP').apply(



--- PRATINJAU LOG DENGAN SESI ---
         SessionID                               IP                      Time                                                     URI  Status
727799           1                      ***.novo.dk 1995-07-11 08:17:09+00:00                                               /ksc.html     200
727853           1                      ***.novo.dk 1995-07-11 08:17:48+00:00                         /shuttle/missions/missions.html     200
728070           1                      ***.novo.dk 1995-07-11 08:21:05+00:00            /shuttle/missions/sts-35/mission-sts-35.html     200
728078           1                      ***.novo.dk 1995-07-11 08:21:19+00:00            /shuttle/missions/sts-35/mission-sts-35.html     200
728185           1                      ***.novo.dk 1995-07-11 08:23:01+00:00               /shuttle/resources/orbiters/columbia.html     200
452773           1                   007.thegap.com 1995-07-06 17:24:28+00:00            /shuttle/missions/sts-71

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size,SessionID
727799,***.novo.dk,-,-,1995-07-11 08:17:09+00:00,GET,/ksc.html,HTTP/1.0,200,7067,1
727853,***.novo.dk,-,-,1995-07-11 08:17:48+00:00,GET,/shuttle/missions/missions.html,HTTP/1.0,200,8678,1
728070,***.novo.dk,-,-,1995-07-11 08:21:05+00:00,GET,/shuttle/missions/sts-35/mission-sts-35.html,HTTP/1.0,200,12118,1
728078,***.novo.dk,-,-,1995-07-11 08:21:19+00:00,GET,/shuttle/missions/sts-35/mission-sts-35.html,HTTP/1.0,200,12118,1
728185,***.novo.dk,-,-,1995-07-11 08:23:01+00:00,GET,/shuttle/resources/orbiters/columbia.html,HTTP/1.0,200,6922,1


In [10]:
final_matrix = create_matrix(df_session)
# display(final_matrix.head())


Membuat matriks Sesi x Halaman...

Matriks hasil juga disimpan ke 'hasil_matriks_sesiNASA.csv'


In [11]:
# Hitung frekuensi setiap IP
ip_counts_final_matrix = final_matrix['IP'].value_counts()

# Filter IP yang muncul lebih dari satu kali
duplicate_ips = ip_counts_final_matrix[ip_counts_final_matrix > 1]

if not duplicate_ips.empty:
    print("IP yang muncul lebih dari satu kali dalam matriks akhir:")
    display(duplicate_ips)
else:
    print("Tidak ada IP yang muncul lebih dari satu kali dalam matriks akhir.")


IP yang muncul lebih dari satu kali dalam matriks akhir:


IP
piweba3y.prodigy.com    291
piweba1y.prodigy.com    288
www-d1.proxy.aol.com    268
www-a2.proxy.aol.com    265
www-b3.proxy.aol.com    259
                       ... 
158.44.25.177             2
194.65.6.10               2
156.79.6.46               2
157.253.104.189           2
134.39.2.181              2
Name: count, Length: 18172, dtype: int64

In [12]:
# Simpan IP yang muncul lebih dari satu kali ke file CSV
duplicate_ips.to_csv('duplicate_ipsNASA.csv', header=['Count'])
print("Daftar IP yang muncul lebih dari satu kali telah disimpan ke 'duplicate_ipsNASA.csv'")

Daftar IP yang muncul lebih dari satu kali telah disimpan ke 'duplicate_ipsNASA.csv'


In [13]:
import pandas as pd

# Muat kembali file hasil_matriks_sesiNASA.csv
final_matrix_loaded = pd.read_csv('hasil_matriks_sesiNASA.csv')

# Mengelompokkan berdasarkan IP dan mengambil nilai maksimum untuk setiap URI
# Ini akan menghasilkan 1 jika IP mengunjungi URI tersebut setidaknya sekali
summarized_matrix = final_matrix_loaded.groupby('IP').max().reset_index()

print("Ringkasan Matriks Akhir dengan IP unik:")
display(summarized_matrix.head())

print(f"\nBentuk Matriks (Baris x Kolom) setelah dirangkum: {summarized_matrix.shape}")

# Simpan hasil ringkasan ke file CSV baru
summarized_matrix.to_csv('hasil_matriks_dirangkumNASA.csv', index=False)
print("\nMatriks yang sudah dirangkum disimpan ke 'hasil_matriks_dirangkumNASA.csv'")

Ringkasan Matriks Akhir dengan IP unik:


Unnamed: 0,IP,/%7Edowns/home.html,/%7edowns/home.html,/.ksc.html,//facilities/spaceport.html,//ksc.html,//shuttle/missions/missions.html,/Harvest/brokers/WWW/admin/admin.html,/Harvest/brokers/WWW/query.html,/Harvest/brokers/WWW/summary.html,...,/~downs/harvest-1.2/INSTRUCTIONS.html,/~downs/harvest-1.2/brokers/Attributes.html,/~downs/harvest-1.2/brokers/www/admin/admin.html,/~downs/harvest-1.2/brokers/www/index.html,/~downs/harvest-1.2/brokers/www/query-glimpse.html,/~downs/harvest-1.2/brokers/www/query.html,/~downs/home.html,/~downs/www/index.html,/~downs/www/query.html,/~downs/www/summary.html
0,***.novo.dk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,007.thegap.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,01-dynamic-c.rotterdam.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,01-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,02-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0



Bentuk Matriks (Baris x Kolom) setelah dirangkum: (68286, 746)

Matriks yang sudah dirangkum disimpan ke 'hasil_matriks_dirangkumNASA.csv'


Setelah Anda menjalankan sel-sel yang disebutkan di atas, `hasil_matriks_sesiNASA.csv` akan diperbarui. Sekarang, mari kita muat file tersebut dan lihat ringkasannya:

In [18]:
import pandas as pd

# Muat file CSV yang sudah diperbarui
final_matrix_loaded = pd.read_csv('hasil_matriks_dirangkumNASA.csv')

print("Pratinjau Matriks Sesi x Halaman yang sudah diurutkan berdasarkan IP:")
display(final_matrix_loaded.head())

Pratinjau Matriks Sesi x Halaman yang sudah diurutkan berdasarkan IP:


Unnamed: 0,IP,/%7Edowns/home.html,/%7edowns/home.html,/.ksc.html,//facilities/spaceport.html,//ksc.html,//shuttle/missions/missions.html,/Harvest/brokers/WWW/admin/admin.html,/Harvest/brokers/WWW/query.html,/Harvest/brokers/WWW/summary.html,...,/~downs/harvest-1.2/INSTRUCTIONS.html,/~downs/harvest-1.2/brokers/Attributes.html,/~downs/harvest-1.2/brokers/www/admin/admin.html,/~downs/harvest-1.2/brokers/www/index.html,/~downs/harvest-1.2/brokers/www/query-glimpse.html,/~downs/harvest-1.2/brokers/www/query.html,/~downs/home.html,/~downs/www/index.html,/~downs/www/query.html,/~downs/www/summary.html
0,***.novo.dk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,007.thegap.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,01-dynamic-c.rotterdam.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,01-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,02-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
print("\n--- Ringkasan Matriks Akhir ---")
print(f"Bentuk Matriks (Baris x Kolom): {final_matrix_loaded.shape}")
print(f"Jumlah IP Unik: {final_matrix_loaded['IP'].nunique()}")
print("Daftar Kolom (URI):\n", final_matrix_loaded.columns.tolist()[1:])

# Menghitung jumlah sesi yang mengunjungi setiap URI (kolom)
# Kecuali kolom 'IP' itu sendiri
uri_counts = final_matrix_loaded.drop(columns=['IP']).sum()
print("\nTop 5 URI yang paling banyak diakses:")
print(uri_counts.nlargest(5).to_string())

print("\n5 URI yang paling sedikit diakses (jika ada lebih dari 5):")
print(uri_counts.nsmallest(5).to_string())



--- Ringkasan Matriks Akhir ---
Bentuk Matriks (Baris x Kolom): (68286, 746)
Jumlah IP Unik: 68286
Daftar Kolom (URI):
 ['/%7Edowns/home.html', '/%7edowns/home.html', '/.ksc.html', '//facilities/spaceport.html', '//ksc.html', '//shuttle/missions/missions.html', '/Harvest/brokers/WWW/admin/admin.html', '/Harvest/brokers/WWW/query.html', '/Harvest/brokers/WWW/summary.html', '/Harvest/brokers/queryhelp.html', '/Harvest/gatherers/WWW/tmp/index.html', '/base-ops/procurement/procurement.html', '/biomed/bibliography/biblio.html', '/biomed/climate/airqual.html', '/biomed/climate/climate.html', '/biomed/env.html', '/biomed/fire/fire.html', '/biomed/glossary/glossary.html', '/biomed/groundwater/groundwater.html', '/biomed/history/history.html', '/biomed/intro.html', '/biomed/lan/lan4.html', '/biomed/lan/lan6.html', '/biomed/program.html', '/biomed/soils/soils.html', '/biomed/threat/animals.html', '/biomed/threat/bldeagle.html', '/biomed/threat/indigo.html', '/biomed/threat/manatee.html', '/biom

In [20]:
import pandas as pd

# Muat file CSV ke dalam DataFrame
matrix_df = pd.read_csv('hasil_matriks_dirangkumNASA.csv')

# Tampilkan 5 data awal
display(matrix_df.head())

Unnamed: 0,IP,/%7Edowns/home.html,/%7edowns/home.html,/.ksc.html,//facilities/spaceport.html,//ksc.html,//shuttle/missions/missions.html,/Harvest/brokers/WWW/admin/admin.html,/Harvest/brokers/WWW/query.html,/Harvest/brokers/WWW/summary.html,...,/~downs/harvest-1.2/INSTRUCTIONS.html,/~downs/harvest-1.2/brokers/Attributes.html,/~downs/harvest-1.2/brokers/www/admin/admin.html,/~downs/harvest-1.2/brokers/www/index.html,/~downs/harvest-1.2/brokers/www/query-glimpse.html,/~downs/harvest-1.2/brokers/www/query.html,/~downs/home.html,/~downs/www/index.html,/~downs/www/query.html,/~downs/www/summary.html
0,***.novo.dk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,007.thegap.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,01-dynamic-c.rotterdam.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,01-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,02-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Tampilkan informasi DataFrame (total kolom, tipe data, dll.)
print("Informasi DataFrame (total kolom, tipe data, dll.):")
matrix_df.info()

# Tampilkan daftar nama kolom
print("\nDaftar nama kolom:")
print(matrix_df.columns.tolist())

Informasi DataFrame (total kolom, tipe data, dll.):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68286 entries, 0 to 68285
Columns: 746 entries, IP to /~downs/www/summary.html
dtypes: int64(745), object(1)
memory usage: 388.7+ MB

Daftar nama kolom:
['IP', '/%7Edowns/home.html', '/%7edowns/home.html', '/.ksc.html', '//facilities/spaceport.html', '//ksc.html', '//shuttle/missions/missions.html', '/Harvest/brokers/WWW/admin/admin.html', '/Harvest/brokers/WWW/query.html', '/Harvest/brokers/WWW/summary.html', '/Harvest/brokers/queryhelp.html', '/Harvest/gatherers/WWW/tmp/index.html', '/base-ops/procurement/procurement.html', '/biomed/bibliography/biblio.html', '/biomed/climate/airqual.html', '/biomed/climate/climate.html', '/biomed/env.html', '/biomed/fire/fire.html', '/biomed/glossary/glossary.html', '/biomed/groundwater/groundwater.html', '/biomed/history/history.html', '/biomed/intro.html', '/biomed/lan/lan4.html', '/biomed/lan/lan6.html', '/biomed/program.html', '/biomed/soils/soi