In [1]:
import pandas as pd
from datetime import timedelta
from scipy.sparse import coo_matrix

# Path to the input log file (CSV)
log_file_path = 'NASA_Jul95_cleaned.csv'
# -----------------------------------------------------------------

# Session timeout: used as the default `session_timeout` of sessionize_data
SESSION_TIMEOUT = timedelta(minutes=30)

# List of asset file extensions to be filtered out (discarded).
# NOTE(review): preprocess_data takes this list as the default value of its
# `asset_extensions` parameter, but its body keeps only URIs ending in '.html'
# and never reads the list -- confirm whether the list is still needed.
ASSET_EXTENSIONS = [
    '.css', '.js', '.jpg', '.jpeg', '.png', '.gif', '.ico',
    '.txt', '.svg', '.woff', '.woff2', '.ttf', '.eot', '.zip'
]

def load_data(file_path):
    """
    Read the raw access log from a CSV file and print a quick sanity report.

    The first line of the file is skipped and the nine columns are named
    explicitly; rows the parser cannot handle are skipped. After loading, the
    function prints the row count, the ten most frequent IPs, the number of
    distinct IPs, the number of IPs that occur more than once, and a
    per-column NaN summary.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file is missing or cannot be
        read (an error message is printed in that case).
    """
    print(f"Membaca log dari {file_path}...")

    read_options = {
        'sep': ',',
        'header': None,
        'names': [
            'IP', 'Logname', 'User', 'Time', 'Method',
            'URI', 'Protocol', 'Status', 'Size'
        ],
        'skiprows': 1,
        'quotechar': '"',
        'on_bad_lines': 'skip',
        'engine': 'python',
    }

    try:
        df = pd.read_csv(file_path, **read_options)
    except FileNotFoundError:
        print(f"Error: File tidak ditemukan di {file_path}")
        return None
    except Exception as exc:
        # Any other read/parse failure is reported and signalled with None.
        print(f"Error saat membaca file: {exc}")
        return None

    separator = "-" * 50

    print("Pratinjau data mentah berhasil dibaca.")
    print(separator)

    total_rows = len(df)
    print(f"üìä Jumlah baris data mentah (Awal): {total_rows}")

    # Requests per IP, most frequent first.
    hits_per_ip = df['IP'].value_counts()
    print("\nüèÜ Top 10 IP dengan akses terbanyak:")
    print(hits_per_ip.head(10))

    repeated_ips = hits_per_ip[hits_per_ip > 1]
    print(f"\n‚ö†Ô∏è Jumlah Unik IP: {len(hits_per_ip)}")
    print(f"‚ö†Ô∏è Jumlah IP yang 'Duplikat' (akses > 1 kali): {len(repeated_ips)}")
    print(separator)

    print("\nüîç Mengecek nilai NaN dalam dataset mentah...")
    # One pass over the frame; the count feeds the percentage and the total.
    missing_per_column = df.isnull().sum()
    missing_report = pd.DataFrame({
        'NaN Count': missing_per_column,
        'NaN Percentage': (missing_per_column / total_rows) * 100,
    })
    # Only columns that actually contain missing values are listed.
    print(missing_report[missing_report['NaN Count'] > 0].to_string())
    print("Total NaN values in raw data:", missing_per_column.sum())
    print(separator)

    return df

def preprocess_data(df, asset_extensions=ASSET_EXTENSIONS):
    """
    Clean and filter the raw log rows.

    Keeps only rows whose Method is 'GET', whose Status parses to 200, and
    whose URI ends in '.html' (case-insensitive; a missing URI is treated as
    an empty string). The 'Time' column of the surviving rows is converted to
    datetime using the ISO8601 format. A preview and before/after row
    statistics are printed.

    Args:
        df: Raw log DataFrame with at least the columns 'Method', 'Status',
            'URI' and 'Time', or None.
        asset_extensions: Accepted for interface compatibility. The body does
            not read it: the filter is a '.html' allow-list, not an
            extension deny-list.

    Returns:
        The filtered DataFrame (a copy, original index labels kept), or None
        when df is None.
    """
    if df is None:
        return None

    print("Memulai preprocessing (filter GET, 200, dan aset)...")

    # 1. Filter: keep only rows whose request method is 'GET'
    df_filtered = df[df['Method'] == 'GET'].copy()

    # 2. Filter: keep only status 200 (OK); values that are not numeric
    #    become NaN and are dropped
    df_filtered['Status'] = pd.to_numeric(df_filtered['Status'], errors='coerce')
    df_filtered = df_filtered.dropna(subset=['Status'])
    df_filtered = df_filtered[df_filtered['Status'] == 200].copy()

    # 3. Filter: keep only URIs that end in .html
    is_html = df_filtered['URI'].fillna('').str.lower().str.endswith('.html')
    df_clean = df_filtered[is_html].copy()

    # Convert the 'Time' column to datetime
    print("Mengonversi waktu (dengan format ISO8601)...")
    df_clean['Time'] = pd.to_datetime(df_clean['Time'], format='ISO8601')

    print("Data setelah preprocessing (sebelum pengelompokan sesi):")
    print(df_clean.head().to_string())
    print("-" * 50)

    jumlah_awal = len(df)
    jumlah_akhir = len(df_clean)
    jumlah_dibuang = jumlah_awal - jumlah_akhir
    # A zero-row input would otherwise raise ZeroDivisionError below;
    # report 0% shrinkage in that case.
    persen_dibuang = (jumlah_dibuang / jumlah_awal) * 100 if jumlah_awal else 0.0

    print(f"‚úÖ Jumlah data setelah diproses (df_clean): {jumlah_akhir}")
    print(f"üóëÔ∏è Jumlah data 'sampah' (aset/error) yang dibuang: {jumlah_dibuang}")
    print(f"üìâ Persentase penyusutan data: {persen_dibuang:.2f}%")
    print("-" * 50)

    return df_clean

def sessionize_data(df_clean, session_timeout=SESSION_TIMEOUT):
    """
    Assign a SessionID to every log row, based on IP and inactivity gaps.

    Rows are sorted by IP and Time. A row opens a new session when it is the
    first row of its IP or when the gap to the previous row of the same IP
    exceeds session_timeout. SessionID is the running count of such rows, so
    IDs start at 1 and keep increasing across IPs. A preview is printed and
    the result is written to 'hasil_log_dengan_sesiNASA.csv'; a failed write
    is reported but not raised.

    Args:
        df_clean: Preprocessed log DataFrame with 'IP', 'Time', 'URI' and
            'Status' columns, or None.
        session_timeout: Largest gap between two consecutive requests of one
            IP that still belongs to the same session.

    Returns:
        The sorted DataFrame with an added 'SessionID' column, or None when
        df_clean is None.
    """
    if df_clean is None:
        return None

    print(f"Mengelompokkan sesi (timeout: {session_timeout})...")

    # sort_values returns a new frame, so the caller's frame is not modified.
    ordered = df_clean.sort_values(by=['IP', 'Time'])

    # Gap to the previous request of the same IP; NaT on each IP's first row.
    gap_to_previous = ordered.groupby('IP')['Time'].diff()
    first_of_ip = gap_to_previous.isna()
    timed_out = gap_to_previous > session_timeout
    ordered['SessionID'] = (first_of_ip | timed_out).cumsum()

    print("\n--- PRATINJAU LOG DENGAN SESI ---")
    preview = ordered[['SessionID', 'IP', 'Time', 'URI', 'Status']].head(15)
    print(preview.to_string())
    print("-" * 50)

    try:
        ordered.to_csv('hasil_log_dengan_sesiNASA.csv', index=False)
    except Exception as exc:
        # Saving is best effort: report the problem and still return the data.
        print(f"\nGagal menyimpan file 'hasil_log_dengan_sesiNASA.csv': {exc}")
    else:
        print("\nData log yang sudah dikelompokkan disimpan ke 'hasil_log_dengan_sesiNASA.csv'")

    return ordered

def create_matrix(df_sessionized):
    """
    Build the binary Session x Page matrix from sessionized log rows.

    Each distinct (SessionID, IP) pair becomes one output row, ordered by
    SessionID and then IP. Each distinct URI becomes one column, in order of
    first appearance. A cell is 1 when that session requested that URI at
    least once and 0 otherwise. The first rows are printed and the matrix is
    written to 'hasil_matriks_sesiNASA.csv'; a failed write is reported but
    not raised.

    Args:
        df_sessionized: DataFrame with at least the columns 'SessionID', 'IP'
            and 'URI', or None.

    Returns:
        A DataFrame whose first column is 'IP', followed by one 0/1 column
        per URI, or None when df_sessionized is None. An input without rows
        yields an empty frame that has only the 'IP' column.
    """
    if df_sessionized is None:
        return None

    print("\nMembuat matriks Sesi x Halaman (Optimized)...")

    if df_sessionized.empty:
        # No sessions and no pages: return a well-formed empty result instead
        # of failing while assembling the matrix.
        final_output = pd.DataFrame(columns=['IP'])
    else:
        final_output = _build_session_page_matrix(df_sessionized)

    print("\n--- HASIL AKHIR (MATRIKS) ---")
    print(final_output.head().to_string())  # .head() keeps the printout short

    try:
        final_output.to_csv('hasil_matriks_sesiNASA.csv', index=False)
        print("\nMatriks hasil juga disimpan ke 'hasil_matriks_sesiNASA.csv'")
    except Exception as e:
        # Saving is best effort: report the problem and still return the data.
        print(f"\nGagal menyimpan file 'hasil_matriks_sesiNASA.csv': {e}")

    return final_output


def _build_session_page_matrix(df_sessionized):
    """Return the 'IP' + 0/1 URI-column frame for a non-empty sessionized log."""
    session_keys = ['SessionID', 'IP']

    # One row per distinct session, in the order used for the output rows.
    session_info = (
        df_sessionized[session_keys]
        .drop_duplicates()
        .sort_values(by=session_keys)
        .reset_index(drop=True)
    )
    session_info['_row'] = range(len(session_info))

    # Row position of every log line: a left merge keeps the log's row order
    # and matches directly on the (SessionID, IP) pair.
    row_idx = (
        df_sessionized[session_keys]
        .merge(session_info, on=session_keys, how='left')['_row']
        .to_numpy()
    )

    # Column position of every log line, columns in order of first appearance.
    unique_uris = df_sessionized['URI'].unique()
    col_idx = pd.Index(unique_uris).get_indexer(df_sessionized['URI'])

    # Repeated (row, col) entries are summed when converting to CSR, so the
    # counts are clipped back to 0/1 afterwards.
    visit_counts = coo_matrix(
        ([1] * len(row_idx), (row_idx, col_idx)),
        shape=(len(session_info), len(unique_uris))
    ).tocsr()

    # The dense conversion allocates sessions x pages cells.
    matrix = pd.DataFrame(
        (visit_counts.toarray() > 0).astype(int),
        columns=unique_uris
    )
    matrix.insert(0, 'IP', session_info['IP'].to_numpy())
    return matrix

# Removed the main() call from here as it will be orchestrated by new cells
# main()

In [2]:
# Load the raw log; the bare `df_raw` expression shows the frame as cell output.
df_raw = load_data(log_file_path)
df_raw

Membaca log dari NASA_Jul95_cleaned.csv...
Pratinjau data mentah berhasil dibaca.
--------------------------------------------------
üìä Jumlah baris data mentah (Awal): 623161

üèÜ Top 10 IP dengan akses terbanyak:
IP
piweba3y.prodigy.com    7128
alyssa.prodigy.com      5174
piweba1y.prodigy.com    3530
www-b6.proxy.aol.com    2439
disarray.demon.co.uk    2312
piweba2y.prodigy.com    2179
news.ti.com             1875
piweba4y.prodigy.com    1691
www-d4.proxy.aol.com    1632
www-d1.proxy.aol.com    1592
Name: count, dtype: int64

‚ö†Ô∏è Jumlah Unik IP: 38410
‚ö†Ô∏è Jumlah IP yang 'Duplikat' (akses > 1 kali): 35043
--------------------------------------------------

üîç Mengecek nilai NaN dalam dataset mentah...
Empty DataFrame
Columns: [NaN Count, NaN Percentage]
Index: []
Total NaN values in raw data: 0
--------------------------------------------------


Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
0,199.72.81.55,-,-,1995-07-01T00:00:01Z,GET,/history/apollo/,HTTP/1.0,200,6245
1,unicomp6.unicomp.net,-,-,1995-07-01T00:00:06Z,GET,/shuttle/countdown/,HTTP/1.0,200,3985
2,199.120.110.21,-,-,1995-07-01T00:00:09Z,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
3,burger.letters.com,-,-,1995-07-01T00:00:11Z,GET,/shuttle/countdown/liftoff.html,HTTP/1.0,304,0
4,199.120.110.21,-,-,1995-07-01T00:00:11Z,GET,/shuttle/missions/sts-73/sts-73-patch-small.gif,HTTP/1.0,200,4179
...,...,...,...,...,...,...,...,...,...
623156,disarray.demon.co.uk,-,-,1995-07-09T14:20:42Z,GET,/images/ksclogo-medium.gif,HTTP/1.0,200,5866
623157,nowaksg.chem.nd.edu,-,-,1995-07-09T14:20:42Z,GET,/images/WORLD-logosmall.gif,HTTP/1.0,200,669
623158,disarray.demon.co.uk,-,-,1995-07-09T14:20:52Z,GET,/images/NASA-logosmall.gif,HTTP/1.0,200,786
623159,disarray.demon.co.uk,-,-,1995-07-09T14:20:52Z,GET,/images/MOSAIC-logosmall.gif,HTTP/1.0,304,0


In [3]:
# Filter the raw log with preprocess_data and display the cleaned frame.
df_cleaned = preprocess_data(df_raw)
df_cleaned

Memulai preprocessing (filter GET, 200, dan aset)...
Mengonversi waktu (dengan format ISO8601)...
Data setelah preprocessing (sebelum pengelompokan sesi):
                           IP Logname User                      Time Method                                           URI  Protocol  Status   Size
2              199.120.110.21       -    - 1995-07-01 00:00:09+00:00    GET  /shuttle/missions/sts-73/mission-sts-73.html  HTTP/1.0     200   4085
7             205.212.115.106       -    - 1995-07-01 00:00:12+00:00    GET             /shuttle/countdown/countdown.html  HTTP/1.0     200   3985
18  ppptky391.asahi-net.or.jp       -    - 1995-07-01 00:00:18+00:00    GET                         /facts/about_ksc.html  HTTP/1.0     200   3977
22   waters-gw.starway.net.au       -    - 1995-07-01 00:00:25+00:00    GET      /shuttle/missions/51-l/mission-51-l.html  HTTP/1.0     200   6723
37     gayle-gaston.tenet.edu       -    - 1995-07-01 00:00:50+00:00    GET  /shuttle/missions/sts-71/mission-

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size
2,199.120.110.21,-,-,1995-07-01 00:00:09+00:00,GET,/shuttle/missions/sts-73/mission-sts-73.html,HTTP/1.0,200,4085
7,205.212.115.106,-,-,1995-07-01 00:00:12+00:00,GET,/shuttle/countdown/countdown.html,HTTP/1.0,200,3985
18,ppptky391.asahi-net.or.jp,-,-,1995-07-01 00:00:18+00:00,GET,/facts/about_ksc.html,HTTP/1.0,200,3977
22,waters-gw.starway.net.au,-,-,1995-07-01 00:00:25+00:00,GET,/shuttle/missions/51-l/mission-51-l.html,HTTP/1.0,200,6723
37,gayle-gaston.tenet.edu,-,-,1995-07-01 00:00:50+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12040
...,...,...,...,...,...,...,...,...,...
623139,slip-3-4.ots.utexas.edu,-,-,1995-07-09 14:20:22+00:00,GET,/history/history.html,HTTP/1.0,200,1602
623142,ip43.herndon2.va.interramp.com,-,-,1995-07-09 14:20:26+00:00,GET,/history/history.html,HTTP/1.0,200,1602
623144,150.253.53.53,-,-,1995-07-09 14:20:31+00:00,GET,/shuttle/missions/sts-32/mission-sts-32.html,HTTP/1.0,200,5463
623145,disarray.demon.co.uk,-,-,1995-07-09 14:20:31+00:00,GET,/ksc.html,HTTP/1.0,200,7067


In [4]:
# Assign session IDs using the module-level timeout and display the result.
df_session = sessionize_data(df_cleaned, SESSION_TIMEOUT)
df_session

Mengelompokkan sesi (timeout: 0:30:00)...

--- PRATINJAU LOG DENGAN SESI ---
        SessionID                               IP                      Time                                                     URI  Status
452773          1                   007.thegap.com 1995-07-06 17:24:28+00:00            /shuttle/missions/sts-71/mission-sts-71.html     200
452946          1                   007.thegap.com 1995-07-06 17:26:24+00:00  /shuttle/missions/sts-71/sts-71-day-09-highlights.html     200
453128          1                   007.thegap.com 1995-07-06 17:28:35+00:00            /shuttle/missions/sts-71/mission-sts-71.html     200
454019          1                   007.thegap.com 1995-07-06 17:37:44+00:00             /shuttle/missions/sts-71/images/images.html     200
462535          2                   007.thegap.com 1995-07-06 19:23:26+00:00                            /shuttle/countdown/tour.html     200
462738          2                   007.thegap.com 1995-07-06 19:26:24+00:00 

Unnamed: 0,IP,Logname,User,Time,Method,URI,Protocol,Status,Size,SessionID
452773,007.thegap.com,-,-,1995-07-06 17:24:28+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12722,1
452946,007.thegap.com,-,-,1995-07-06 17:26:24+00:00,GET,/shuttle/missions/sts-71/sts-71-day-09-highlig...,HTTP/1.0,200,6929,1
453128,007.thegap.com,-,-,1995-07-06 17:28:35+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12722,1
454019,007.thegap.com,-,-,1995-07-06 17:37:44+00:00,GET,/shuttle/missions/sts-71/images/images.html,HTTP/1.0,200,7634,1
462535,007.thegap.com,-,-,1995-07-06 19:23:26+00:00,GET,/shuttle/countdown/tour.html,HTTP/1.0,200,4347,2
...,...,...,...,...,...,...,...,...,...,...
593333,zzmsowte.slip.cc.uq.oz.au,-,-,1995-07-08 17:01:14+00:00,GET,/shuttle/missions/sts-71/sts-71-day-09-highlig...,HTTP/1.0,200,6949,45612
593370,zzmsowte.slip.cc.uq.oz.au,-,-,1995-07-08 17:02:00+00:00,GET,/shuttle/missions/sts-71/sts-71-day-10-highlig...,HTTP/1.0,200,6743,45612
593425,zzmsowte.slip.cc.uq.oz.au,-,-,1995-07-08 17:03:03+00:00,GET,/shuttle/missions/sts-71/sts-71-day-11-highlig...,HTTP/1.0,200,6540,45612
4241,zzsbtafe.slip.cc.uq.oz.au,-,-,1995-07-01 01:13:28+00:00,GET,/shuttle/missions/sts-71/mission-sts-71.html,HTTP/1.0,200,12040,45613


In [5]:
# Build the Session x Page matrix from the sessionized log and display it.
final_matrix = create_matrix(df_session)
final_matrix


Membuat matriks Sesi x Halaman (Optimized)...

--- HASIL AKHIR (MATRIKS) ---
                                IP  /shuttle/missions/sts-71/mission-sts-71.html  /shuttle/missions/sts-71/sts-71-day-09-highlights.html  /shuttle/missions/sts-71/images/images.html  /shuttle/countdown/tour.html  /facilities/mila.html  /shuttle/technology/sts-newsref/sts-jsc.html  /shuttle/missions/sts-71/movies/movies.html  /shuttle/missions/sts-68/ksc-srl-image.html  /shuttle/missions/sts-70/mission-sts-70.html  /shuttle/resources/orbiters/discovery.html  /ksc.html  /shuttle/missions/sts-71/sts-71-day-07-highlights.html  /shuttle/missions/missions.html  /shuttle/missions/sts-71/sts-71-day-08-highlights.html  /shuttle/missions/100th.html  /shuttle/resources/orbiters/challenger.html  /shuttle/missions/51-l/mission-51-l.html  /shuttle/missions/sts-61/mission-sts-61.html  /shuttle/missions/sts-67/mission-sts-67.html  /shuttle/missions/sts-63/sts-63-info.html  /shuttle/countdown/liftoff.html  /history/apollo/apo

Unnamed: 0,IP,/shuttle/missions/sts-71/mission-sts-71.html,/shuttle/missions/sts-71/sts-71-day-09-highlights.html,/shuttle/missions/sts-71/images/images.html,/shuttle/countdown/tour.html,/facilities/mila.html,/shuttle/technology/sts-newsref/sts-jsc.html,/shuttle/missions/sts-71/movies/movies.html,/shuttle/missions/sts-68/ksc-srl-image.html,/shuttle/missions/sts-70/mission-sts-70.html,...,/facts/internet/html-primer.html,/software/webadmin/faq.html,/shuttle/technology/sts-newsref/pam-d.html,/msfc/description/instruments/hut-tech.html,/statistics/1994/Oct/Oct94_reverse_domains.html,/statistics/1994/Apr/Apr94.html,/statistics/1994/Feb/Feb94_reverse_domains.html,/msfc/visitor/visitors.html,/msfc/description/instruments/uit-tech.html,/statistics/1994/May/May94_reverse_domains.html
0,007.thegap.com,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,007.thegap.com,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,01-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,02-dynamic-c.wokingham.luna.net,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,03-dynamic-c.wokingham.luna.net,1,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45608,zygos.demon.co.uk,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45609,zzgograd.slip.cc.uq.oz.au,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45610,zzmsowte.slip.cc.uq.oz.au,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45611,zzmsowte.slip.cc.uq.oz.au,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Untuk mengonfirmasi hal ini, mari kita periksa jumlah URI unik di `df_session`.

In [6]:
# Print the number of distinct URIs in the sessionized log.
print(f"Jumlah URI unik: {df_session['URI'].nunique()}")

Jumlah URI unik: 630
