# TikTok Data Solution

This notebook reproduces the key calculations used in the report/presentation, based on the dataset in `data/tiktok_travel_visa_dataset.csv`.

It outputs summary tables to `finalreport/summary_tables/` that are used to create charts (bar charts, line charts, etc.) in Excel or in Python later.

## Outputs

### EDA / descriptive results
- `hook_distribution.csv`
- `engagement_by_hook.csv`
- `duration_band_performance.csv`
- `category_engagement.csv`
- `trending_sound_lift.csv`

### Performance labels / class balance
- `class_distribution.csv`

### Engineered features
- `engineered_features_dataset.csv`

### Model outputs (Random Forest)
- `model_accuracy_random_forest.csv`
- `model_performance_metrics_random_forest.csv`
- `confusion_matrix_random_forest.csv`
- `feature_importance_random_forest.csv`

### Combined Excel workbook (optional)
- `reproducible_summary_tables.xlsx`


## Environment setup (run once)

If you see errors like `ModuleNotFoundError: No module named 'sklearn'`, it means the packages are not installed in the **current notebook kernel**.

1. Run the install cell below.
2. Restart the kernel.
3. Run the notebook from the top.


In [23]:
import sys

# Show which interpreter backs this kernel, so it is clear where packages get installed.
print('Kernel Python:', sys.executable)

# Install packages into THIS kernel's environment by invoking pip through the
# kernel's own interpreter (sys.executable) rather than whichever `pip` is on PATH.
# `-U` upgrades to the newest available release; no versions are pinned here.
!{sys.executable} -m pip install -U pip
!{sys.executable} -m pip install -U scikit-learn numpy pandas matplotlib seaborn openpyxl


Kernel Python: /usr/local/bin/python3.10


In [25]:
# Verify installs (run after restarting the kernel)

import numpy as np
import pandas as pd
import sklearn

# Print the installed versions; reaching this point means the imports above succeeded.
print('numpy:', np.__version__)
print('pandas:', pd.__version__)
print('scikit-learn:', sklearn.__version__)


numpy: 2.2.6
pandas: 2.3.3
scikit-learn: 1.7.2


In [26]:
import csv
import os
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, Iterable, List, Tuple

# Optional Excel export (if openpyxl is installed)
try:
    from openpyxl import Workbook
except Exception:
    Workbook = None


def find_project_root(start: Path) -> Path:
    """Locate the project root: the nearest directory that holds the dataset.

    The resolved ``start`` directory is inspected first, followed by each of
    its ancestors in turn. At most ten directories are inspected in total.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The first inspected directory containing
        ``data/tiktok_travel_visa_dataset.csv``.

    Raises:
        FileNotFoundError: If none of the inspected directories contains the
            dataset file.
    """
    dataset_rel = Path('data') / 'tiktok_travel_visa_dataset.csv'
    origin = start.resolve()
    # The starting directory itself, then its ancestors, capped at ten candidates.
    candidates = [origin, *origin.parents][:10]
    for directory in candidates:
        if (directory / dataset_rel).exists():
            return directory
    raise FileNotFoundError(
        f"Could not find project root containing {dataset_rel}. "
        f"Current working directory: {Path.cwd()}"
    )


# Resolve every path relative to the detected project root, so the notebook does
# not depend on which sub-directory the kernel was started in.
PROJECT_ROOT = find_project_root(Path.cwd())
DATASET_CSV = PROJECT_ROOT / 'data' / 'tiktok_travel_visa_dataset.csv'
OUTPUT_DIR = PROJECT_ROOT / 'finalreport' / 'summary_tables'
OUTPUT_XLSX = OUTPUT_DIR / 'reproducible_summary_tables.xlsx'
# Create the output directory (and missing parents) up front; no error if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved configuration so a reader can confirm what will be read and written.
print('Project root:', PROJECT_ROOT)
print('Dataset:', DATASET_CSV)
print('Output dir:', OUTPUT_DIR)
# Workbook is None when the optional openpyxl import failed.
print('Excel export:', 'enabled' if Workbook is not None else 'disabled (install openpyxl)')

Project root: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project
Dataset: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/data/tiktok_travel_visa_dataset.csv
Output dir: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables
Excel export: enabled


In [27]:
@dataclass
class Row:
    """One video record holding the dataset columns used by the summary tables.

    Field names mirror the CSV column names that ``load_rows`` reads.
    """

    # Grouping keys, stored as whitespace-stripped text.
    niche_category: str
    hook_style: str
    # Compared against 1 and 0 when computing the trending-sound lift.
    trending_sound_used: int
    # Input to duration_band().
    video_duration_seconds: float
    # Denominator of both engagement-rate formulas.
    views: int
    # Interaction counts summed into the engagement-rate numerators.
    likes: int
    comments: int
    shares: int
    # Only included in the "with saves" engagement rate.
    saves_or_favorites: int


def _to_int(value: str) -> int:
    s = (value or '').strip().replace(',', '')
    if not s:
        return 0
    return int(float(s))


def _to_float(value: str) -> float:
    s = (value or '').strip().replace(',', '')
    if not s:
        return 0.0
    return float(s)


def load_rows(csv_path: os.PathLike) -> List[Row]:
    """Read the dataset CSV (UTF-8) and return one ``Row`` per data line.

    Columns are looked up by header name. A missing or empty text cell becomes
    ``''``; a missing or empty numeric cell is treated as ``'0'`` before being
    parsed with ``_to_int`` / ``_to_float``.
    """

    def text(record, column):
        # Missing/None cells collapse to an empty string.
        return (record.get(column) or '').strip()

    def whole(record, column):
        return _to_int(record.get(column) or '0')

    def real(record, column):
        return _to_float(record.get(column) or '0')

    with open(csv_path, newline='', encoding='utf-8') as handle:
        return [
            Row(
                niche_category=text(record, 'niche_category'),
                hook_style=text(record, 'hook_style'),
                trending_sound_used=whole(record, 'trending_sound_used'),
                video_duration_seconds=real(record, 'video_duration_seconds'),
                views=whole(record, 'views'),
                likes=whole(record, 'likes'),
                comments=whole(record, 'comments'),
                shares=whole(record, 'shares'),
                saves_or_favorites=whole(record, 'saves_or_favorites'),
            )
            for record in csv.DictReader(handle)
        ]


# Parse the dataset once into Row objects; sections 1-5 below are computed from this list.
rows = load_rows(DATASET_CSV)
print('Loaded rows:', len(rows))
# Display the first record as a quick sanity check of the parsing.
rows[0]

Loaded rows: 520


Row(niche_category='Travel Hacks', hook_style='Question', trending_sound_used=0, video_duration_seconds=11.23, views=5856, likes=195, comments=42, shares=36, saves_or_favorites=14)

## Helper functions
We compute engagement rate in two ways:
- **No saves**: `(likes + comments + shares) / views` (matches the formula used in the slides)
- **With saves**: `(likes + comments + shares + saves) / views` (if you want to include saves)


In [28]:
def er_no_saves(r: Row) -> float:
    """Engagement rate without saves: ``(likes + comments + shares) / views``.

    Returns ``0.0`` when ``views`` is zero or negative, so no division by zero
    can occur.
    """
    if r.views <= 0:
        return 0.0
    interactions = r.likes + r.comments + r.shares
    return interactions / r.views


def er_with_saves(r: Row) -> float:
    """Engagement rate including saves:
    ``(likes + comments + shares + saves_or_favorites) / views``.

    Returns ``0.0`` when ``views`` is zero or negative, so no division by zero
    can occur.
    """
    if r.views <= 0:
        return 0.0
    interactions = r.likes + r.comments + r.shares + r.saves_or_favorites
    return interactions / r.views


def duration_band(seconds: float) -> str:
    """Map a video duration in seconds to its band label.

    Boundaries:
        ``under_15``: seconds < 15
        ``15_30``:    15 <= seconds < 30
        ``30_45``:    30 <= seconds <= 45
        ``45_60``:    45 < seconds <= 60
        ``over_60``:  everything else

    A NaN duration matches none of the comparisons and therefore also lands
    in ``over_60``.
    """
    # (label, upper bound, whether the upper bound itself belongs to the band)
    bands = (
        ('under_15', 15, False),
        ('15_30', 30, False),
        ('30_45', 45, True),
        ('45_60', 60, True),
    )
    for label, upper, inclusive in bands:
        if seconds < upper or (inclusive and seconds == upper):
            return label
    return 'over_60'


def mean(values: Iterable[float]) -> float:
    """Arithmetic mean of ``values``; ``0.0`` when there are no values.

    The iterable is materialised once, so generators are accepted.
    """
    items = list(values)
    if not items:
        return 0.0
    return sum(items) / len(items)


def grouped_counts(rows: List[Row], key_fn) -> Dict[str, int]:
    """Count how many rows fall under each key produced by ``key_fn``.

    Returns a plain ``dict`` whose keys appear in first-seen order.
    """
    tally: Dict[str, int] = {}
    for item in rows:
        label = key_fn(item)
        tally[label] = tally.get(label, 0) + 1
    return tally


def grouped_means(rows: List[Row], group_key_fn, value_fn) -> Dict[str, float]:
    """Average ``value_fn(row)`` within each group identified by ``group_key_fn(row)``.

    Returns a ``dict`` mapping each group key (in first-seen order) to the
    mean of that group's values.
    """
    grouped: Dict[str, List[float]] = {}
    for item in rows:
        label = group_key_fn(item)
        grouped.setdefault(label, []).append(value_fn(item))
    return {label: mean(samples) for label, samples in grouped.items()}


## 1) Hook distribution
Counts and percentages of each `hook_style`.


In [29]:
# Count videos per hook style.
hook_counts = grouped_counts(rows, lambda r: r.hook_style)
n = len(rows)  # total number of videos; denominator for the percentages
# One [hook_style, count, percent] row per style, sorted alphabetically by style.
hook_table = [[k, hook_counts[k], round(hook_counts[k] / n * 100, 2)] for k in sorted(hook_counts)]
hook_table[:10]


[['Dramatic', 43, 8.27],
 ['Question', 146, 28.08],
 ['Statistic', 80, 15.38],
 ['Story', 104, 20.0],
 ['Tutorial', 147, 28.27]]

## 2) Engagement rate by hook type
Average engagement rate grouped by `hook_style`.


In [30]:
# Mean per-video engagement rate for each hook style, under both formulas.
hook_er_no = grouped_means(rows, lambda r: r.hook_style, er_no_saves)
hook_er_w = grouped_means(rows, lambda r: r.hook_style, er_with_saves)
# One [hook_style, no-saves %, with-saves %] row per style; rates are scaled from
# fractions to percentages and rounded to two decimals.
hook_er_table = [[k, round(hook_er_no.get(k, 0.0) * 100, 2), round(hook_er_w.get(k, 0.0) * 100, 2)]
                 for k in sorted(set(hook_er_no) | set(hook_er_w))]
hook_er_table


[['Dramatic', 4.39, 4.71],
 ['Question', 5.64, 6.01],
 ['Statistic', 5.39, 5.76],
 ['Story', 5.27, 5.6],
 ['Tutorial', 6.22, 6.67]]

## 3) Video duration vs performance
We bucket video length into bands and compute counts, average views, and average engagement rates.

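Duration bands (boundaries as implemented in `duration_band`):
- under 15s: duration < 15
- 15–30s: 15 ≤ duration < 30
- 30–45s: 30 ≤ duration ≤ 45
- 45–60s: 45 < duration ≤ 60
- over 60s: duration > 60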


In [31]:
# Group every metric by the duration band of each video.
band_counts = grouped_counts(rows, lambda r: duration_band(r.video_duration_seconds))
band_avg_views = grouped_means(rows, lambda r: duration_band(r.video_duration_seconds), lambda r: float(r.views))
band_avg_er_no = grouped_means(rows, lambda r: duration_band(r.video_duration_seconds), er_no_saves)
band_avg_er_w = grouped_means(rows, lambda r: duration_band(r.video_duration_seconds), er_with_saves)

# Fixed display order (shortest to longest) instead of alphabetical sorting.
band_order = ['under_15', '15_30', '30_45', '45_60', 'over_60']
# One [band, count, avg views, no-saves %, with-saves %] row per band; bands with
# no videos get zeros via the .get() defaults.
duration_table = [[b, band_counts.get(b, 0), round(band_avg_views.get(b, 0.0), 0),
                   round(band_avg_er_no.get(b, 0.0) * 100, 2),
                   round(band_avg_er_w.get(b, 0.0) * 100, 2)]
                  for b in band_order]
duration_table


[['under_15', 66, 57550.0, 5.07, 5.38],
 ['15_30', 113, 116630.0, 5.65, 6.03],
 ['30_45', 206, 154582.0, 5.92, 6.34],
 ['45_60', 96, 89334.0, 5.36, 5.74],
 ['over_60', 39, 75914.0, 5.03, 5.35]]

## 4) Content category performance
Average engagement rate by `niche_category`.


In [32]:
# Mean per-video engagement rate for each niche category, under both formulas.
cat_avg_er_no = grouped_means(rows, lambda r: r.niche_category, er_no_saves)
cat_avg_er_w = grouped_means(rows, lambda r: r.niche_category, er_with_saves)
# One [niche_category, no-saves %, with-saves %] row per category, sorted alphabetically.
category_table = [[c, round(cat_avg_er_no.get(c, 0.0) * 100, 2), round(cat_avg_er_w.get(c, 0.0) * 100, 2)]
                  for c in sorted(set(cat_avg_er_no) | set(cat_avg_er_w))]
category_table


[['Budget Travel', 5.53, 5.88],
 ['Destination Guides', 5.09, 5.44],
 ['Travel Hacks', 5.65, 6.02],
 ['Visa Application Tips', 5.93, 6.36]]

## 5) Trending sound visibility lift
We compare average views for videos with and without `trending_sound_used = 1`.


In [33]:
# Split view counts by the trending-sound flag. Rows whose flag is neither 1 nor 0
# are left out of both groups.
trend_views = [r.views for r in rows if r.trending_sound_used == 1]
nontrend_views = [r.views for r in rows if r.trending_sound_used == 0]
avg_trend = mean([float(v) for v in trend_views])
avg_non = mean([float(v) for v in nontrend_views])
# Relative difference in average views, in percent; 0.0 when the baseline average
# is zero (avoids dividing by zero).
lift = ((avg_trend / avg_non) - 1.0) * 100 if avg_non else 0.0
# The last row reuses the 'avg_views' column position to carry the lift percentage.
trending_table = [
    ['trending_sound_used=1', len(trend_views), round(avg_trend, 0)],
    ['trending_sound_used=0', len(nontrend_views), round(avg_non, 0)],
    ['lift_percent', '', round(lift, 2)],
]
trending_table


[['trending_sound_used=1', 210, 140559.0],
 ['trending_sound_used=0', 310, 99486.0],
 ['lift_percent', '', 41.29]]

## Solution summary tables
This writes the summary tables to CSV. If `openpyxl` is installed, it will also write one Excel workbook with multiple sheets.


In [34]:
def write_csv(path: str, header: List[str], data: List[List[object]]) -> None:
    """Write ``header`` followed by the ``data`` rows to a UTF-8 CSV file.

    Any existing file at ``path`` is overwritten.
    """
    with open(path, 'w', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerows([header, *data])


def write_xlsx(path: str, sheets: Dict[str, Tuple[List[str], List[List[object]]]]) -> None:
    """Write one Excel workbook with a worksheet per entry of ``sheets``.

    Args:
        path: Destination of the workbook file.
        sheets: Maps each sheet title to ``(header, rows)``. The header is
            written as the first worksheet row, followed by the data rows.
            Sheets are created in the mapping's iteration order.

    Raises:
        RuntimeError: If openpyxl could not be imported (``Workbook`` is ``None``).
    """
    if Workbook is None:
        raise RuntimeError('openpyxl is not installed. Install: python3 -m pip install openpyxl')
    book = Workbook()
    for position, (title, (header, data)) in enumerate(sheets.items()):
        if position == 0:
            # Reuse the worksheet a new workbook already has instead of adding another.
            sheet = book.active
            sheet.title = title
        else:
            sheet = book.create_sheet(title=title)
        for line in (header, *data):
            sheet.append(line)
    book.save(path)


# Write one CSV per summary table into OUTPUT_DIR (existing files are overwritten).
write_csv(os.path.join(OUTPUT_DIR, 'hook_distribution.csv'), ['hook_style', 'count', 'percent'], hook_table)
write_csv(
    os.path.join(OUTPUT_DIR, 'engagement_by_hook.csv'),
    ['hook_style', 'avg_engagement_rate_no_saves_percent', 'avg_engagement_rate_with_saves_percent'],
    hook_er_table,
)
write_csv(
    os.path.join(OUTPUT_DIR, 'duration_band_performance.csv'),
    ['duration_band', 'count', 'avg_views', 'avg_engagement_rate_no_saves_percent', 'avg_engagement_rate_with_saves_percent'],
    duration_table,
)
write_csv(
    os.path.join(OUTPUT_DIR, 'category_engagement.csv'),
    ['niche_category', 'avg_engagement_rate_no_saves_percent', 'avg_engagement_rate_with_saves_percent'],
    category_table,
)
write_csv(os.path.join(OUTPUT_DIR, 'trending_sound_lift.csv'), ['group', 'count', 'avg_views'], trending_table)

# Optional: the same five tables as sheets of a single workbook (requires openpyxl).
# Note: the 'engagement_by_hook' and 'duration_band_perf' sheets use abbreviated
# 'avg_er_*' headers, whereas the CSV files and the 'category_engagement' sheet
# use the long 'avg_engagement_rate_*' names.
if Workbook is not None:
    write_xlsx(
        OUTPUT_XLSX,
        {
            'hook_distribution': (['hook_style', 'count', 'percent'], hook_table),
            'engagement_by_hook': (['hook_style', 'avg_er_no_saves_percent', 'avg_er_with_saves_percent'], hook_er_table),
            'duration_band_perf': (['duration_band', 'count', 'avg_views', 'avg_er_no_saves_percent', 'avg_er_with_saves_percent'], duration_table),
            'category_engagement': (['niche_category', 'avg_engagement_rate_no_saves_percent', 'avg_engagement_rate_with_saves_percent'], category_table),
            'trending_sound_lift': (['group', 'count', 'avg_views'], trending_table),
        },
    )
    print('Wrote Excel workbook:', OUTPUT_XLSX)

print('Wrote CSV summary tables to:', OUTPUT_DIR)


Wrote Excel workbook: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/reproducible_summary_tables.xlsx
Wrote CSV summary tables to: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables


## 6) Performance classification distribution (High / Medium / Low)
This reproduces the class distribution used for modeling and highlights class imbalance.


In [35]:
import pandas as pd

# Load full dataset for modeling/extra analysis.
# Unlike `rows` (a subset of columns parsed by hand), `df` keeps every CSV column.

df = pd.read_csv(DATASET_CSV)
print('Shape:', df.shape)
df.head(3)

Shape: (520, 24)


Unnamed: 0,short_description,date_posted,post_time_hour_24h,day_of_week,niche_category,content_format,content_format_clean,hook_style,trending_sound_used,video_duration_seconds,...,saves_or_favorites,engagement_rate,performance_label,followers_gained,watch_time_percentage,avg_watch_time_seconds,hashtag_1,hashtag_2,hashtag_3,total_hashtag_count
0,How to find cheaper flights?,12/01/2026,10:13,Monday,Travel Hacks,text_overlay,text_overlay,Question,0,11.23,...,14,0.04901,Medium,0,22.54%,2.92,travelhacks,cheapflights,creatorsearchinsight,3
1,Budget breakdown for a 7-day trip?,19/01/2026,13:37,Monday,Budget Travel,voiceover,voiceover,Question,0,38.56,...,448,0.062649,High,10,11.98%,5.16,travelsavings,traveltips,workvisa,4
2,My experience: Visa interview questions to pre...,20/11/2025,22:30,Thursday,Visa Application Tips,text_overlay,text_overlay,Story,0,23.56,...,316,0.057706,Medium,5,24.62%,4.42,visaguide,immigration,visa,5


In [36]:
# Basic cleaning / type fixes

# Keep the posting time as text; values look like "10:13".
df['post_time_hour_24h'] = df['post_time_hour_24h'].astype(str)

# Convert watch_time_percentage like "28.13%" -> float 28.13.
# pd.to_numeric(errors='coerce') maps missing or unparseable cells to NaN.
# Do not use `.replace('nan', pd.NA).astype(float)` here: as soon as one value is
# missing, the object column holds pd.NA, which cannot be cast to float (TypeError).
if 'watch_time_percentage' in df.columns:
    df['watch_time_percentage_num'] = pd.to_numeric(
        df['watch_time_percentage']
        .astype(str)
        .str.replace('%', '', regex=False)
        .str.strip(),
        errors='coerce',
    ).astype(float)  # keep float64 even when every value happens to be a whole number

# Ensure numeric columns are numeric (unparseable cells become NaN).
num_cols = [
    'video_duration_seconds','views','likes','comments','shares','saves_or_favorites',
    'followers_gained','avg_watch_time_seconds','total_hashtag_count'
]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

print(df.dtypes)

short_description             object
date_posted                   object
post_time_hour_24h            object
day_of_week                   object
niche_category                object
content_format                object
content_format_clean          object
hook_style                    object
trending_sound_used            int64
video_duration_seconds       float64
views                          int64
likes                          int64
comments                       int64
shares                         int64
saves_or_favorites             int64
engagement_rate              float64
performance_label             object
followers_gained               int64
watch_time_percentage         object
avg_watch_time_seconds       float64
hashtag_1                     object
hashtag_2                     object
hashtag_3                     object
total_hashtag_count            int64
watch_time_percentage_num    float64
dtype: object


In [37]:
# Class distribution (performance_label)

# Fail early with a clear message if the target column is absent.
if 'performance_label' not in df.columns:
    raise ValueError('performance_label column not found in dataset')

# dropna=False keeps missing labels as their own row instead of silently omitting them.
class_counts = df['performance_label'].value_counts(dropna=False)
class_perc = (class_counts / len(df) * 100).round(2)

# Both Series share the label index, so they align row by row. If reset_index()
# names the label column 'index' (unnamed index), the rename fixes it; otherwise
# the rename is a no-op.
class_dist = (
    pd.DataFrame({'count': class_counts, 'percent': class_perc})
    .reset_index()
    .rename(columns={'index': 'performance_label'})
)
class_dist

Unnamed: 0,performance_label,count,percent
0,High,260,50.0
1,Medium,259,49.81
2,Low,1,0.19


In [38]:
# Export class distribution (row index omitted from the CSV).

class_dist_path = OUTPUT_DIR / 'class_distribution.csv'
class_dist.to_csv(class_dist_path, index=False)
print('Wrote:', class_dist_path)

Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/class_distribution.csv


## 7) Engineered features analyzed

This section creates engineered features commonly used for TikTok performance modeling.

Examples:
- Engagement rate (with/without saves)
- Share rate, comment rate, save rate
- Duration bands
- Posting time features (hour)
- Trending sound usage


In [39]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Ensure df_engineered exists (in case cells were run out of order).
# NOTE: this block only runs when `df_engineered` is absent from the kernel. If it
# already exists it is reused as-is, even when `df` has changed since it was built;
# restart the kernel (or `del df_engineered`) to force a rebuild.

if 'df_engineered' not in globals():
    import numpy as np
    import pandas as pd

    if 'df' not in globals():
        df = pd.read_csv(DATASET_CSV)

    # Basic type fixes (on a copy, so the frame loaded earlier is not modified in place)
    df = df.copy()
    df['post_time_hour_24h'] = df['post_time_hour_24h'].astype(str)

    # Convert "28.13%" -> 28.13 unless the cleaning cell already produced the column.
    # pd.to_numeric(errors='coerce') maps missing or unparseable cells to NaN.
    # Do not use `.replace('nan', pd.NA).astype(float)` here: with a missing value
    # the object column holds pd.NA, which cannot be cast to float (TypeError).
    if 'watch_time_percentage' in df.columns and 'watch_time_percentage_num' not in df.columns:
        df['watch_time_percentage_num'] = pd.to_numeric(
            df['watch_time_percentage']
            .astype(str)
            .str.replace('%', '', regex=False)
            .str.strip(),
            errors='coerce',
        ).astype(float)  # keep float64 even when every value is a whole number

    num_cols = [
        'video_duration_seconds','views','likes','comments','shares','saves_or_favorites',
        'followers_gained','avg_watch_time_seconds','total_hashtag_count'
    ]
    for c in num_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce')

    df_model = df.copy()

    # Engagement rates. Division by zero views yields +/-inf, which is replaced by NaN.
    df_model['engagement_rate_no_saves'] = (
        (df_model['likes'] + df_model['comments'] + df_model['shares']) / df_model['views']
    ).replace([np.inf, -np.inf], np.nan)

    df_model['engagement_rate_with_saves'] = (
        (df_model['likes'] + df_model['comments'] + df_model['shares'] + df_model['saves_or_favorites']) / df_model['views']
    ).replace([np.inf, -np.inf], np.nan)

    # Rate features: each interaction count per view (inf -> NaN as above)
    df_model['like_rate'] = (df_model['likes'] / df_model['views']).replace([np.inf, -np.inf], np.nan)
    df_model['comment_rate'] = (df_model['comments'] / df_model['views']).replace([np.inf, -np.inf], np.nan)
    df_model['share_rate'] = (df_model['shares'] / df_model['views']).replace([np.inf, -np.inf], np.nan)
    df_model['save_rate'] = (df_model['saves_or_favorites'] / df_model['views']).replace([np.inf, -np.inf], np.nan)

    def parse_hour(v: str):
        """Return the hour from 'HH:MM' text or a bare number; NaN if unparseable."""
        try:
            s = str(v).strip()
            if ':' in s:
                return int(s.split(':', 1)[0])
            return int(float(s))
        except Exception:
            return np.nan

    df_model['post_hour'] = df_model['post_time_hour_24h'].apply(parse_hour)

    def duration_band_str(seconds: float) -> str:
        """Band label for a duration in seconds; 'unknown' when the value is missing."""
        if pd.isna(seconds):
            return 'unknown'
        if seconds < 15:
            return 'under_15'
        if seconds < 30:
            return '15_30'
        if seconds <= 45:
            return '30_45'
        if seconds <= 60:
            return '45_60'
        return 'over_60'

    df_model['duration_band'] = df_model['video_duration_seconds'].apply(duration_band_str)
    # Missing labels become the string 'nan' here; the label-cleaning step further
    # down in this cell maps that string back to NaN.
    df_model['performance_label'] = df_model['performance_label'].astype(str)

    engineered_cols = [
        'performance_label',
        'hook_style','niche_category','content_format_clean','day_of_week',
        'trending_sound_used','video_duration_seconds','duration_band','post_hour',
        'views','likes','comments','shares','saves_or_favorites','followers_gained',
        'avg_watch_time_seconds','watch_time_percentage_num','total_hashtag_count',
        'engagement_rate_no_saves','engagement_rate_with_saves',
        'like_rate','comment_rate','share_rate','save_rate'
    ]

    # Keep only the columns that actually exist in this dataset.
    engineered_cols = [c for c in engineered_cols if c in df_model.columns]
    df_engineered = df_model[engineered_cols].copy()

# Define target

target_col = 'performance_label'

# Clean labels (avoid stray spaces / "nan" strings).
# `np` must already be imported in the kernel here: the import inside the guard
# above only runs when df_engineered had to be built.
y_raw = df_engineered[target_col].astype(str).str.strip()
y_raw = y_raw.replace({'': np.nan, 'nan': np.nan, 'None': np.nan})

# Choose feature columns: every engineered column except the target.
# NOTE(review): this includes views/likes/comments/shares and the engagement/rate
# columns derived from them. If performance_label was assigned from engagement
# (the raw dataset also ships an `engagement_rate` column), these features leak
# the target and would explain a near-perfect accuracy. TODO confirm how the label
# was defined before trusting the model metrics.
exclude_cols = {target_col}
X_raw = df_engineered[[c for c in df_engineered.columns if c not in exclude_cols]].copy()

# Drop rows with missing label
mask = y_raw.notna()
X = X_raw.loc[mask].copy()
y = y_raw.loc[mask].copy()

# If any class has < 2 samples, stratified split will fail.
# Drop those rare-class rows for modeling and report what happened.
# Note: this rebinds `class_counts`, which the class-distribution cell also defines.
class_counts = y.value_counts()
rare_classes = class_counts[class_counts < 2].index.tolist()

if rare_classes:
    print('Warning: dropping rows from rare classes (count < 2):', rare_classes)
    keep_mask = ~y.isin(rare_classes)
    X = X.loc[keep_mask].copy()
    y = y.loc[keep_mask].copy()

print('Modeling rows:', len(y))
print('Class counts:')
print(y.value_counts())

# Separate numeric/categorical: anything without a numeric dtype is treated as categorical.
numeric_features = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical_features = [c for c in X.columns if c not in numeric_features]

# Numeric columns: fill missing values with the column median (no scaling step).
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' avoids an error when a category appears
# at prediction time that was not seen during fitting.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Output column order is numeric features first, then the one-hot columns.
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Stratify only if all remaining classes have >= 2 samples
stratify_arg = y if y.value_counts().min() >= 2 else None
if stratify_arg is None:
    print('Note: stratify disabled because at least one class has < 2 samples.')

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_arg
)

# class_weight='balanced' weights classes inversely to their frequency;
# n_jobs=-1 uses all available CPU cores.
rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)

# Bundle preprocessing and the classifier so both are fitted on training data only.
# The pipeline is only defined here; fitting happens in the training cell.
model = Pipeline(steps=[('preprocess', preprocess), ('rf', rf)])
model

Modeling rows: 519
Class counts:
performance_label
High      260
Medium    259
Name: count, dtype: int64


0,1,2
,steps,"[('preprocess', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,400
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [40]:
# Export engineered dataset used for modeling.
# This writes all of df_engineered, including any rows that were later dropped
# from X / y (missing label or rare class).

# Ensure df_engineered exists before export
if 'df_engineered' not in globals():
    raise NameError("df_engineered is not defined. Run the modeling/feature engineering cell above first.")

engineered_path_csv = OUTPUT_DIR / 'engineered_features_dataset.csv'
df_engineered.to_csv(engineered_path_csv, index=False)
print('Wrote:', engineered_path_csv)

Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/engineered_features_dataset.csv


## 8) Modeling: Random Forest (Best Model)

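This section trains a Random Forest classifier to predict `performance_label` (High/Medium/Low) from engineered features. Classes with fewer than two samples are dropped before the train/test split (the setup cell prints a warning when this happens), so the fitted model may cover fewer classes than listed here.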

### Handling class imbalance
We handle class imbalance using:
- `train_test_split(..., stratify=y)` to preserve label proportions
- `class_weight='balanced'` inside Random Forest to up-weight minority classes


In [41]:
# (Deprecated) This cell used to duplicate the Random Forest setup.
# To avoid NameError / out-of-order execution issues, please use the Random Forest cell above.
#
# If you need to re-run the model, re-run:
# - Engineered features + Random Forest setup cell
# - Training cell
# - Metrics/feature-importance export cells

print('Skip: duplicate Random Forest setup cell (use the one above).')

Skip: duplicate Random Forest setup cell (use the one above).


In [42]:
# Train the model (fits the preprocessing steps and the forest on the training split only)

model.fit(X_train, y_train)

# Predict on the held-out test split

y_pred = model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
acc = accuracy_score(y_test, y_pred)
print('Random Forest accuracy:', round(acc, 4))

Random Forest accuracy: 1.0


In [43]:
# Class-wise performance and confusion matrix

# Fixed label order, reused for the confusion-matrix rows and columns.
labels_sorted = sorted(y.unique())

# output_dict=True returns the report as nested dicts; zero_division=0 reports 0
# (instead of warning) for a metric whose denominator is zero.
report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
# Transpose so each label / average becomes a row, and move the index into a 'label' column.
report_df = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'label'})
report_df

Unnamed: 0,label,precision,recall,f1-score,support
0,High,1.0,1.0,1.0,52.0
1,Medium,1.0,1.0,1.0,52.0
2,accuracy,1.0,1.0,1.0,1.0
3,macro avg,1.0,1.0,1.0,104.0
4,weighted avg,1.0,1.0,1.0,104.0


In [44]:
# Rows are true labels, columns are predicted labels, both in `labels_sorted` order.
cm = confusion_matrix(y_test, y_pred, labels=labels_sorted)
cm_df = pd.DataFrame(cm, index=[f"true_{l}" for l in labels_sorted], columns=[f"pred_{l}" for l in labels_sorted])
cm_df

Unnamed: 0,pred_High,pred_Medium
true_High,52,0
true_Medium,0,52


## 9) Feature Importance Analysis

We extract feature importances from the trained Random Forest and export them for the report.


In [45]:
# Extract feature names after preprocessing and pair with importances

# Fitted steps of the trained pipeline.
pre = model.named_steps['preprocess']
rf_est = model.named_steps['rf']

# Names are rebuilt by hand in the same order the ColumnTransformer emits columns:
# numeric features first, then the one-hot encoded categorical columns.
feature_names = []

# Numeric feature names
feature_names.extend(numeric_features)

# Categorical feature names (from OneHotEncoder)
if categorical_features:
    ohe = pre.named_transformers_['cat'].named_steps['onehot']
    ohe_feature_names = list(ohe.get_feature_names_out(categorical_features))
    feature_names.extend(ohe_feature_names)

importances = rf_est.feature_importances_

# pd.DataFrame raises if the two sequences differ in length, so a mismatch between
# the rebuilt names and the model's inputs surfaces here rather than silently.
fi = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

fi.head(20)

Unnamed: 0,feature,importance
0,engagement_rate_with_saves,0.379487
1,engagement_rate_no_saves,0.164595
2,like_rate,0.142557
3,save_rate,0.053004
4,hook_style_Tutorial,0.033096
5,comment_rate,0.028345
6,share_rate,0.02699
7,saves_or_favorites,0.017314
8,likes,0.017153
9,comments,0.015586


In [47]:
# Export model results (each file in OUTPUT_DIR is overwritten if it exists)

# Per-class precision/recall/F1/support plus the aggregate rows.
metrics_path = OUTPUT_DIR / 'model_performance_metrics_random_forest.csv'
report_df.to_csv(metrics_path, index=False)
print('Wrote:', metrics_path)

# reset_index() moves the true_* row labels into a regular column so they are kept
# despite index=False.
cm_path = OUTPUT_DIR / 'confusion_matrix_random_forest.csv'
cm_df.reset_index().to_csv(cm_path, index=False)
print('Wrote:', cm_path)

# Single-row table holding the overall test accuracy.
acc_path = OUTPUT_DIR / 'model_accuracy_random_forest.csv'
pd.DataFrame([{'model': 'Random Forest', 'accuracy': acc}]).to_csv(acc_path, index=False)
print('Wrote:', acc_path)

# All features with their importances, sorted from most to least important.
fi_path = OUTPUT_DIR / 'feature_importance_random_forest.csv'
fi.to_csv(fi_path, index=False)
print('Wrote:', fi_path)

Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/model_performance_metrics_random_forest.csv
Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/confusion_matrix_random_forest.csv
Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/model_accuracy_random_forest.csv
Wrote: /Users/richarddanquah/Desktop/Tuc/data analysis/Tiktok-Data-Analysis-Project/finalreport/summary_tables/feature_importance_random_forest.csv
