# App Type Classifier with fastText embeddings + LightGBM

This notebook builds a multi-class model to detect whether a process/app is a **Game**, **Browser**, or another category using the last column `label`.

**Pipeline overview**
1. Load the CSV (strings + numerics).
2. Concatenate all **string columns** into a single text field.
3. Train **fastText** (unsupervised) on the corpus and obtain a dense sentence embedding per row.
4. Combine the fastText embedding with the numeric features.
5. Train a **LightGBM** classifier and evaluate with F1/precision/recall and a confusion matrix.
6. Save the fastText model, LightGBM model, and preprocessing artifacts.

> **Note:** If `fasttext` installation fails on your platform, switch to the fallback TF-IDF baseline cell near the end.

In [None]:
%%python --version
# Install dependencies (run once per environment)
%pip -q install lightgbm scikit-learn pandas matplotlib seaborn joblib
# For windows fasttext_win for Linux fasttext
%pip install fasttext_win

# In a notebook cell or terminal
#%pip install fasttext-wheel

In [None]:
import os, re, json, math, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import joblib
# Silence every warning category for the rest of the session, then fix the
# global NumPy seed so splits and sampling are repeatable.
warnings.simplefilter('ignore')
SEED = 42
np.random.seed(seed=SEED)

## 1) Load data

In [None]:
# Path to your CSV (update if you renamed/moved it)
CSV_PATH = '27112025/Data.csv'

# Robust CSV load: the python engine is used and malformed lines are skipped
# instead of aborting the whole load.
kwargs = dict(engine='python')
try:
    df = pd.read_csv(CSV_PATH, on_bad_lines='skip', **kwargs)
except TypeError:
    # pandas<1.3 doesn't support on_bad_lines; use error_bad_lines=False
    df = pd.read_csv(CSV_PATH, error_bad_lines=False, **kwargs)

print(df.shape)
# Strip header whitespace first so a column exported as ' PID' is still matched.
df.columns = [c.strip() for c in df.columns]
# PID is excluded from the features; errors='ignore' keeps the cell usable on
# exports that do not contain the column at all.
df = df.drop(columns=['PID'], errors='ignore')
print(df.columns)
# Keep the preview as the last expression so the notebook actually renders it.
df.head(3)

## 2) Column typing: text vs numeric
We treat string (`object` dtype) columns, except the label, as text. All remaining non-label columns are coerced to numeric; values that cannot be parsed become NaN.

In [None]:
# Clean column names
df.columns = [c.strip() for c in df.columns]
target_col = "label"
assert target_col in df.columns, "Expected a column named 'label'"

# Columns that are never features: the target, and the '__text__' helper column
# that a later cell adds to df (excluding it keeps this cell safe to re-run).
reserved_cols = {target_col, '__text__'}

# Identify text vs numeric columns by dtype.
# Both the classic `object` dtype and pandas' dedicated string dtype count as
# text; checking only `dtype == 'object'` would send string-dtype columns down
# the numeric path, where they would be coerced to all-NaN.
all_cols = df.columns.tolist()
text_cols = [
    c for c in all_cols
    if c not in reserved_cols
    and (pd.api.types.is_object_dtype(df[c]) or isinstance(df[c].dtype, pd.StringDtype))
]
# Attempt to coerce all non-text, non-target columns to numeric
num_candidate_cols = [c for c in all_cols if c not in text_cols and c not in reserved_cols]
for c in num_candidate_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')
numeric_cols = [c for c in num_candidate_cols if pd.api.types.is_numeric_dtype(df[c])]

print('Text columns ({}):'.format(len(text_cols)), text_cols[:10], '...')
print('Numeric columns ({}):'.format(len(numeric_cols)), numeric_cols[:10], '...')
print('Target distribution:')
print(df[target_col].value_counts(dropna=False))

## 3) Build a single text field and train fastText
We concatenate all string columns into one field and train a **fastText unsupervised** model (`skipgram`).
Then, for each row, we obtain a dense sentence vector (`dim=128` in this notebook; fastText's own default is 100).

In [None]:
# Concatenate text columns
def _norm_text(x):
    if pd.isna(x):
        return ''
    s = str(x)
    #s = s.replace('\n', ' ').replace('\t', ' ')
    #s = re.sub(r'[^A-Za-z0-9_:\-]+', ' ', s)
    return s.lower()

df['__text__'] = df[text_cols].astype(str).apply(lambda r: ' '.join(_norm_text(v) for v in r.values), axis=1)

# Write corpus for fastText
corpus_path = 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as f:
    for line in df['__text__'].tolist():
        f.write(line.strip() + '\n')

%pip install fasttext-wheel

import fasttext
ft_dim = 128
ft_model = fasttext.train_unsupervised(corpus_path, model='skipgram', dim=ft_dim, epoch=10, minn=2, maxn=5, lr=0.05, thread=4)
#min
ft_model.save_model('fasttext_model.bin')
print('fastText trained. dim=', ft_dim)

In [None]:
# Build sentence embeddings for each row
def sent_vec(s: str):
    """Return the fastText sentence vector for one text string.

    Newlines are replaced with spaces first because
    ``get_sentence_vector`` only accepts a single line and raises
    ``ValueError`` when the text contains a newline character.
    """
    return ft_model.get_sentence_vector(s.replace('\n', ' '))

# One sentence vector per row, stacked into a (rows x dim) matrix and wrapped
# in a DataFrame that shares df's index so it can be concatenated later.
row_vectors = [sent_vec(text) for text in df['__text__'].to_numpy()]
emb = np.vstack(row_vectors)
ft_columns = ['ft_' + str(i) for i in range(emb.shape[1])]
emb_df = pd.DataFrame(emb, index=df.index, columns=ft_columns)
print(emb_df.shape)
emb_df.head()

## 4) Assemble features + target

In [None]:
# Numeric block: selected numeric columns, or an empty frame on df's index
# when there are none. Missing values are replaced with 0.
if numeric_cols:
    X_num = df[numeric_cols].copy()
else:
    X_num = pd.DataFrame(index=df.index)
X_num = X_num.fillna(0)

# Feature matrix = numeric columns followed by the fastText dimensions.
X = pd.concat([X_num, emb_df], axis=1)
y_raw = df[target_col].astype(str).values

# Integer-encode the string labels; `classes` keeps the decoded names in order.
le = LabelEncoder()
y = le.fit_transform(y_raw)
classes = list(le.classes_)
print('Classes:', classes)

# 80/20 split. Stratify only when every class has at least two samples,
# since a stratified split is impossible for singleton classes.
split_kwargs = dict(test_size=0.2, random_state=SEED)
if np.bincount(y).min() >= 2:
    split_kwargs['stratify'] = y
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_train.shape, X_test.shape

## 5) Train LightGBM

In [None]:
import lightgbm as lgb

# Early stopping needs its own validation data. Stopping on the test set would
# tune the number of trees against the very rows the evaluation cell reports
# on, which makes those metrics optimistic. Carve a validation split out of
# the training data instead; stratify only when every class has >= 2 samples.
_val_stratify = y_train if np.min(np.bincount(y_train)) >= 2 else None
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED, stratify=_val_stratify
)

lgbm = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(classes),
    n_estimators=1000,          # upper bound; early stopping picks the actual count
    learning_rate=0.05,
    max_depth=-1,
    # Row subsampling is only active when subsample_freq > 0; without it the
    # subsample=0.8 setting is silently ignored by LightGBM.
    subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
    reg_alpha=0.1, reg_lambda=0.1,
    random_state=SEED,
    n_jobs=-1
)

lgbm.fit(
    X_fit, y_fit,
    eval_set=[(X_fit, y_fit), (X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

print('Best iteration:', lgbm.best_iteration_)

## 6) Evaluation

In [None]:
from sklearn.metrics import precision_recall_fscore_support
y_pred = lgbm.predict(X_test)

print(sorted(set(y_test)))

# Use the full encoded label range for both the report and the confusion
# matrix. Without labels=, confusion_matrix only covers classes that occur in
# y_test or y_pred, so its shape can be smaller than `classes` and the tick
# labels below would be attached to the wrong rows/columns.
label_ids = np.arange(len(classes))

print(classification_report(y_test, y_pred, labels=label_ids, target_names=classes, digits=4, zero_division=0))
macro_f1 = f1_score(y_test, y_pred, average='macro')
print('Macro F1:', round(macro_f1, 4))

cm = confusion_matrix(y_test, y_pred, labels=label_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Show the held-out rows the model got wrong, next to the predicted label.
misclassified_mask = y_test != y_pred
misclassified_indices = X_test.index[misclassified_mask]
wrong_predictions = le.inverse_transform(y_pred[misclassified_mask])
misclassified_data = df.loc[misclassified_indices].copy()
misclassified_data['predicted_label'] = wrong_predictions
print('Misclassified Data:')
preview_cols = ['label', 'predicted_label', *text_cols]
misclassified_data[preview_cols].head(20)

## 7) Feature importance (numeric features only)
Embeddings are many anonymous dimensions; to interpret, we highlight the top original numeric features.

In [None]:
# Gain-based importance for every feature, highest first.
gain_values = lgbm.booster_.feature_importance(importance_type='gain')
feat_imp = pd.DataFrame({'feature': X.columns, 'gain': gain_values})
feat_imp = feat_imp.sort_values('gain', ascending=False)

# Keep only the original (non-embedding) features: embedding columns are the
# ones whose name starts with 'ft_'.
is_embedding = feat_imp['feature'].str.startswith('ft_')
top_num = feat_imp.loc[~is_embedding].head(20)

plt.figure(figsize=(7,6))
sns.barplot(data=top_num, y='feature', x='gain', orient='h', palette='viridis')
plt.title('Top numeric features by gain')
plt.tight_layout()
plt.show()
top_num

## 8) Save artifacts

In [None]:
artifacts_dir = Path('artifacts')
artifacts_dir.mkdir(exist_ok=True)

# The fastText model was already written to fasttext_model.bin when it was trained.
# Save the LightGBM booster in its text format (loadable from the C++ API).
# The path is passed as str so the call also works on LightGBM releases whose
# save_model does not accept pathlib.Path objects.
lgbm.booster_.save_model(str(artifacts_dir / 'lgbm_model.txt'))

# Metadata needed to rebuild the feature vector at inference time:
# class names in encoder order, the column lists, and the embedding width.
meta = {
    'classes': [str(c) for c in classes],
    'text_cols': text_cols,
    'numeric_cols': numeric_cols,
    'embedding_dim': int(emb_df.shape[1])
}
# Use a context manager so the file is flushed and closed deterministically
# (a bare open() inside json.dump leaves the handle open).
with open(artifacts_dir / 'meta.json', 'w', encoding='utf-8') as meta_file:
    json.dump(meta, meta_file)
print('Saved to', artifacts_dir.resolve())

## 9) Inference helper
Provide a function that takes a row (dict or pandas.Series) and returns the predicted label.

In [None]:
def predict_row(row: dict):
    """Predict the label for a single record.

    Parameters
    ----------
    row : dict or pandas.Series
        Mapping from column name to raw value. Anything that is not a dict is
        converted with ``row.to_dict()``. Text columns missing from the mapping
        are treated as empty strings, numeric columns as missing.

    Returns
    -------
    The decoded class label, obtained via ``le.inverse_transform``.
    """
    if not isinstance(row, dict):
        row = row.to_dict()
    # Build text: normalized text-column values joined with spaces.
    text = ' '.join(_norm_text(row.get(c, '')) for c in text_cols)
    # get_sentence_vector raises ValueError on text containing a newline.
    text = text.replace('\n', ' ')
    vec = ft_model.get_sentence_vector(text).reshape(1, -1)
    # Numeric: values that are missing or cannot be parsed become 0.
    xnum = []
    for c in numeric_cols:
        try:
            v = float(row.get(c, np.nan))
        except (TypeError, ValueError, OverflowError):
            v = np.nan
        xnum.append(0.0 if math.isnan(v) else v)
    xnum = np.array(xnum, dtype=float).reshape(1, -1) if len(xnum)>0 else np.zeros((1,0))
    # Feature order is numeric columns first, then the embedding dimensions.
    X_infer = np.hstack([xnum, vec]) if xnum.shape[1]>0 else vec
    pred = lgbm.predict(X_infer)[0]
    return le.inverse_transform([pred])[0]

# Smoke test: draw one held-out row, look up its true label by position in
# the test split, and compare it with what predict_row returns.
sample_idx = X_test.sample(n=1, random_state=SEED).index[0]
sample_pos = X_test.index.tolist().index(sample_idx)
pred_label = predict_row(df.loc[sample_idx])
true_label = le.inverse_transform([y_test[sample_pos]])[0]
print('Pred vs True:', pred_label, ' / ', true_label)

## (Optional) Baseline without fastText: TF-IDF + LightGBM
If fastText is not available on your machine, try this quick baseline.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Imported here as well so this fallback does not depend on the fastText
# training cells having run.
import lightgbm as lgb

# TF-IDF over the concatenated text field (unigrams + bigrams).
# Requires the '__text__' column on df.
tfidf = TfidfVectorizer(min_df=2, max_df=0.9, ngram_range=(1,2))
X_tfidf = tfidf.fit_transform(df['__text__'])
Xb = X_tfidf

# Encode the labels locally: the `y` / `classes` variables of the main
# pipeline only exist if the fastText path ran to completion, which is
# exactly the situation this fallback is meant to cover.
le_b = LabelEncoder()
yb = le_b.fit_transform(df[target_col].astype(str).values)
classes_b = list(le_b.classes_)

# Stratify only when every class has at least two samples; a stratified split
# raises otherwise.
stratify_b = yb if np.min(np.bincount(yb)) >= 2 else None
Xb_train, Xb_test, yb_train, yb_test = train_test_split(Xb, yb, test_size=0.2, random_state=SEED, stratify=stratify_b)
lgbm_b = lgb.LGBMClassifier(objective='multiclass', num_class=len(classes_b), n_estimators=500, learning_rate=0.1, random_state=SEED)
lgbm_b.fit(Xb_train, yb_train)
pred_b = lgbm_b.predict(Xb_test)
# Pass the full label range so the report does not fail when a class is
# missing from the test split.
print(classification_report(yb_test, pred_b, labels=np.arange(len(classes_b)), target_names=classes_b, digits=4, zero_division=0))