<a href="https://colab.research.google.com/github/prashanth741/NLP-LAB/blob/main/nlp-project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from IPython.display import HTML

html_content = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width,initial-scale=1" />
  <title>Char-CNN + FastText — Demo & Guide</title>
  <style>
    /* Design tokens shared by the rules below. */
    :root {
      --bg: #0f1724;
      --card: #0b1220;
      --muted: #9aa6b2;
      --accent: #4f46e5;
      --glass: rgba(255,255,255,0.03);
    }

    /* Page canvas. */
    html,
    body {
      height: 100%;
      margin: 0;
      font-family: Inter,system-ui,Segoe UI,Roboto,Helvetica,Arial,sans-serif;
      background: linear-gradient(180deg,#071024 0%, #071b2b 100%);
      color: #e6eef6;
    }

    /* Centered outer card that holds the whole page. */
    .wrap {
      max-width: 1100px;
      margin: 36px auto;
      padding: 28px;
      background: var(--card);
      border-radius: 12px;
      box-shadow: 0 8px 40px rgba(2,6,23,0.6);
    }

    /* Banner. */
    header {
      display: flex;
      align-items: center;
      gap: 18px;
    }
    h1 {
      margin: 0;
      font-size: 22px;
    }
    p.lead {
      margin: 6px 0 18px;
      color: var(--muted);
    }

    /* Tab bar. */
    nav {
      display: flex;
      gap: 8px;
      flex-wrap: wrap;
      margin-bottom: 18px;
    }
    button.tab {
      background: var(--glass);
      border: 1px solid rgba(255,255,255,0.04);
      color: var(--muted);
      padding: 8px 12px;
      border-radius: 8px;
      cursor: pointer;
    }
    button.tab.active {
      background: linear-gradient(90deg,var(--accent),#06b6d4);
      color: white;
      border: 0;
    }

    /* Content layout: single column by default (widened by the media query at the end). */
    .grid {
      display: grid;
      grid-template-columns: 1fr;
      gap: 18px;
    }
    .card {
      background: linear-gradient(180deg,rgba(255,255,255,0.02),transparent);
      padding: 18px;
      border-radius: 10px;
      border: 1px solid rgba(255,255,255,0.03);
    }

    /* Code listings. */
    pre {
      background: #041024;
      padding: 12px;
      border-radius: 8px;
      overflow: auto;
      color: #dbeafe;
    }
    code {
      font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, "Roboto Mono", "Helvetica Neue", monospace;
      font-size: 13px;
    }

    /* Demo form controls and result box. */
    label {
      display: block;
      margin-bottom: 6px;
      color: var(--muted);
      font-size: 13px;
    }
    .demo-input {
      display: flex;
      gap: 8px;
    }
    input[type=text],
    textarea {
      width: 100%;
      padding: 10px;
      border-radius: 8px;
      border: 1px solid rgba(255,255,255,0.04);
      background: transparent;
      color: inherit;
    }
    .result {
      padding: 10px;
      border-radius: 8px;
      background: rgba(255,255,255,0.02);
      border: 1px solid rgba(255,255,255,0.03);
    }

    /* Secondary text. */
    footer {
      margin-top: 18px;
      font-size: 13px;
      color: var(--muted);
    }
    .small {
      font-size: 13px;
      color: var(--muted);
    }
    .code-row {
      display: grid;
      grid-template-columns: 1fr;
      gap: 12px;
    }

    /* From 880px up, place the 360px sidebar next to the main column. */
    @media (min-width: 880px) {
      .grid {
        grid-template-columns: 1fr 360px;
      }
    }
  </style>
</head>
<body>
  <div class="wrap">
    <!-- Page banner: gradient "C+" badge beside the page title and tagline. -->
    <header>
      <div style="width:56px;height:56px;border-radius:10px;background:linear-gradient(135deg,#2563eb,#06b6d4);display:flex;align-items:center;justify-content:center;font-weight:700">C+</div>
      <div>
        <h1>Char-CNN + FastText — Demo & Guide</h1>
        <p class="lead">A simple single-page website that explains both approaches and provides copy-paste code for training and inference.</p>
      </div>
    </header>

    <!-- Tab bar: each button's data-tab value is the key of the content block rendered into #content. -->
    <nav aria-label="Sections">
      <button class="tab active" data-tab="overview" type="button">Overview</button>
      <button class="tab" data-tab="charcnn" type="button">Char-CNN (Keras)</button>
      <button class="tab" data-tab="fasttext" type="button">FastText</button>
      <button class="tab" data-tab="demo" type="button">Client Demo</button>
      <button class="tab" data-tab="run" type="button">How to run</button>
    </nav>

    <!-- Two-column layout. A plain div is used because the main element may not be nested inside a sectioning element. -->
    <div class="grid">
      <main class="card" id="content">
        <!-- content injected by JS -->
      </main>

      <aside class="card">
        <h3>Quick notes</h3>
        <p class="small">Char-CNN works on characters (good for misspellings, tiny vocab). FastText is fast and uses word / subword n-grams — great for production and low-resource training.</p>
        <h4 style="margin-top:12px">Files suggested</h4>
        <ul class="small">
          <li>train.csv / test.csv (text,label)</li>
          <li>train_charcnn.py (Keras)</li>
          <li>train_fasttext.txt (fastText format)</li>
        </ul>
      </aside>
    </div>

    <footer>
      <div class="small">Want a Flask or Java backend to serve models? Ask and I'll add one. (This page includes training code — you need Python + GPU for reasonable char-CNN training speed.)</div>
    </footer>
  </div>

<script>
const TABS = {
  // Landing tab: summarises the two approaches and when to prefer each.
  // setContent assigns this string to innerHTML, so it must be valid, already-escaped HTML.
  overview: `
  <h2>What's inside</h2>
  <p>Two common text classification approaches:</p>
  <ol>
    <li><strong>Char-CNN</strong> — convolutional neural network over character embeddings. Useful for tasks where morphology, misspelling, or small vocab matters.</li>
    <li><strong>FastText</strong> — shallow linear classifier on top of word / subword embeddings. Extremely fast to train and serve, good baseline and production option.</li>
  </ol>
  <h3>When to use which</h3>
  <ul>
    <li><b>Char-CNN:</b> noisy text (typos), languages with complex morphology, when you want model to learn character patterns.</li>
    <li><b>FastText:</b> large datasets, quick iteration, production inference with low latency.</li>
  </ul>
  `,

  charcnn: `
  <h2>Char-CNN (Keras) — Example</h2>
  <p class="small">Below is a compact training script using Keras. This example builds a simple character vocabulary, embeds characters, applies 1D convs and global max-pooling.</p>
  <div class="code-row"><pre><code># train_charcnn.py (compact)
import json
import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

# --- simple char vocabulary ---
CHARS = "\\n" + "abcdefghijklmnopqrstuvwxyz0123456789.,!?;:'\\"-()[]{} "
char_to_idx = {c:i+1 for i,c in enumerate(CHARS)}  # 0 reserved for padding
MAXLEN = 512

def text_to_seq(s):
    s = s.lower()[:MAXLEN]
    seq = [char_to_idx.get(c,0) for c in s]
    if len(seq)<MAXLEN:
        seq += [0]*(MAXLEN-len(seq))
    return seq

# Example data
texts = ["I love this product!", "Terrible, will not buy again."]
labels = [1,0]
X = np.array([text_to_seq(t) for t in texts])
Y = to_categorical(labels, num_classes=2)

# Build model
inp = layers.Input(shape=(MAXLEN,), dtype='int32')
emb = layers.Embedding(input_dim=len(char_to_idx)+1, output_dim=64, input_length=MAXLEN)(inp)
conv1 = layers.Conv1D(256, kernel_size=7, activation='relu', padding='same')(emb)
conv2 = layers.Conv1D(256, kernel_size=7, activation='relu', padding='same')(conv1)
pool = layers.GlobalMaxPooling1D()(conv2)
fc = layers.Dense(128, activation='relu')(pool)
out = layers.Dense(2, activation='softmax')(fc)
model = models.Model(inp, out)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

# Train (toy example)
model.fit(X, Y, epochs=6, batch_size=8)
# Save model
model.save('char_cnn_model')
</code></pre></div>
  <p class="small">Notes: tune embedding size, number of filters, kernel sizes and MAXLEN for your dataset. For long datasets use a GPU.</p>
  `,

  fasttext: `
  <h2>FastText — CLI & Python</h2>
  <p class="small">FastText (Facebook) is lightning-fast. Data: each line <code>__label__<label> <text></code></p>
  <pre><code># Example fasttext training (command line)
# prepare file: train_fasttext.txt where each line: __label__POS This product is great
# Train
!fasttext supervised -input train_fasttext.txt -output model_fasttext -lr 1.0 -epoch 25 -wordNgrams 2 -dim 100
# Predict
!fasttext predict model_fasttext.bin "I like this"

# Or use the python wrapper (fasttext package)
import fasttext
model = fasttext.train_supervised('train_fasttext.txt', lr=1.0, epoch=25, wordNgrams=2, dim=100)
print(model.predict("bad quality"))
</code></pre>
  <p class="small">FastText supports subword n-grams which helps generalize to unseen words. It's ideal as a fast baseline and production model.</p>
  `,

  demo: `
  <h2>Client-side Demo — character features</h2>
  <p class="small">This small demo shows how text maps to character indices and simple n-gram counts — useful to visualise what a Char-CNN or FastText subword features might see.</p>
  <label>Type text</label>
  <div class="demo-input"><input id="demoText" type="text" placeholder="Type some text..." value="Hello, FastText!" /><button id="runDemo">Encode</button></div>
  <div style="height:12px"></div>
  <div class="result" id="demoOut"></div>
  `,

  // "How to run" tab: install and training steps followed by a Flask serving skeleton.
  run: `
  <h2>How to run (recommended)</h2>
  <ol>
    <li>Install Python packages: <code>pip install tensorflow fasttext flask gensim numpy pandas</code></li>
    <li>Prepare data: a CSV with <code>text</code>,<code>label</code> or fastText formatted file.</li>
    <li>Run <code>train_charcnn.py</code> to train the char-CNN; or use the fastText commands above.</li>
    <li>To serve: use a small Flask API that loads the saved model and exposes a <code>/predict</code> endpoint.</li>
  </ol>
  <pre><code># Flask skeleton (save as app.py)
from flask import Flask, request, jsonify
import tensorflow as tf
import fasttext

app = Flask(__name__)
# Load models (example)
cnn = tf.keras.models.load_model('char_cnn_model')
ft = fasttext.load_model('model_fasttext.bin')

@app.route('/predict', methods=['POST'])
def predict():
    # silent=True yields None instead of raising when the body is missing or not JSON
    data = request.get_json(silent=True) or {}
    text = data.get('text','')
    # convert text -> seq for cnn (same function as training)
    # call cnn.predict and ft.predict and return both
    return jsonify({'error':'Implement conversion & model inference'}), 501

if __name__=='__main__':
    # debug=True is for local development only
    app.run(debug=True, port=5000)
</code></pre>
  <p class="small">If you want a Java (Spring Boot) example to serve models, tell me which model and I will add a ready project structure.</p>
  `
}

// Render the tab `name` (a key of TABS) into #content.
// For the demo tab, also attach the click handler that counts characters and bigrams.
function setContent(name){
  // TABS holds static, author-written markup, so assigning it to innerHTML is acceptable here.
  document.getElementById('content').innerHTML = TABS[name];
  if(name !== 'demo') return;

  // The demo markup was just re-created, so #runDemo is a fresh element and needs its listener attached again.
  document.getElementById('runDemo').addEventListener('click', ()=>{
    // Array.from splits by code point, so emoji and other astral characters stay whole.
    const chars = Array.from(document.getElementById('demoText').value.toLowerCase());

    const charCounts = {};
    chars.forEach(c=>{ charCounts[c] = (charCounts[c]||0)+1; });

    const bigramCounts = {};
    for(let i=0;i<chars.length-1;i++){
      const g = chars[i]+chars[i+1];
      bigramCounts[g] = (bigramCounts[g]||0)+1;
    }

    // Build the result from DOM nodes: the counts contain user-typed characters (for example a
    // less-than sign) that would be parsed as markup if the JSON were concatenated into innerHTML.
    const out = document.getElementById('demoOut');
    out.textContent = '';
    const addRow = (title, counts)=>{
      const heading = document.createElement('strong');
      heading.textContent = title;
      out.appendChild(heading);
      out.appendChild(document.createTextNode(' ' + JSON.stringify(counts)));
    };
    addRow('Chars:', charCounts);
    out.appendChild(document.createElement('br'));
    addRow('Bigrams:', bigramCounts);
  });
}

// Tab bar wiring: clicking a tab highlights it and swaps the content pane.
function activateTab(event){
  const clicked = event.currentTarget;
  for(const button of document.querySelectorAll('button.tab')){
    button.classList.remove('active');
  }
  clicked.classList.add('active');
  setContent(clicked.dataset.tab);
}

for(const button of document.querySelectorAll('button.tab')){
  button.addEventListener('click', activateTab);
}

// Show the overview tab on first load.
setContent('overview');
</script>
</body>
</html>
"""
# Wrap the page string in an IPython HTML object and display it as this cell's output.
display(HTML(html_content))