In [5]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras import layers
import joblib
import numpy as np

# Step 1: Load and clean data
df = pd.read_csv("alldata_1_for_kaggle.csv", header=None, encoding='ISO-8859-1')
df.columns = ['index', 'label', 'text']

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)

# Step 2: Remove rare labels (optional, but fixes sparse issues)
label_counts = df['label'].value_counts()
valid_labels = label_counts[label_counts >= 5].index
df_filtered = df[df['label'].isin(valid_labels)].copy()

# Step 3: Encode labels after filtering
le = LabelEncoder()
df_filtered['encoded_label'] = le.fit_transform(df_filtered['label'])

# Save the encoder
joblib.dump(le, "label_encoder.joblib")

# Step 4: Split data
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered['clean_text'], df_filtered['encoded_label'], test_size=0.2, random_state=42)

# Step 5: TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Save the vectorizer
joblib.dump(tfidf, "tfidf_vectorizer.joblib")

# Step 6: Model
num_classes = len(le.classes_)

model = tf.keras.Sequential([
    layers.Input(shape=(X_train_tfidf.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Step 7: Train
model.fit(X_train_tfidf.toarray(), y_train, epochs=5, batch_size=32, validation_split=0.1)

# Step 8: Save model
model.save("medical_text_classifier.h5")
print("✅ Model training complete and saved!")

# Step 9: Predict on test set
y_pred_probs = model.predict(X_test_tfidf.toarray())
y_pred = np.argmax(y_pred_probs, axis=1)

# Step 10: Evaluation
print("\n🧪 Test Set Evaluation:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print("📊 Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Epoch 1/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.6810 - loss: 0.8587 - val_accuracy: 0.9043 - val_loss: 0.2853
Epoch 2/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9348 - loss: 0.2247 - val_accuracy: 0.9472 - val_loss: 0.1322
Epoch 3/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9422 - loss: 0.1311 - val_accuracy: 0.9554 - val_loss: 0.1031
Epoch 4/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9479 - loss: 0.1050 - val_accuracy: 0.9604 - val_loss: 0.0871
Epoch 5/5
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9532 - loss: 0.0913 - val_accuracy: 0.9620 - val_loss: 0.0805




✅ Model training complete and saved!
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

🧪 Test Set Evaluation:
✅ Accuracy: 0.9617

📊 Classification Report:
                precision    recall  f1-score   support

  Colon_Cancer       0.95      0.94      0.94       517
   Lung_Cancer       1.00      1.00      1.00       407
Thyroid_Cancer       0.95      0.95      0.95       590

      accuracy                           0.96      1514
     macro avg       0.97      0.96      0.96      1514
  weighted avg       0.96      0.96      0.96      1514



In [9]:
!pip install fastapi nest-asyncio pyngrok uvicorn

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.9-py3-none-any.whl.metadata (9.3 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Downloading fastapi-0.115.12-py3-none-any.whl (95 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.9-py3-none-any.whl (25 kB)
Downloading uvicorn-0.34.2-py3-none-any.whl (62 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading starlette-0.46.2-py3-none-any.whl (72 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uvicorn, pyngrok, s

In [23]:

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

app.add_middleware(
CORSMiddleware,
allow_origins=['*'],
allow_credentials=True,
allow_methods=['*'],
allow_headers=['*'],
)

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import joblib
import tensorflow as tf
import numpy as np
import re

# Load saved artifacts
model = tf.keras.models.load_model("medical_text_classifier.h5")
tfidf_vectorizer = joblib.load("tfidf_vectorizer.joblib")
label_encoder = joblib.load("label_encoder.joblib")


# Pydantic model for request body
class TextInput(BaseModel):
    text: str

# Text cleaning function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Predict endpoint
@app.post("/predict")
def predict(input: TextInput):
    try:
        cleaned = clean_text(input.text)
        vectorized = tfidf_vectorizer.transform([cleaned])
        prediction_probs = model.predict(vectorized.toarray())
        predicted_index = np.argmax(prediction_probs, axis=1)[0]
        predicted_label = label_encoder.inverse_transform([predicted_index])[0]
        confidence = float(np.max(prediction_probs))

        return {
            "predicted_label": predicted_label,
            "confidence": round(confidence, 4)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))




In [None]:
!pip install pyngrok
import nest_asyncio
from pyngrok import ngrok
import uvicorn

# Get your authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
auth_token = "2xmaGUkgfunyh5duVClAtxhx3Jh_3smPzAHYvWVLsdH7oT61F"

# Set the authtoken
ngrok.set_auth_token(auth_token)

# Connect to ngrok
ngrok_tunnel = ngrok.connect(8000)

# Print the public URL
print('Public URL:', ngrok_tunnel.public_url)

# Apply nest_asyncio
nest_asyncio.apply()

# Run the uvicorn server
uvicorn.run(app, port=8000)

Public URL: https://566d-34-141-220-78.ngrok-free.app


INFO:     Started server process [758]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     2401:4900:9273:36d6:518b:5901:f991:3c87:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     2401:4900:9273:36d6:518b:5901:f991:3c87:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     2401:4900:9273:36d6:518b:5901:f991:3c87:0 - "GET /docs HTTP/1.1" 200 OK
INFO:     2401:4900:9273:36d6:518b:5901:f991:3c87:0 - "GET /openapi.json HTTP/1.1" 200 OK
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
INFO:     2401:4900:9273:36d6:518b:5901:f991:3c87:0 - "POST /predict HTTP/1.1" 200 OK
