In [2]:
!pip install gensim
!pip install nltk

Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.13.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scipy
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully uninstalled scipy-1.14.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
libpysal 4.9.2 requires packaging>=2

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch every NLTK resource this notebook reads so that a fresh kernel can
# Restart & Run All: the punkt packages are used by word_tokenize, and the
# 'stopwords' corpus is read on the last line of this cell.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# NOTE: this deliberately rebinds `stopwords` from the imported corpus reader to
# a plain list of English stop words; later cells look the list up by this name.
# Reading it through `nltk.corpus` keeps the statement re-runnable even after
# the name has already been rebound.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns raw documents into lists of tokens.

    Every document is lower-cased, split with ``word_tokenize`` and reduced to
    the tokens that are alphanumeric and not present in the module-level
    ``stopwords`` collection.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Parameters
        ----------
        X : iterable of str
            Raw documents.

        Returns
        -------
        list of list of str
            One filtered token list per input document, in input order.
        """
        # Build the membership lookup once per call instead of scanning the
        # stop-word list for every single token.
        stop_set = frozenset(stopwords)
        return [self._preprocess(text, stop_set) for text in X]

    def _preprocess(self, text, stop_set=None):
        """Return the filtered, lower-cased tokens of one document.

        ``stop_set`` is the collection of tokens to drop; when omitted it is
        derived from the module-level ``stopwords``.
        """
        if stop_set is None:
            stop_set = frozenset(stopwords)
        tokens = word_tokenize(text.lower())
        # isalnum() discards punctuation-only tokens produced by the tokenizer.
        return [word for word in tokens if word.isalnum() and word not in stop_set]

In [5]:
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Embed tokenized documents as the mean of their Word2Vec word vectors.

    ``fit`` trains a ``Word2Vec`` model on the tokenized documents it is given;
    ``transform`` maps every token list to the average of the vectors of its
    in-vocabulary tokens. A document without any in-vocabulary token (including
    an empty one) is mapped to an all-zero vector of length ``vector_size``.

    Parameters
    ----------
    vector_size, window, min_count, workers
        Forwarded unchanged to ``Word2Vec``.

    Attributes
    ----------
    model : Word2Vec or None
        The trained model; ``None`` until ``fit`` has been called.
    """

    # NOTE(review): no seed is passed to Word2Vec and `workers` defaults to 4,
    # so embeddings may differ between runs — confirm whether that matters.

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        """Train the Word2Vec model on ``X`` (an iterable of token lists)."""
        self.model = Word2Vec(sentences=X, vector_size=self.vector_size,
                              window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X):
        """Return one embedding row per token list in ``X``.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet. The exception derives from
            both ``ValueError`` and ``AttributeError``.
        """
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "Word2VecVectorizer is not fitted yet; call fit() before transform()."
            )
        rows = [self._get_embedding(tokens) for tokens in X]
        if not rows:
            # Keep the output two-dimensional for an empty batch; np.array([])
            # would have shape (0,) and lose the feature axis.
            return np.zeros((0, self.vector_size))
        return np.array(rows)

    def _get_embedding(self, tokens):
        """Average the vectors of the known tokens, or return zeros if none."""
        word_vectors = self.model.wv
        embeddings = [word_vectors[word] for word in tokens if word in word_vectors]
        if not embeddings:
            # Covers both an empty document and one made only of unknown tokens.
            return np.zeros(self.vector_size)
        return np.mean(embeddings, axis=0)

In [6]:
# Raw text -> filtered tokens -> averaged Word2Vec embedding -> logistic regression.
pipeline_steps = [
    ('preprocessor', TextPreprocessor()),
    ('w2v_vectorizer', Word2VecVectorizer(vector_size=100)),
    ('classifier', LogisticRegression(max_iter=1000)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [7]:
# Location of the training CSV inside the Kaggle input mount.
TRAIN_CSV_PATH = '/kaggle/input/diagt-data/train_v3_drcat_02.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [8]:
# Model input is the 'text' column; the prediction target is the 'label' column.
X, y = df['text'], df['label']

In [9]:
# Hold out a quarter of the rows for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Fit every pipeline stage on the training split only, so the held-out rows in
# X_test play no part in training.
pipeline.fit(X_train, y_train)

In [11]:
# In-sample report: how well the fitted pipeline reproduces the training labels.
train_pred = pipeline.predict(X_train)
train_report = classification_report(y_true=y_train, y_pred=train_pred)
print(train_report)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     20618
           1       0.98      0.97      0.98     28332

    accuracy                           0.97     48950
   macro avg       0.97      0.97      0.97     48950
weighted avg       0.97      0.97      0.97     48950



In [12]:
# Held-out report: the same metrics computed on the rows not used for fitting.
test_pred = pipeline.predict(X_test)
test_report = classification_report(y_true=y_test, y_pred=test_pred)
print(test_report)

              precision    recall  f1-score   support

           0       0.96      0.97      0.97      6752
           1       0.98      0.97      0.98      9565

    accuracy                           0.97     16317
   macro avg       0.97      0.97      0.97     16317
weighted avg       0.97      0.97      0.97     16317



In [13]:
import pickle

# Serialise the fitted pipeline. The context manager closes (and therefore
# flushes) the file handle even if pickling fails part-way; the bare open(...)
# used before left that to the garbage collector.
# NOTE(review): the payload is a pickle, not HDF5, despite the ".h5" suffix; the
# file name is kept unchanged so anything that already reads it keeps working.
# NOTE(review): pickle stores classes by reference, so loading this file needs
# the notebook's custom transformer classes to be defined/importable first.
with open("w2v.h5", "wb") as model_file:
    pickle.dump(pipeline, model_file)