In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data
# (used by word_tokenize) and the 'stopwords' corpus (used by stopwords.words).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the training CSV, discard the 'Unnamed: 0' column, and drop every row
# that contains a missing value.
df = (
    pd.read_csv('/kaggle/input/diagt-for-train/output.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
)

In [11]:
# Model inputs: the 'text' column is the feature, the 'label' column the target.
X = df['text']
y = df['label']

In [5]:
from sklearn.metrics import f1_score

# Sweep every integer threshold i and score the rule "error_count < i => positive"
# by F1 against the labels, remembering the best threshold.
# NOTE(review): the threshold is selected on the same rows it is scored on, so
# the resulting F1 is an optimistic estimate.
max_f1 = -1
max_i = None  # stays None only if there is no threshold to try
# "+ 2" so the sweep also covers i == max (all rows but the largest counts are
# positive) and i == max + 1 (every row is positive); range(max) skipped both.
for i in range(int(df['error_count'].max()) + 2):
    pred = df['error_count'] < i  # vectorized; same boolean Series as the row-wise apply
    f1 = f1_score(y, pred)
    if f1 > max_f1:
        max_f1 = f1
        max_i = i

In [15]:
# Best F1 found by the threshold sweep, followed by the threshold that produced it.
print(max_f1, max_i)

0.776800665372886 11


In [13]:
# Full per-class report for the rule "error_count < max_i", using the threshold
# selected by the sweep instead of a hard-coded copy of its value.
# NOTE(review): the threshold was chosen on these same rows, so the numbers
# below are optimistic.
pred = df['error_count'] < max_i
print(classification_report(y, pred))

              precision    recall  f1-score   support

           0       0.78      0.37      0.50     27370
           1       0.67      0.92      0.78     37897

    accuracy                           0.69     65267
   macro avg       0.72      0.65      0.64     65267
weighted avg       0.72      0.69      0.66     65267



In [None]:
# NOTE(review): this rebinds `df` to a different CSV, while `X` and `y` assigned
# earlier still hold columns of the previous frame and are not refreshed here.
# Confirm that this load is intended / still needed.
df = pd.read_csv('/kaggle/input/sentences/train_v3_drcat_02.csv')

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 100 vocabulary terms (max_features=100).
# The vocabulary and IDF weights are learned from the training split only;
# fit_transform does the fit and the transform of the training texts in a
# single pass instead of analysing the training corpus twice.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# The held-out split is only transformed with the already-fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [32]:
# Logistic regression on the TF-IDF features; fit() returns the fitted
# estimator, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predictions for both splits.
y_pred_train = model.predict(X_train_tfidf)
y_pred = model.predict(X_test_tfidf)

# Two reports are printed back to back: the first is the training split,
# the second the held-out split.
print(classification_report(y_train, y_pred_train))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.89      0.90     20618
           1       0.92      0.93      0.93     28331

    accuracy                           0.91     48949
   macro avg       0.91      0.91      0.91     48949
weighted avg       0.91      0.91      0.91     48949

              precision    recall  f1-score   support

           0       0.90      0.89      0.90      6752
           1       0.92      0.93      0.93      9565

    accuracy                           0.91     16317
   macro avg       0.91      0.91      0.91     16317
weighted avg       0.91      0.91      0.91     16317



In [33]:
stop_words = stopwords.words('english')
# Frozen copy for O(1) membership tests inside preprocess(); `stop_words`
# itself is left as the original list for any other user of that name.
_stop_word_set = frozenset(stop_words)

def preprocess(text):
    """Tokenize `text` into lower-cased, alphanumeric, non-stop-word tokens.

    Args:
        text: String to tokenize; it is lower-cased before tokenization.

    Returns:
        List of tokens from word_tokenize that pass `str.isalnum()` and are
        not in the English stop-word list.
    """
    tokens = word_tokenize(text.lower())
    return [
        word
        for word in tokens
        if word.isalnum() and word not in _stop_word_set
    ]

# Run the same preprocessing over every text in each split.
X_train_tokens = list(map(preprocess, X_train))
X_test_tokens = list(map(preprocess, X_test))

from gensim.models import Word2Vec

# Word vectors are trained on the training-split tokens only.
# NOTE(review): gensim documents that training with more than one worker
# thread is not exactly repeatable between runs - confirm whether that matters
# for the reported scores.
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_w2v_embedding(tokens, model, vector_size=100):
    """Average the word vectors of `tokens` into a single vector.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing `.wv`, which must support `word in model.wv`
            and `model.wv[word]` (here: the trained Word2Vec model).
        vector_size: Length of the zero vector returned when no token has a
            vector. Used only if `model.wv` does not expose `vector_size`.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        `model.wv`, or an all-zero vector when `tokens` is empty or none of
        the tokens is known to the model.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if known:
        return np.mean(known, axis=0)
    # Size the fallback from the model when possible, so it always has the same
    # length as the real embeddings and the stacked feature matrix stays
    # rectangular even if the model was trained with another dimensionality.
    fallback_size = getattr(keyed_vectors, "vector_size", vector_size)
    return np.zeros(fallback_size)

# One averaged word-vector per document, collected into an array per split.
X_train_embedded = np.array(
    [get_w2v_embedding(doc_tokens, w2v_model) for doc_tokens in X_train_tokens]
)
X_test_embedded = np.array(
    [get_w2v_embedding(doc_tokens, w2v_model) for doc_tokens in X_test_tokens]
)

# Logistic regression on the averaged Word2Vec embeddings; fit() returns the
# fitted estimator, so construction and training are chained.
clf_logreg = LogisticRegression(max_iter=1000).fit(X_train_embedded, y_train)

# Predictions for both splits.
y_train_logreg = clf_logreg.predict(X_train_embedded)
y_test_logreg = clf_logreg.predict(X_test_embedded)

# Two reports are printed back to back: the first is the training split,
# the second the held-out split.
print(classification_report(y_train, y_train_logreg))
print(classification_report(y_test, y_test_logreg))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     20618
           1       0.98      0.97      0.98     28331

    accuracy                           0.97     48949
   macro avg       0.97      0.98      0.97     48949
weighted avg       0.98      0.97      0.98     48949

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      6752
           1       0.98      0.97      0.98      9565

    accuracy                           0.97     16317
   macro avg       0.97      0.97      0.97     16317
weighted avg       0.97      0.97      0.97     16317

