In [None]:
import kagglehub
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder



In [None]:

path = kagglehub.dataset_download('uciml/sms-spam-collection-dataset')

data = pd.read_csv(f'{path}/{os.listdir(path)[0]}', encoding='ISO-8859-1')
data = data[['v1', 'v2']]
data.columns = ['label', 'text']

In [10]:
#Беру из датасета первые 5560 строк, чтобы обучить модель (данные будут использоваться для обучения 
# и тестирования 75% на 25%), остальные строки датасета оставлю для наглядного тестирования

# Выделяем первые 5560 строк для обучения и тестирования модели
train_test_data = data.iloc[:5560]

In [None]:


label_encoder = LabelEncoder()

train_test_data['label'] = label_encoder.fit_transform(train_test_data['label'])

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(train_test_data['text'].values.reshape(-1, 1), train_test_data['label'])

balanced_df = pd.DataFrame({'text': X_resampled.flatten(), 'label': y_resampled})

balanced_df['label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_test_data['label'] = label_encoder.fit_transform(train_test_data['label'])


label
0    4815
1    4815
Name: count, dtype: int64

In [None]:

X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['text'], balanced_df['label'], test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


y_pred = model.predict(X_test_tfidf)
y_prob = model.predict_proba(X_test_tfidf)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))


Accuracy: 0.9875389408099688
F1-Score: 0.9877049180327869
ROC-AUC: 0.9875682741839455


### Чтобы протестировать модель возьму столбцы датасета, которые не были использованы в обучении модели. Выделенные строки относятся к SPAM.

1. No. I meant the calculation is the same. That  &lt;#&gt; units at  &lt;#&gt; . This school is really expensive. Have you started practicing your accent.
2. Because its important. And have you decided if you are doing 4years of dental school or if you'll just do the nmde exam.
3. "Sorry, I'll call later"
4. if you aren't here in the next  &lt;#&gt;  hours imma flip my shit
5. Anything lor. Juz both of us lor.
6. Get me out of this dump heap. My mom decided to come to lowes. BORING.
7. Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering...
8. Ard 6 like dat lor.
9. Why don't you wait 'til at least wednesday to see if you get your .
10. Huh y lei...

**11. "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode"**


**12. "This is the 2nd time we have tried 2 contact u. U have won the �750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate."**

13. Will �_ b going to esplanade fr home?
14. "Pity, * was in mood for that. So...any other suggestions?"
15. The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free
16. Rofl. Its true to its name




In [18]:
# Пример новых данных
new_data = [
    "No. I meant the calculation is the same. That <#> units at <#>. This school is really expensive. Have you started practicing your accent. Because it's important. And have you decided if you are doing 4 years of dental school or if you'll just do the nmde exam.",
    "Because its important. And have you decided if you are doing 4years of dental school or if you'll just do the nmde exam.",
    "Sorry, I'll call later",
    "if you aren't here in the next <#> hours imma flip my shit",
    "Anything lor. Juz both of us lor.",
    "Get me out of this dump heap. My mom decided to come to lowes. BORING.",
    "Ok lor... Sony ericsson salesman... I ask shuhui then she say quite gd 2 use so i considering...",
    "Ard 6 like dat lor.",
    "Why don't you wait 'til at least wednesday to see if you get your .",
    "Huh y lei...",
    "REMINDER FROM O2: To get 2.50 pounds free call credit and details of great offers pls reply 2 this text with your valid name, house no and postcode",
    "This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. To claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.",
    "Will �_ b going to esplanade fr home?",
    "'Pity, * was in mood for that. So...any other suggestions?'",
    "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free",
    "Rofl. Its true to its name"
]

y_manual = model.predict(tfidf.transform(new_data))
predicted = pd.DataFrame({'text': new_data, 'label': label_encoder.inverse_transform(y_manual)})


predicted

Unnamed: 0,text,label
0,No. I meant the calculation is the same. That ...,ham
1,Because its important. And have you decided if...,ham
2,"Sorry, I'll call later",ham
3,if you aren't here in the next <#> hours imma ...,ham
4,Anything lor. Juz both of us lor.,ham
5,Get me out of this dump heap. My mom decided t...,ham
6,Ok lor... Sony ericsson salesman... I ask shuh...,ham
7,Ard 6 like dat lor.,ham
8,Why don't you wait 'til at least wednesday to ...,ham
9,Huh y lei...,ham
