In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd

In [24]:
# TF-IDF text vectorizer. `max_features` caps the learned vocabulary at the
# 5000 most frequent terms; every other option is left at its default.
vectorizer = TfidfVectorizer(
    max_features=5000,
)

In [25]:
# Toy training corpus: four short sentences, one document per list entry.
# NOTE(review): "This do not like pizza" reads like a typo for
# "I do not like pizza"; it is kept verbatim because the exact text determines
# the fitted vocabulary and IDF weights.
corpus = [
    "This do not like pizza",
    "I love to swim",
    "I like this movie very much",
    "Nothing good in this article",
]

# Learn the vocabulary and IDF weights from the corpus and, in the same call,
# encode the corpus as a TF-IDF document-term matrix (one row per sentence).
X = vectorizer.fit_transform(corpus)

In [26]:
# Terms learned during fitting, listed in the column order of the matrix.
# "I" is absent: the vectorizer's default token pattern only keeps tokens of
# two or more characters.
print(
    "Vocabulary:",
    vectorizer.get_feature_names_out(),
)

# Inverse-document-frequency weight of each term, in the same order as the
# vocabulary above (terms that occur in more documents get a lower weight).
print(
    "IDF Scores:",
    vectorizer.idf_,
)

Vocabulary: ['article' 'do' 'good' 'in' 'like' 'love' 'movie' 'much' 'not' 'nothing'
 'pizza' 'swim' 'this' 'to' 'very']
IDF Scores: [1.91629073 1.91629073 1.91629073 1.91629073 1.51082562 1.91629073
 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073 1.91629073
 1.22314355 1.91629073 1.91629073]


In [27]:
# Logistic-regression classifier, created with no arguments so every
# hyperparameter keeps scikit-learn's default value.
model = LogisticRegression()

In [28]:
# Example binary classification labels: one per corpus sentence, in corpus order.
y = [
    0,  # "This do not like pizza"
    1,  # "I love to swim"
    1,  # "I like this movie very much"
    0,  # "Nothing good in this article"
]

# Train the classifier on the TF-IDF matrix of the corpus and its labels.
model.fit(X, y)

In [29]:
# Classify an unseen sentence. It is encoded with `transform` (not
# `fit_transform`) so that the already-learned vocabulary and IDF weights are
# reused and the columns line up with the matrix the model was trained on.
new_data = [
    "I do not like to swim",
]
new_X = vectorizer.transform(new_data)

# Predicted class label for each entry of `new_data`.
predictions = model.predict(new_X)
print(
    "Predictions:",
    predictions,
)

Predictions: [1]


In [30]:
# Dense, labelled view of the TF-IDF matrix: one row per corpus sentence and
# one column per vocabulary term. `toarray()` converts the sparse matrix to a
# regular NumPy array, which is fine for a corpus this small.
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Print the table as plain text under a short heading.
print("DataFrame:")
print(df)

DataFrame:
   article        do     good       in      like     love     movie      much  \
0  0.00000  0.498197  0.00000  0.00000  0.392784  0.00000  0.000000  0.000000   
1  0.00000  0.000000  0.00000  0.00000  0.000000  0.57735  0.000000  0.000000   
2  0.00000  0.000000  0.00000  0.00000  0.392784  0.00000  0.498197  0.498197   
3  0.47633  0.000000  0.47633  0.47633  0.000000  0.00000  0.000000  0.000000   

        not  nothing     pizza     swim      this       to      very  
0  0.498197  0.00000  0.498197  0.00000  0.317993  0.00000  0.000000  
1  0.000000  0.00000  0.000000  0.57735  0.000000  0.57735  0.000000  
2  0.000000  0.00000  0.000000  0.00000  0.317993  0.00000  0.498197  
3  0.000000  0.47633  0.000000  0.00000  0.304035  0.00000  0.000000  
