In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus: four short English sentences stored under a single 'Text' key.
data = {
    'Text': [
        'I love programming in Python.',
        'Python is awesome for data science.',
        'I love solving problems with Python.',
        'Data science is a growing field.',
    ],
}

# Wrap the corpus in a one-column DataFrame (one row per sentence).
df = pd.DataFrame(data=data)

# Show the corpus under a heading; sep="\n" puts the frame on its own line.
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
                                   Text
0         I love programming in Python.
1   Python is awesome for data science.
2  I love solving problems with Python.
3      Data science is a growing field.


In [None]:
# Bag-of-Words with CountVectorizer's default settings. With these defaults
# the learned vocabulary is lowercased and single-character tokens such as
# 'I' and 'a' do not appear in it.
vectorizer = CountVectorizer()

# Learn the vocabulary from the 'Text' column and count each term per sentence
# in a single pass; the result is a sparse document-term matrix.
X = vectorizer.fit_transform(df['Text'])

# Densify the matrix and label each column with its vocabulary term so the
# counts are readable as a table (one row per sentence, one column per word).
bow_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Print the table under a heading; the leading "\n" leaves a blank line first.
print("\nBag-of-Words Representation:", bow_df, sep="\n")


Bag-of-Words Representation:
   awesome  data  field  for  growing  in  is  love  problems  programming  \
0        0     0      0    0        0   1   0     1         0            1   
1        1     1      0    1        0   0   1     0         0            0   
2        0     0      0    0        0   0   0     1         1            0   
3        0     1      1    0        1   0   1     0         0            0   

   python  science  solving  with  
0       1        0        0     0  
1       1        1        0     0  
2       1        0        1     1  
3       0        1        0     0  


In [None]:
# (Optional) Remove stopwords
#
# Common words that add little value (like "the", "is", "in") can be dropped
# from the Bag-of-Words model by passing stop_words='english' to
# CountVectorizer.
#
# NOTE: this cell rebinds `vectorizer`, `X` and `bow_df`, replacing the
# objects built with the default CountVectorizer() earlier in the notebook.
# Re-run that earlier cell if the unfiltered representation is needed again.

# Initialize CountVectorizer with scikit-learn's built-in English stop-word list
vectorizer = CountVectorizer(stop_words='english')

# Learn the filtered vocabulary and build the sparse document-term matrix
X = vectorizer.fit_transform(df['Text'])

# Convert to a dense DataFrame with one column per remaining vocabulary term
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Display the modified Bag-of-Words DataFrame
print("\nBag-of-Words Representation with Stopwords Removed:")
print(bow_df)



Bag-of-Words Representation with Stopwords Removed:
   awesome  data  field  growing  love  problems  programming  python  \
0        0     0      0        0     1         0            1       1   
1        1     1      0        0     0         0            0       1   
2        0     0      0        0     1         1            0       1   
3        0     1      1        1     0         0            0       0   

   science  solving  
0        0        0  
1        1        0  
2        0        1  
3        1        0  
