In [19]:
import pandas as pd
import numpy as np

reviews = pd.DataFrame({
    'text': [
        'I love this product!',
        'This is the worst thing I ever bought.',
        'It works well, but could be improved.',
        'Absolutely fantastic! Highly recommend.',
        'Not what I expected at all.'
    ],
    'label': [1, 0, 1, 1, 0]
})

reviews.head(2)

Unnamed: 0,text,label
0,I love this product!,1
1,This is the worst thing I ever bought.,0


In [15]:
corpus = [
        'I love this product!',
        'This is the worst thing I ever bought.',
        'It works well, but could be improved.',
        'Absolutely fantastic! Highly recommend.',
        'Not what I expected at all.'
    ]

import string
# preprocessing
# lower case
corpus = [text.lower() for text in corpus]
# remove punctuation
corpus = [text.translate(str.maketrans('', '', string.punctuation)) for text in corpus]

# remove stopwords
stopwords = set(['is', 'the', 'this', 'a', 'an', 'at', 'all', 'but', 'could', 'be', 'what', 'ever'])
corpus = [' '.join([word for word in text.split() if word not in stopwords]) for text in corpus]


In [16]:
# create unique tokens
unique_tokens = set()
for text in corpus:
    unique_tokens.update(text.split())

unique_tokens = list(unique_tokens)

# lets create the vector
vector = []
for text in corpus:
    vector.append([1 if token in text.split() else 0 for token in unique_tokens])



In [28]:
def preprocessing(df):
    # lower case
    df['text'] = df['text'].str.lower()
    # remove punctuation
    df['text'] = df['text'].str.translate(str.maketrans('', '', string.punctuation))
    # remove stopwords
    stopwords = set(['is', 'the', 'this', 'a', 'an', 'at', 'all', 'but', 'could', 'be', 'what', 'ever'])
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
    print(df.columns)
    return df

# apply vectorisation
def vectorisation(df):
    '''return dataframe with unique words as columns'''
    unique_tokens = set()
    for text in df['text']:
        unique_tokens.update(text.split())
    unique_tokens = list(unique_tokens)

    # create the vector of size len(unique)
    vector_size = len(unique_tokens)
    vector = np.zeros((df.shape[0], vector_size))
    for i, text in enumerate(df['text']):
        for token in text.split():
            if token in unique_tokens:
                vector[i, unique_tokens.index(token)] = 1

    df[unique_tokens] = pd.DataFrame(vector, columns=unique_tokens)
    # df['labels'] = df['label']
    return df

# reviews = preprocessing(reviews)
# reviews = vectorisation(reviews)




In [25]:
reviews

Unnamed: 0,text,label,fantastic,i,love,not,expected,works,thing,highly,well,recommend,bought,it,worst,absolutely,improved,product,labels
0,i love product,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,worst thing i bought,0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,it works well improved,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1
3,absolutely fantastic highly recommend,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,not i expected,0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [26]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(reviews[unique_tokens], reviews['labels'])

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [31]:
new_reviews = 'this is bad review'
df = pd.DataFrame([new_reviews], columns=['text'])
new_reviews = vectorisation(df)
model.predict(new_reviews[unique_tokens])


KeyError: "None of [Index(['fantastic', 'i', 'love', 'not', 'expected', 'works', 'thing', 'highly',\n       'well', 'recommend', 'bought', 'it', 'worst', 'absolutely', 'improved',\n       'product'],\n      dtype='object')] are in the [columns]"

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
print(X.shape)

(4, 9)


In [33]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [None]:
# bow from skelearn
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
print(X.shape)