In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import string

In [18]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
# ASCII punctuation characters as one string; spacy_tokenizer consults it to filter punctuation tokens.
punctuations = string.punctuation


In [7]:
import spacy
# Load the spaCy pipeline package "en_core_web_md"; `nlp` is the callable that
# spacy_tokenizer applies to each sentence.
nlp = spacy.load("en_core_web_md")

In [24]:
# Toy sentiment corpus: each entry is a (sentence, label) pair, label is "pos" or "neg".
train = [
    ("I love this sandwich.", "pos"),
    ("this is an amazing place!", "pos"),
    ("I feel very good about these beers.", "pos"),
    ("this is my best work.", "pos"),
    ("what an awesome view", "pos"),
    ("I do not like this restaurant", "neg"),
    ("I am tired of this stuff.", "neg"),
    ("I can't deal with this", "neg"),
    ("he is my sworn enemy!", "neg"),
    ("my boss is horrible.", "neg"),
]

# Labelled sentences kept apart from `train`; the notebook scores predictions against them.
test = [
    ("the beer was good.", "pos"),
    ("I do not enjoy my job", "neg"),
    ("I ain't feelin dandy today.", "neg"),
    ("I feel amazing!", "pos"),
    ("Gary is a good friend of mine.", "pos"),
    ("I can't believe I'm doing this.", "neg"),
]

In [33]:
# Training inputs: the sentence half of every (sentence, label) pair.
X_train = [sentence for sentence, _ in train]
X_train

['I love this sandwich.',
 'this is an amazing place!',
 'I feel very good about these beers.',
 'this is my best work.',
 'what an awesome view',
 'I do not like this restaurant',
 'I am tired of this stuff.',
 "I can't deal with this",
 'he is my sworn enemy!',
 'my boss is horrible.']

In [34]:
# Training targets: the label half of every (sentence, label) pair.
y_train = [label for _, label in train]
y_train

['pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg']

In [36]:
# Test inputs: the sentence half of every (sentence, label) pair.
X_test = [sentence for sentence, _ in test]
X_test

['the beer was good.',
 'I do not enjoy my job',
 "I ain't feelin dandy today.",
 'I feel amazing!',
 'Gary is a good friend of mine.',
 "I can't believe I'm doing this."]

In [38]:
# Test targets: the label half of every (sentence, label) pair.
y_test = [label for _, label in test]
y_test

['pos', 'neg', 'neg', 'pos', 'pos', 'neg']

In [44]:
# Custom transformer
class Cleaner(TransformerMixin):
    """Stateless text-normalization step usable inside a scikit-learn Pipeline.

    Every input string has its surrounding whitespace removed and is
    converted to lowercase. Nothing is learned during ``fit``.
    """

    def clean_text(self, text):
        """Return ``text`` stripped of leading/trailing whitespace and lowercased."""
        trimmed = text.strip()
        return trimmed.lower()

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self

    def transform(self, X, **transform_params):
        """Apply ``clean_text`` to each element of ``X`` and return the results as a list."""
        return list(map(self.clean_text, X))

    def get_params(self, deep=True):
        """Return an empty dict: this transformer exposes no parameters."""
        return dict()

In [49]:
# Run the Cleaner on its own: strip and lowercase every training sentence.
# cleaned_X_train is the corpus later passed to vectorizer.fit_transform.
cleaner = Cleaner()
cleaned_X_train = cleaner.transform(X_train)
cleaned_X_train

['i love this sandwich.',
 'this is an amazing place!',
 'i feel very good about these beers.',
 'this is my best work.',
 'what an awesome view',
 'i do not like this restaurant',
 'i am tired of this stuff.',
 "i can't deal with this",
 'he is my sworn enemy!',
 'my boss is horrible.']

In [48]:
# These tokens could also be replaced with their word vectors.
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with spaCy and return its normalized content tokens.

    Each spaCy token is mapped to its lowercased, whitespace-stripped lemma.
    When the lemma is the placeholder ``"-PRON-"`` (the value spaCy 2.x
    reports for pronouns), the token's lowercase surface form is used instead.

    A normalized token is then discarded when it is a stop word, is empty, or
    consists solely of punctuation characters. Checking character by
    character (rather than testing the whole token against the punctuation
    string) also removes multi-character punctuation such as "...", "--" or
    "!!", which would otherwise end up as vocabulary entries.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        list[str]: The remaining normalized tokens, in document order.
    """
    doc = nlp(sentence)
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in doc
    ]
    return [
        lemma
        for lemma in lemmas
        # all() over an empty string is True, so empty tokens are dropped too.
        if lemma not in stopwords and not all(ch in punctuations for ch in lemma)
    ]

# Create the vectorizer that produces bag-of-words feature vectors; it uses our custom tokenizer.
# token_pattern=None: the regex is ignored once a tokenizer is supplied, and leaving the
# default in place makes scikit-learn emit a "token_pattern will not be used" UserWarning on fit.
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None, ngram_range=(1, 1))

In [51]:
# Learn the vocabulary from the cleaned training sentences and build their count matrix
vectorized_X_train = vectorizer.fit_transform(cleaned_X_train)

# Vocabulary terms, one per matrix column
feature_names = vectorizer.get_feature_names_out()

# Show the vocabulary followed by the dense form of the count matrix
print("\nFeature names:", feature_names, sep="\n")
print("Transformed matrix:", vectorized_X_train.toarray(), sep="\n")


Feature names:
['amazing' 'awesome' 'beer' 'boss' 'deal' 'enemy' 'feel' 'good' 'horrible'
 'like' 'love' 'place' 'restaurant' 'sandwich' 'stuff' 'sworn' 'tired'
 'view' 'work']
Transformed matrix:
[[0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]]


In [52]:
# LinearSVC with default hyperparameters; it becomes the final step of the pipeline.
# NOTE(review): no random_state is passed — consider fixing one if run-to-run reproducibility matters.
classifier = LinearSVC()

In [53]:
# Chain the stages end to end: normalize text -> tokenize and count -> classify
pipe = Pipeline(
    steps=[
        ("cleaner", Cleaner()),
        ("vectorizer", vectorizer),
        ("classifier", classifier),
    ]
)

In [56]:
# Train the model and measure its accuracy.
# Pipeline.fit returns the fitted pipeline, so prediction can be chained directly.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Show every labelled test sample next to the label the model predicted for it.
for sample, pred in zip(test, y_pred):
    print(sample, pred)

# In the recorded run all six test sentences were classified correctly (Accuracy: 1.0).
print("Accuracy:", accuracy_score(y_test, y_pred))

('the beer was good.', 'pos') pos
('I do not enjoy my job', 'neg') neg
("I ain't feelin dandy today.", 'neg') neg
('I feel amazing!', 'pos') pos
('Gary is a good friend of mine.', 'pos') pos
("I can't believe I'm doing this.", 'neg') neg
Accuracy: 1.0


