In [20]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
nlp

<spacy.lang.en.English at 0x7fc8710482e0>

In [5]:
!pip list

Package                   Version
------------------------- ------------
altair                    5.1.2
anyio                     4.0.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.0
async-lru                 2.0.4
attrs                     23.1.0
Babel                     2.13.0
backcall                  0.2.0
beautifulsoup4            4.12.2
bleach                    6.0.0
blinker                   1.6.2
blis                      0.7.9
Bottleneck                1.3.5
brotlipy                  0.7.0
cachetools                5.3.1
catalogue                 2.0.7
certifi                   2023.7.22
cffi                      1.15.1
charset-normalizer        2.0.4
click                     8.0.4
colorama                  0.4.6
comm                      0.1.4
confection                0.0.4
contourpy                 1.0.5
cryptography              41.0.3
cycler                    0.11.0
cymem              

In [9]:
from utils import SentimentTrain

In [11]:
st = SentimentTrain('data')

In [12]:
st.data_path

'/usr/src/nlp/sentiment/data'

In [16]:
# Build the review DataFrame through the project helper; later cells read its
# 'Message' (text) and 'Target' (label) columns.
df_reviews = st.prepareData()
# A custom tokenizer replaces the vectorizer's regex tokenisation entirely, so
# token_pattern is set to None to avoid scikit-learn's "token_pattern will not
# be used since 'tokenizer' is not None" warning on every fit.
tfvectorizer = TfidfVectorizer(tokenizer=st.spacy_tokenizer, token_pattern=None)
# verbose=True makes the solver print its progress while fitting.
classifier_LG = LogisticRegression(verbose=True)

In [18]:
# Two-stage text classifier: TF-IDF features feed a logistic regression.
# verbose=True prints the elapsed time of each step as it is fitted.
pipe2_LG = Pipeline(
    steps=[
        ('vectorizer', tfvectorizer),
        ('classifier', classifier_LG),
    ],
    verbose=True,
)

In [21]:
# Features are the raw review texts, labels are the integer 'Target' column.
X = df_reviews['Message']
ylabels = df_reviews['Target']
# Hold out 30% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)
# Fitting runs the vectorizer's tokenizer over every training document before
# the classifier is trained, so this is the slow cell of the notebook.
pipe2_LG.fit(X_train,y_train)



KeyboardInterrupt: 

In [None]:
def spacy_tokenizer(self, doc):
        """Tokenize and normalise one document; usable as a vectorizer tokenizer.

        The text is parsed with the spaCy 'en_core_web_sm' model. Every
        non-whitespace token is replaced by its lower-cased lemma, then stop
        words and punctuation-only lemmas are removed.

        The spaCy model is loaded on the first call only and cached on ``self``
        (attribute ``_spacy_nlp``). A vectorizer calls its tokenizer once per
        document, so reloading the model on each call would repeat the model
        load for every row of the corpus.

        Args:
            doc(str): sentence to tokenize.
        Returns:
            list: preprocessed tokens.
        """
        # Reuse the model loaded by a previous call when there is one.
        nlp = getattr(self, '_spacy_nlp', None)
        if nlp is None:
            nlp = spacy.load('en_core_web_sm')
            try:
                self._spacy_nlp = nlp
            except AttributeError:
                # `self` cannot hold attributes (e.g. the function was called
                # with None); keep working, just without the cache.
                pass

        # A set gives per-character membership. Testing a whole lemma with
        # `in string.punctuation` is a substring test, which lets repeated
        # punctuation such as "..." or "!!" through.
        punctuations = set(string.punctuation)
        stop_words = spacy.lang.en.stop_words.STOP_WORDS

        # Lemmatizing each token and converting each lemma into lowercase
        lemmas = [word.lemma_.lower() for word in nlp(doc) if not word.is_space]

        # Removing stop words and lemmas made up only of punctuation characters
        # (an empty lemma counts as punctuation-only and is dropped as well).
        return [
            lemma for lemma in lemmas
            if lemma not in stop_words
            and not all(char in punctuations for char in lemma)
        ]

In [22]:
import string
# All ASCII punctuation characters as one string; the manual tokenization cell
# below tests lemmas against it.
punctuations = string.punctuation
# NOTE(review): `nlp` was already loaded with the same model in an earlier
# cell; this reload only matters when the cells are run out of order.
nlp = spacy.load('en_core_web_sm')

In [23]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [45]:
# info() prints its summary itself, so it can run first; head() must be the
# final expression of the cell, otherwise its result is discarded and the
# preview is never rendered.
df_reviews.info()
df_reviews.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2745 entries, 0 to 998
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Message  2745 non-null   object
 1   Target   2745 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 64.3+ KB


In [31]:
# Manual walk-through of the tokenizer steps on one sample sentence:
# lower-cased lemmas of all non-whitespace tokens, minus stop words and
# punctuation.
tokens = [
    lemma
    for lemma in (
        word.lemma_.lower()
        for word in nlp('Now I am getting angry and I want my damn pho.')
        if not word.is_space
    )
    if lemma not in stop_words and lemma not in punctuations
]
# Show the preprocessed list of tokens
tokens

['angry', 'want', 'damn', 'pho']

In [61]:
# Fresh, unfitted vectorizer for inspecting the TF-IDF output on its own.
# token_pattern=None because the custom tokenizer makes the default regex
# unused; leaving it set triggers a scikit-learn warning on every fit.
tfvectorizer = TfidfVectorizer(tokenizer=st.spacy_tokenizer, token_pattern=None)
# Small hand-written sample, kept for quick experiments:
# documents = ["Crust is not good",
#              "Not tasty and the texture was just nasty",
#             "Now I am getting angry and I want my damn pho."]
X = df_reviews['Message']

In [55]:
X[:100]

0                                    Crust is not good.
1             Not tasty and the texture was just nasty.
2     Stopped by during the late May bank holiday of...
3     The selection on the menu was great and so wer...
4        Now I am getting angry and I want my damn pho.
                            ...                        
95                                  Will be back again!
96                                Food arrived quickly!
97                                     It was not good.
98    On the up side, their cafe serves really good ...
99    Our server was fantastic and when he found out...
Name: Message, Length: 100, dtype: object

In [57]:
tfidf_matrix = tfvectorizer.fit_transform(X[:100])

In [58]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.70710678, 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.57735027, 0.57735027,
        0.        ],
       [0.        , 0.35355339, 0.        , 0.        , 0.        ,
        0.        , 0.35355339, 0.35355339, 0.35355339, 0.        ,
        0.        , 0.        , 0.        , 0.35355339, 0.35355339,
        0.        , 0.35355339, 0.35355339, 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.       

In [59]:
tfvectorizer.get_feature_names_out()

array(['angry', 'bank', 'crust', 'damn', 'good', 'great', 'holiday',
       'late', 'love', 'menu', 'nasty', 'pho', 'price', 'recommendation',
       'rick', 'selection', 'steve', 'stop', 'tasty', 'texture', 'want'],
      dtype=object)