## Bag of words & TF-IDF

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-processed train / test splits (CSV files in the working directory).
# Later cells read the 'code' column as the input text and 'language' as the label.
train = pd.read_csv('train-preprocessed.csv')
test = pd.read_csv('test-preprocessed.csv')

## Tokenization

This notebook compares `Bag of Words` and `TF-IDF` features under two tokenization schemes: scikit-learn's default word-level tokenization, and plain whitespace splitting (the pre-processed data already consist of space-separated tokens).

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [4]:
# Target labels: the 'language' column of each split.
y_train, y_test = train['language'], test['language']

In [5]:
# Two bag-of-words variants sharing the same document-frequency pruning
# (max_df=0.7 / min_df=0.003):
#   * `vectorizer`       - scikit-learn's default word-level token pattern;
#   * `vectorizer_split` - plain whitespace splitting of each document.
vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
# `str.split` splits on whitespace exactly like `lambda x: x.split()` but is
# picklable. `token_pattern=None` states that the regex is intentionally unused,
# which avoids scikit-learn's "token_pattern will not be used" UserWarning on fit.
vectorizer_split = CountVectorizer(
    max_df=0.7, min_df=0.003, tokenizer=str.split, token_pattern=None
)

In [6]:
# Fit the default (word-level) vocabulary on the training code only, then
# encode the test code with that same vocabulary. Tuple elements are evaluated
# left to right, so the fit happens before the test transform.
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(train['code']),
    vectorizer.transform(test['code']),
)

In [7]:
# Multinomial Naive Bayes on the word-level bag-of-words features.
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)
preds = mnb.predict(X_test_vectorized)
print(classification_report(y_true=y_test, y_pred=preds, zero_division=0))

              precision    recall  f1-score   support

           c       0.84      0.93      0.89       229
         cpp       0.87      0.85      0.86       230
         css       0.92      0.96      0.94       220
     haskell       0.91      0.87      0.89       221
        html       0.88      0.84      0.86       240
        java       0.95      0.96      0.96       216
  javascript       0.84      0.90      0.87       219
         lua       0.93      0.88      0.90       218
        objc       0.95      0.92      0.94       243
        perl       0.90      0.93      0.92       225
         php       0.76      0.80      0.78       237
      python       0.95      0.88      0.91       225
           r       0.89      0.87      0.88       214
        ruby       0.95      0.80      0.87       208
       scala       0.88      0.95      0.92       198
      sqlite       0.94      0.96      0.95       209
       swift       0.93      0.95      0.94       210

    accuracy              

In [8]:
# Repeat the bag-of-words experiment with whitespace tokenization.
# Note: this rebinds X_train_vectorized / X_test_vectorized / mnb / preds.
X_train_vectorized, X_test_vectorized = (
    vectorizer_split.fit_transform(train['code']),
    vectorizer_split.transform(test['code']),
)

mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)
preds = mnb.predict(X_test_vectorized)
print(classification_report(y_true=y_test, y_pred=preds, zero_division=0))



              precision    recall  f1-score   support

           c       0.84      0.99      0.91       229
         cpp       0.96      0.85      0.90       230
         css       0.96      0.94      0.95       220
     haskell       0.91      0.90      0.90       221
        html       0.89      0.93      0.91       240
        java       0.94      0.95      0.95       216
  javascript       0.89      0.93      0.91       219
         lua       0.94      0.93      0.94       218
        objc       0.99      0.96      0.97       243
        perl       0.94      0.96      0.95       225
         php       0.92      0.94      0.93       237
      python       0.92      0.89      0.91       225
           r       0.90      0.93      0.92       214
        ruby       0.95      0.84      0.89       208
       scala       0.90      0.94      0.92       198
      sqlite       0.97      0.93      0.95       209
       swift       1.00      0.96      0.98       210

    accuracy              

In [9]:
# Hand-written, space-separated snippet used as a quick sanity check of the model.
text = "def foo ( bar ) : return bar"

In [10]:
# Encode the single snippet with the already-fitted whitespace-token vectorizer
# (transform expects an iterable of documents, hence the one-element list).
vectorized = vectorizer_split.transform([text])

In [11]:
# Predict the language of the sample snippet; `mnb` here is the Naive Bayes model
# most recently fitted above (on the whitespace-token features).
mnb.predict(vectorized)

array(['scala'], dtype='<U10')

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [13]:
# TF-IDF over whitespace-delimited tokens feeding a random forest.
pipe = Pipeline(
    steps=[
        # Raw string: '\S' inside a plain literal is an invalid escape sequence
        # (DeprecationWarning, SyntaxWarning on newer Python versions).
        ('tfidf', TfidfVectorizer(token_pattern=r'\S+')),
        # Fixed seed so the reported metrics are reproducible across runs;
        # same seed as the random forest in the stop-word experiment below.
        ('clf', RandomForestClassifier(random_state=42))
    ]
)

In [14]:
# Fit the whole pipeline (TF-IDF vocabulary + classifier) on the training split only.
pipe.fit(train['code'], y_train)

In [15]:
# Evaluate the fitted TF-IDF + random-forest pipeline on the held-out split.
preds = pipe.predict(test['code'])
print(classification_report(y_true=y_test, y_pred=preds, zero_division=0))

              precision    recall  f1-score   support

           c       0.89      0.99      0.93       229
         cpp       0.99      0.86      0.92       230
         css       0.96      0.99      0.97       220
     haskell       0.99      0.98      0.98       221
        html       0.96      0.97      0.96       240
        java       0.97      0.97      0.97       216
  javascript       0.94      0.98      0.96       219
         lua       1.00      0.97      0.99       218
        objc       1.00      0.97      0.99       243
        perl       1.00      0.97      0.98       225
         php       0.98      0.95      0.97       237
      python       0.97      0.93      0.95       225
           r       0.99      0.99      0.99       214
        ruby       0.92      0.98      0.95       208
       scala       0.97      1.00      0.98       198
      sqlite       0.96      0.98      0.97       209
       swift       1.00      0.98      0.99       210

    accuracy              

## Stop-word generation attempt

https://www.researchgate.net/publication/318969652_AN_AUTO-GENERATED_APPROACH_OF_STOP_WORDS_USING_AGGREGATED_ANALYSIS

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Based on: https://github.com/bnriiitb/autostopwordgen

class AutoStopWordsGen:
    """
    Derive a corpus-specific stop-word list from aggregated token statistics.

    Documents are tokenized on whitespace. For every vocabulary entry the total
    frequency, idf, a frequency-derived "prob", an entropy term and a variance
    term ("vp") are tabulated; stop words are the tokens seen fewer than twice
    plus the tokens that are simultaneously very frequent, low-entropy and low-vp.
    """

    def __init__(self, corpus):
        """
        Fit a count vectorizer and a TF-IDF vectorizer on the corpus.

        :param corpus: iterable of text documents
        """
        logging.info('initializing the AutoStopwordsGen started')
        # `str.split` is the whitespace tokenizer; `token_pattern=None` marks the
        # regex as intentionally unused so scikit-learn does not warn about it.
        cv = CountVectorizer(tokenizer=str.split, token_pattern=None)
        cvft = cv.fit_transform(corpus)
        tfidfcv = TfidfVectorizer(tokenizer=str.split, token_pattern=None)
        # Only the fitted vocabulary and idf_ are needed, not the TF-IDF matrix.
        tfidfcv.fit(corpus)
        self.tfidfcv = tfidfcv
        self.cvft = cvft
        logging.info('initializing the AutoStopwordsGen completed')

    def get_stopwords(self, top_n=.95, last_n=.1):
        """
        Build the stop-word list.

        :param top_n: quantile (0..1) of the frequency column; tokens at or above
                      it count as "very frequent"
        :param last_n: quantile (0..1) of the entropy and vp columns; tokens at or
                       below it count as "low entropy" / "low vp"
        :return: alphabetically sorted list of unique stop words
        """
        logging.info('generating stopwords started')
        word_freq_df = pd.DataFrame({
            'word': self.tfidfcv.get_feature_names_out(),
            'frequency': np.asarray(self.cvft.sum(axis=0)).ravel().tolist(),
            # Tabulated for inspection; idf does not take part in the aggregation below.
            'idf': self.tfidfcv.idf_,
        }).sort_values(by=['frequency'], ascending=False)

        vocab_size = word_freq_df.shape[0]
        # NOTE(review): frequencies are divided by the vocabulary size, not by the
        # total token count, so `prob` can exceed 1 and the entropy term can be
        # negative. Left unchanged so the generated list stays the same - confirm
        # against the referenced paper before altering.
        word_freq_df['prob'] = word_freq_df.frequency / vocab_size
        word_freq_df['entropy'] = word_freq_df.prob * np.log(1 / word_freq_df.prob)
        word_freq_df['vp'] = np.power(word_freq_df.prob - word_freq_df.prob.mean(), 2) / vocab_size

        def _words_where(mask):
            # Set of tokens on the rows selected by a boolean mask.
            return set(word_freq_df.loc[mask, 'word'].tolist())

        # Tokens in the top tail of the frequency distribution.
        frequent = _words_where(
            word_freq_df['frequency'] >= word_freq_df['frequency'].quantile(top_n)
        )
        # Tokens in the bottom tail of the entropy / vp distributions.
        low_entropy = _words_where(
            word_freq_df['entropy'] <= word_freq_df['entropy'].quantile(last_n)
        )
        low_vp = _words_where(
            word_freq_df['vp'] <= word_freq_df['vp'].quantile(last_n)
        )

        # Tokens that occur fewer than two times in the whole corpus.
        rare = _words_where(word_freq_df['frequency'] < 2)

        # Sorted so the result does not depend on set iteration order, which
        # can differ from one interpreter run to the next for strings.
        very_high_aggregation = sorted(rare | (frequent & low_entropy & low_vp))
        logging.info('# stopwords generated :: '+str(len(very_high_aggregation)))
        logging.info('generating stopwords completed')
        return very_high_aggregation

In [17]:
# Fit the stop-word generator's vectorizers on the training code only.
asw = AutoStopWordsGen(train['code'])

2023-04-13 18:52:54,347 : INFO : initializing the AutoStopwordsGen started
2023-04-13 18:52:55,362 : INFO : initializing the AutoStopwordsGen completed


In [18]:
# Generate the stop-word list with the default thresholds (top_n=.95, last_n=.1).
stop_words = asw.get_stopwords()

2023-04-13 18:52:55,425 : INFO : generating stopwords started
2023-04-13 18:52:55,605 : INFO : # stopwords generated :: 42950
2023-04-13 18:52:55,606 : INFO : generating stopwords completed


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [20]:
# TF-IDF on whitespace tokens with the generated stop words removed, followed by
# a seeded random forest. Note: this rebinds `vectorizer`, `X_train_vectorized`,
# `X_test_vectorized` and `preds` from the earlier cells.
# `str.split` is the whitespace tokenizer; `token_pattern=None` marks the regex
# as intentionally unused so scikit-learn does not emit its
# "token_pattern will not be used" UserWarning.
vectorizer = TfidfVectorizer(tokenizer=str.split, token_pattern=None, stop_words=stop_words)
X_train_vectorized = vectorizer.fit_transform(train['code'])
X_test_vectorized = vectorizer.transform(test['code'])
clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train_vectorized, y_train)
preds = clf.predict(X_test_vectorized)
print(classification_report(y_test, preds, zero_division=0))



              precision    recall  f1-score   support

           c       0.93      0.99      0.96       229
         cpp       1.00      0.90      0.94       230
         css       0.96      0.99      0.97       220
     haskell       0.99      0.97      0.98       221
        html       0.96      0.94      0.95       240
        java       0.97      0.98      0.97       216
  javascript       0.96      0.98      0.97       219
         lua       0.99      0.97      0.98       218
        objc       0.99      0.98      0.98       243
        perl       1.00      0.98      0.99       225
         php       0.99      0.97      0.98       237
      python       0.95      0.94      0.94       225
           r       0.98      0.99      0.98       214
        ruby       0.93      0.98      0.95       208
       scala       0.99      1.00      0.99       198
      sqlite       0.96      0.99      0.97       209
       swift       1.00      0.98      0.99       210

    accuracy              

From this we can conclude that removing the `42950` stop words generated by this algorithm (the count reported in the log above) shrinks the vocabulary while leaving classification quality virtually unchanged. This stop-word list is therefore worth trying in future experiments.

In [21]:
import json

In [22]:
# Persist the generated stop words as a JSON array for reuse in later experiments.
# Sorting before writing keeps the file content independent of the order in
# which the list was produced, so re-runs yield an identical file.
with open('stop-words.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(w.strip() for w in stop_words), f)