# rake
Test scripts.
These will be used later.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
#https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# find_optimal_clusters reports progress whenever the cluster count is a
# multiple of this value.
print_every = 50
# Passed as MiniBatchKMeans(init_size=...) by the clustering cells below.
init_size = 2000
# Passed as MiniBatchKMeans(batch_size=...) by the clustering cells below.
batch_size = 4000

def drop_empty_rows(df):
    """Replace every empty-string cell of ``df`` with NaN, in place.

    Despite its name the function removes no rows: it only marks empty
    strings as missing values. It returns ``None``.
    """
    missing = float("NaN")
    df.replace(to_replace="", value=missing, inplace=True)
    

def levenshtein_distance(s, t):
    """Return the Levenshtein edit distance between sequences ``s`` and ``t``.

    Iterative two-row dynamic programme (after the Wikipedia article): only
    the previous and the current row of the edit matrix are kept.
    """
    if s == t:
        return 0
    if not len(s):
        return len(t)
    if not len(t):
        return len(s)

    # Row for the empty prefix of s: building t[:j] from "" costs j insertions.
    previous = list(range(len(t) + 1))
    current = [None] * (len(t) + 1)
    for row, s_item in enumerate(s):
        # Deleting the first row + 1 items of s yields the empty prefix of t.
        current[0] = row + 1
        for col, t_item in enumerate(t):
            substitution = previous[col] + (0 if s_item == t_item else 1)
            current[col + 1] = min(current[col] + 1, previous[col + 1] + 1, substitution)
        previous[:] = current

    return current[len(t)]
        
def find_optimal_clusters(data, max_k,column,init_size,batch_size):
    """Plot the MiniBatchKMeans inertia (SSE) for every even k from 2 to ``max_k``.

    ``column`` is only used to label the progress message, which is printed
    whenever k is a multiple of the module-level ``print_every``.
    ``init_size`` and ``batch_size`` are forwarded to MiniBatchKMeans.
    Nothing is returned; the curve is shown with matplotlib.
    """
    cluster_counts = range(2, max_k + 1, 2)

    inertias = []
    for n_clusters in cluster_counts:
        model = MiniBatchKMeans(
            n_clusters=n_clusters,
            init_size=init_size,
            batch_size=batch_size,
            random_state=20,
        )
        inertias.append(model.fit(data).inertia_)
        if n_clusters % print_every == 0:
            print('Fit {} clusters for  column: {}'.format(n_clusters, column))

    _, axis = plt.subplots(1, 1)
    axis.plot(cluster_counts, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(cluster_counts)
    axis.set_xticklabels(cluster_counts)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')
    plt.show()
    
def plot_tsne_pca(data, labels,column):
    """Scatter-plot a random sample of ``data`` in PCA and t-SNE space.

    At most 3000 rows of the sparse matrix ``data`` are sampled for the
    projections and at most 500 of those are drawn, coloured by ``labels``
    (an array indexable by row position). ``column`` is appended to the plot
    titles.

    Returns a DataFrame holding the 5 left singular vectors of ``data``
    (from ``scipy.sparse.linalg.svds``) plus a ``label`` column.
    """
    from scipy.sparse.linalg import svds

    max_label = max(labels)
    n_rows = data.shape[0]
    max_items = np.random.choice(range(n_rows), size=min(3000, n_rows), replace=False)

    # Densify the sample once. toarray() yields a plain ndarray; todense()
    # yields np.matrix, which recent scikit-learn versions refuse.
    sample = data[max_items, :].toarray()
    pca = PCA(n_components=2).fit_transform(sample)
    tsne = TSNE().fit_transform(PCA(n_components=20).fit_transform(sample))

    # Cap the drawn subset at the sample size: choice(..., replace=False)
    # raises when asked for more items than exist.
    idx = np.random.choice(range(pca.shape[0]), size=min(500, pca.shape[0]), replace=False)
    label_subset = labels[max_items]
    label_subset = [cm.hsv(i/max_label) for i in label_subset[idx]]

    f, ax = plt.subplots(1, 2, figsize=(14, 6))

    ax[0].scatter(pca[idx, 0], pca[idx, 1], c=label_subset)
    ax[0].set_title('PCA Cluster Plot ' + column)

    ax[1].scatter(tsne[idx, 0], tsne[idx, 1], c=label_subset)
    ax[1].set_title('TSNE Cluster Plot ' + column)
    plt.show()

    # data*1.0 converts integer counts to float, which svds requires.
    um, sm, vm = svds(data*1.0, k=5)
    um = pd.DataFrame(um)
    um['label'] = labels
    return um
    
def get_top_keywords(data, clusters, labels, n_terms,column):
    """Print the ``n_terms`` highest-weighted terms of every cluster.

    ``data`` is a sparse document-term matrix, ``clusters`` the cluster id of
    each row and ``labels`` the term for each column. Terms are printed from
    highest to lowest mean weight, so the output is the same on every run.
    ``column`` only labels the printed header.
    """
    cluster_means = pd.DataFrame(data.todense()).groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        print('\nCluster {} column: {}'.format(cluster_id, column))
        # argsort is ascending: keep the last n_terms positions, best first.
        top_positions = np.argsort(mean_weights.values)[-n_terms:][::-1]
        print(','.join(labels[t] for t in top_positions))

            
# Module-level TF-IDF vectorizer.
# NOTE: the "text + sentiment tfidf" cell further down rebinds `tfidf` to a
# differently configured vectorizer, so these settings apply only until then.
tfidf = TfidfVectorizer(
    min_df = 1,
    max_df = 0.95,
    stop_words = 'english',    
    max_features = 450
)

import pandas as pd


#RAKE/rake.py /
#@polymeris polymeris FIX stop_words_path in Rake class initialization
#@idf@mikeiannacone@aneesha@GMadorell@polymeris
  
# Implementation of RAKE - Rapid Automatic Keyword Extraction algorithm
# as described in:
# Rose, S., D. Engel, N. Cramer, and W. Cowley (2010). 
# Automatic keyword extraction from individual documents. 
# In M. W. Berry and J. Kogan (Eds.), Text Mining: Applications and Theory. John Wiley and Sons, Ltd.

import re
import operator

# Verbose-output flag read by the RAKE demo below (rebound to True before the demo runs).
debug = False


def is_number(s):
    """Return True when the string ``s`` parses as a number, otherwise False.

    Strings containing a '.' are parsed with ``float``; all others with ``int``.
    """
    parse = float if '.' in s else int
    try:
        parse(s)
    except ValueError:
        return False
    return True


def load_stop_words(stop_word_file):
    """
    Utility function to load stop words from a file and return as a list of words.
    Lines whose first non-blank character is "#" are skipped as comments; any
    other line may hold several whitespace-separated words.
    @param stop_word_file Path and file name of a file containing stop words.
    @return list A list of stop words.
    """
    stop_words = []
    # The context manager closes the file handle even if reading fails.
    with open(stop_word_file) as stop_word_lines:
        for line in stop_word_lines:
            if not line.strip().startswith("#"):
                stop_words.extend(line.split())  # in case more than one per line
    return stop_words


def separate_words(text, min_word_return_size):
    """
    Utility function to return the lower-cased words of a text that are longer
    than a given number of characters and are not numbers.
    @param text The text that must be split in to words.
    @param min_word_return_size A word is kept only if it has more characters than this.
    @return list The retained words, in order of appearance.
    """
    words = []
    for token in re.split('[^a-zA-Z0-9_\\+\\-/]', text):
        candidate = token.strip().lower()
        if len(candidate) <= min_word_return_size or candidate == '':
            continue
        # Numbers stay in the phrase but are not counted as words, since they
        # tend to invalidate the scores of their phrases.
        if is_number(candidate):
            continue
        words.append(candidate)
    return words


def split_sentences(text):
    """
    Utility function to cut a text into sentence-like fragments at punctuation
    marks and at hyphens surrounded by whitespace.
    @param text The text that must be split in to sentences.
    @return list The fragments between delimiters (surrounding whitespace is kept).
    """
    return re.split(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s', text)


def build_stop_word_regex(stop_word_file_path):
    """Compile one case-insensitive regex matching any stop word from a file.

    @param stop_word_file_path Path of the stop word file (read with load_stop_words).
    @return A compiled pattern; it matches nothing when the file holds no stop words.
    """
    stop_word_list = load_stop_words(stop_word_file_path)
    if not stop_word_list:
        # An empty alternation would match the empty string at every position;
        # "(?!)" is a pattern that can never match.
        return re.compile(r'(?!)', re.IGNORECASE)
    stop_word_regex_list = [
        # re.escape keeps regex metacharacters in a stop word literal; the
        # look-ahead stops a match when a word character or hyphen follows.
        r'\b' + re.escape(word) + r'(?![\w-])'
        for word in stop_word_list
    ]
    return re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)


def generate_candidate_keywords(sentence_list, stopword_pattern):
    """Split each sentence at stop words and return the remaining phrases.

    Every stop-word match is replaced by "|" and the sentence is cut at each
    "|"; the pieces are stripped, lower-cased and kept when non-empty.
    """
    phrase_list = []
    for sentence in sentence_list:
        marked = re.sub(stopword_pattern, '|', sentence.strip())
        fragments = (piece.strip().lower() for piece in marked.split("|"))
        phrase_list.extend(fragment for fragment in fragments if fragment != "")
    return phrase_list


def calculate_word_scores(phraseList):
    """Return the RAKE score deg(w) / freq(w) for every word of the phrases.

    freq(w) counts the occurrences of a word; deg(w) adds, for every
    occurrence, the number of words in the containing phrase (the word
    itself included).
    """
    word_frequency = {}
    word_degree = {}
    for phrase in phraseList:
        words = separate_words(phrase, 0)
        other_words = len(words) - 1
        for word in words:
            word_frequency[word] = word_frequency.get(word, 0) + 1
            word_degree[word] = word_degree.get(word, 0) + other_words

    # Adding the frequency accounts for the word's own occurrences in deg(w).
    return {
        word: (word_degree[word] + frequency) / (frequency * 1.0)
        for word, frequency in word_frequency.items()
    }


def generate_candidate_keyword_scores(phrase_list, word_score):
    """Map every phrase to the sum of the scores of its words.

    Words are obtained with separate_words(phrase, 0) and looked up in
    ``word_score``; a phrase without scorable words gets 0.
    """
    keyword_candidates = {}
    for phrase in phrase_list:
        keyword_candidates[phrase] = sum(
            word_score[word] for word in separate_words(phrase, 0)
        )
    return keyword_candidates


def RAKE(text,__stop_words_pattern):
    """Run the full RAKE pipeline on ``text``.

    Returns a list of (phrase, score) pairs sorted by descending score;
    ``__stop_words_pattern`` is the compiled stop-word regex used to cut the
    sentences into candidate phrases.
    """
    phrases = generate_candidate_keywords(split_sentences(text), __stop_words_pattern)
    scored = generate_candidate_keyword_scores(phrases, calculate_word_scores(phrases))
    return sorted(scored.items(), key=lambda pair: pair[1], reverse=True)

# Demo: run the RAKE steps one by one on a sample abstract, printing the
# intermediate results, then run the same pipeline through RAKE().
debug=True

if True:
    text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types."

    # Split text into sentences
    sentenceList = split_sentences(text)
    #stoppath = "FoxStoplist.txt" #Fox stoplist contains "numbers", so it will not find "natural numbers" like in Table 1.1
    stoppath = "/kaggle/input/smartstoplist/SmartStoplist.txt"  #SMART stoplist misses some of the lower-scoring keywords in Figure 1.5, which means that the top 1/3 cuts off one of the 4.0 score words in Table 1.1
    # Compiled stop-word pattern; later cells reuse this global.
    stopwordpattern = build_stop_word_regex(stoppath)

    # generate candidate keywords
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)

    # calculate individual word scores
    wordscores = calculate_word_scores(phraseList)

    # generate candidate keyword scores
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    if debug: print (keywordcandidates)

    # Rank the candidates by descending score.
    sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
    if debug: print (sortedKeywords)

    totalKeywords = len(sortedKeywords)
    if debug: print (totalKeywords)
    # Show the top third of the ranked candidates.
    print (sortedKeywords[0:(int(totalKeywords / 3) )])

    #rake = Rake("SmartStoplist.txt")
    # Same pipeline through the one-call wrapper.
    keywords = RAKE(text,stopwordpattern)
    print (keywords)


# text + sentiment tfidf 
Test whether the sentiment clusters are separated when the tf-idf data is combined with the sentiment.
> Evidently, once the sentiment word is added to the text, the clusters do separate.

In [None]:
# Reload the training tweets.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv',encoding='utf-8')
# Rebinds the module-level `tfidf` to a unigram vectorizer with otherwise default settings.
tfidf = TfidfVectorizer( ngram_range=(1, 1) )
from sklearn import preprocessing
# Encodes the sentiment strings as integer codes.
le = preprocessing.LabelEncoder()
from scipy.sparse import csc_matrix
# TF-IDF of each tweet with its sentiment word appended (missing text becomes ' ').
X=tfidf.fit_transform(train.text.fillna(' ')+' '+train.sentiment)   
# Multiply X.T by exp(sentiment code); presumably this scales each tweet's
# vector by a per-sentiment factor — TODO confirm the broadcast axis.
X=X.T.multiply(np.exp( np.array( le.fit_transform(train.sentiment) ) ) )
# Transpose back and store in compressed-sparse-column form.
X=csc_matrix(X.T)
# Plot the projections, coloured by sentiment code shifted to start at 1.
plot_tsne_pca(X, le.fit_transform(train.sentiment)+1,'tt')  


# text+sentiment ngram1-2 Countvectorizer
is better

In [None]:
# Reload the training tweets.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv',encoding='utf-8')
from sklearn import preprocessing
# Encodes the sentiment strings as integer codes.
le = preprocessing.LabelEncoder()

from sklearn.feature_extraction.text import CountVectorizer
# Unigram + bigram counts; later cells reuse `vectorizer` and `Xc`.
vectorizer = CountVectorizer(ngram_range=(1, 2))
# Count matrix of each tweet with its sentiment word appended (missing text becomes ' ').
Xc = vectorizer.fit_transform(train.text.fillna(' ') +' '+train.sentiment)
#XcS=Xc.T.multiply(np.exp( np.array( le.fit_transform(train.sentiment) ) ) )
#XcS=csc_matrix(XcS.T)
# Plot the projections, coloured by sentiment code shifted to start at 1.
plot_tsne_pca(Xc, le.fit_transform(train.sentiment)+1,'tt')  


# usvd separation word/text vectors
makes it even better

In [None]:
from scipy.sparse.linalg import svds, eigs
from scipy.sparse import csc_matrix

# Truncated SVD (30 components) of the count matrix; *1.0 converts the counts to float.
u,s,v=svds(Xc*1.0,k=30)
# Plot the document vectors scaled by the singular values, coloured by sentiment code + 1.
plot_tsne_pca(csc_matrix(u*s), le.fit_transform(train.sentiment)+1,'tn')  


In [None]:
# SSE curve for k = 2..20 on the SVD document vectors
# (arguments: max_k=20, column label 2, init_size=20, batch_size=1000).
find_optimal_clusters(u,20,2,20,1000)

In [None]:
# Show the vocabulary indices of the three sentiment words.
print(*(vectorizer.vocabulary_[word] for word in ('positive', 'negative', 'neutral')))

# One label per vocabulary term: 0 everywhere except the sentiment words
# themselves (neutral=1, positive=2, negative=3).
labels = np.zeros(len(v.T), dtype=int)
for code, word in enumerate(('neutral', 'positive', 'negative'), start=1):
    labels[vectorizer.vocabulary_[word]] = code

# FIND POSITIVE NEGATIVE AND NEUTRAL WORDS

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embeddings of the three sentiment words, one per row: neutral, positive, negative.
v123=np.concatenate((v.T[vectorizer.vocabulary_['neutral']],v.T[vectorizer.vocabulary_['positive']],v.T[vectorizer.vocabulary_['negative']])).reshape(3,-1)
# Cosine similarity of every vocabulary term to each sentiment word (columns 0, 1, 2).
v123=pd.DataFrame(cosine_similarity(v.T,v123))

# Label a term 1/2/3 when its similarity to neutral/positive/negative exceeds
# 0.335, else 0. Columns are handled in ascending order so that, when several
# exceed the threshold, the last one wins.
v123['label']=0
for yi in range(3):
    v123.loc[v123[yi]>0.335,'label']=yi+1

labels=v123.label.values
        

In [None]:
# Plot the term vectors (rows of v.T), coloured by the labels computed above.
plot_tsne_pca(csc_matrix(v.T),labels,'tn')  


# EMBEDDING

In [None]:
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv',encoding='utf-8')
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv',encoding='utf-8')
# The test sentiment is replaced by a placeholder class so it is not fed to the vectorizer.
test['sentiment']='unknown'
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Training tweets carry their sentiment word, test tweets do not.
# pd.concat replaces Series.append, which pandas 2.0 removed.
Xc = vectorizer.fit_transform(pd.concat([train.text.fillna(' ') +' '+train.sentiment, test.text.fillna(' ')]))
# Plot the projections, coloured by sentiment code (including 'unknown') + 1.
plot_tsne_pca(Xc, le.fit_transform(pd.concat([train.sentiment, test.sentiment])) +1,'tt')


In [None]:
# Show which sentiment string each of the four integer codes stands for.
le.inverse_transform([ 0, 1, 2,3])

In [None]:
from scipy.sparse.linalg import svds, eigs
from scipy.sparse import csc_matrix

# Truncated SVD (30 components) of the combined train+test count matrix.
u,s,v=svds(Xc*1.0,k=30)
# pd.concat replaces Series.append, which pandas 2.0 removed.
plot_tsne_pca(csc_matrix(u*s), le.fit_transform(pd.concat([train.sentiment, test.sentiment]))+1,'tn')


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embeddings of the three sentiment words, one per row: neutral, positive, negative.
v123=np.concatenate((v.T[vectorizer.vocabulary_['neutral']],v.T[vectorizer.vocabulary_['positive']],v.T[vectorizer.vocabulary_['negative']])).reshape(3,-1)
# Cosine similarity of every vocabulary term to each sentiment word (columns 0, 1, 2).
v123=pd.DataFrame(cosine_similarity(v.T,v123))

# Label a term 1/2/3 when its similarity to neutral/positive/negative exceeds
# 0.335, else 0. Columns are handled in ascending order so that, when several
# exceed the threshold, the last one wins.
v123['label']=0
for yi in range(3):
    v123.loc[v123[yi]>0.335,'label']=yi+1

labels=v123.label.values
        

In [None]:
# Per-term counts over the selected_text of the negative training tweets.
Xn = vectorizer.transform(train.selected_text[train.sentiment=='negative'].fillna(' ') )
Xwn=Xn.sum(axis=0)
# Same for the positive tweets.
Xp = vectorizer.transform(train.selected_text[train.sentiment=='positive'].fillna(' ') )
Xwp=Xp.sum(axis=0)
#[xi for xi in range(26000) if Xwn[:,xi]>Xwp[:,xi] and Xwp[:,xi]>0]
# Same for the neutral tweets.
Xo = vectorizer.transform(train.selected_text[train.sentiment=='neutral'].fillna(' ') )
Xwo=Xo.sum(axis=0)
# Display the neutral totals as the cell output.
Xwo

In [None]:
from scipy.sparse import csc_matrix

# Project the count matrix onto the 30 term components. The sparse matrix's
# own .dot() is used because np.dot is not aware of scipy sparse matrices.
us=Xc.dot(csc_matrix(v.T))
# pd.concat replaces Series.append, which pandas 2.0 removed.
plot_tsne_pca(us, le.fit_transform(pd.concat([train.sentiment, test.sentiment]))+1,'tn')


# find cosine similarity of text vectors

In [None]:
# Integer sentiment code of every train and test row (test rows are 'unknown').
# pd.concat replaces Series.append, which pandas 2.0 removed.
sentiment_codes=le.fit_transform(pd.concat([train.sentiment, test.sentiment]))

# Mean document vector per sentiment code.
usim=pd.DataFrame(u)
usim['label']=sentiment_codes
ugroup=usim.groupby('label').mean()
from sklearn.metrics.pairwise import cosine_similarity
print(u.shape,ugroup.shape)
# Similarity of every document vector to each per-sentiment mean vector.
ucosim=pd.DataFrame( cosine_similarity(u,ugroup) )
ucosim['label']=sentiment_codes
# Most similar of the first three sentiment codes.
ucosim['max']=ucosim.iloc[:,:3].idxmax(axis=1)

print(le.inverse_transform([ 0, 1, 2,3]))
# Cross-tabulate the true code against the most similar code.
ucosim.groupby(['label','max']).count()


In [None]:
# fit_transform returns each document's distance to the 3 cluster centres.
kmea=MiniBatchKMeans(n_clusters=3, init_size=10, batch_size=100, random_state=20).fit_transform(u)
kmea=pd.DataFrame(kmea)
# pd.concat replaces Series.append, which pandas 2.0 removed.
kmea['label']=le.fit_transform(pd.concat([train.sentiment, test.sentiment]))
# Column with the largest value among the three distances.
kmea['max']=kmea.iloc[:,:3].idxmax(axis=1)
kmea.groupby(['label','max']).count()


# find relevant texts

In [None]:
# Load the sample submission; later cells fill its second column with predictions.
subm = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv',encoding='utf-8')
# Display it as the cell output.
subm

# searching kmeans keywords

In [None]:

# Reload the training data; cast the text columns to str and lower-case them.
train = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv',encoding='utf-8')
for text_column in ("text", "selected_text", "sentiment"):
    train[text_column] = train[text_column].astype(str).str.lower()
drop_empty_rows(train)

# Same for the test data, which has no selected_text column to normalise.
test = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/test.csv',encoding='utf-8')
for text_column in ("text", "sentiment"):
    test[text_column] = test[text_column].astype(str).str.lower()
drop_empty_rows(test)

# Columns present in both train and test (the order of this list is not fixed,
# because it comes from a set).
common_cols = list(set.intersection(*(set(df.columns) for df in [train,test])))
# Stack train and test on those shared columns with a fresh 0..n-1 index.
combined = pd.concat([df[common_cols] for df in [train,test]], ignore_index=True)
# Group the combined rows by their sentiment value.
gp = combined.groupby('sentiment') 
  
# Print each sentiment group: its name, its rows
# and how many rows it has.
for name, group in gp: 
    print(name) 
    print(group) 
    print(len(group)) 

# Largest cluster count tried when plotting the SSE curve of each sentiment group.
optimal_clusters = 150
for name, group in gp: 
    # The shared `tfidf` vectorizer is refit on every group's text.
    tfidf.fit(group.text)
    text = tfidf.transform(group.text)   
    find_optimal_clusters(text, optimal_clusters,name,init_size,batch_size)

# One fitted k-means model per sentiment, keyed by the lower-cased sentiment name.
kmeans_collection = {}
n_clusters = 150
for name, group in gp:
    # NOTE(review): the shared `tfidf` is refit for every group, so after the
    # loop it only holds the vocabulary of the group iterated last, while
    # later cells pair it with every model in kmeans_collection.
    tfidf.fit(group.text)
    text = tfidf.transform(group.text)
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, init_size=init_size, batch_size=batch_size, random_state=20)
    kmeans.fit(text)
    clusters = kmeans.predict(text)
    plot_tsne_pca(text, clusters,name)
    # get_feature_names() is gone from recent scikit-learn releases; prefer
    # its replacement when the installed version provides it.
    feature_names = list(
        tfidf.get_feature_names_out()
        if hasattr(tfidf, 'get_feature_names_out')
        else tfidf.get_feature_names()
    )
    get_top_keywords(text, clusters, feature_names, 5,name)
    kmeans_collection[name.lower()] = kmeans


def get_keywords(line,data, clusters, labels, n_terms,column):
    """Return the words of ``line`` that resemble a keyword of the given clusters.

    For every cluster present in ``clusters`` the mean weight of each term in
    the sparse matrix ``data`` is computed and the terms of ``labels`` are
    taken as keywords. A word of ``line`` is kept (once per cluster) as soon
    as its normalised Levenshtein similarity to any keyword exceeds 0.1.

    NOTE: the ``n_terms`` argument is overridden with ``len(labels)`` as in the
    original experiment, so every label acts as a keyword. ``column`` is unused.

    Returns the kept words joined by single spaces.
    """
    cluster_means = pd.DataFrame(data.todense()).groupby(clusters).mean()
    n_terms = len(labels)
    words = line.strip().split()
    selected_text = []
    for _, mean_weights in cluster_means.iterrows():
        # Iterate over whole keywords; joining them into one string first
        # would make the comparison run against single characters.
        key_words = {labels[t].strip() for t in np.argsort(mean_weights)[-n_terms:]}
        for word in words:
            for kw in key_words:
                longest = max(len(word), len(kw))
                if longest == 0:
                    continue
                similarity = 1.0 - levenshtein_distance(word, kw) / longest
                if similarity > 0.1:
                    selected_text.append(word)
                    break
    return " ".join(selected_text)

            
def jaccard(str1, str2):
    """Return the Jaccard similarity of the whitespace-separated words of two strings.

    The result is |A ∩ B| / |A ∪ B| over the two word sets. It is 0.0 when
    either argument is empty/None or contains no words at all.
    """
    if not (str1 and str2):
        return 0.0
    a = set(str1.strip().split())
    b = set(str2.strip().split())
    union_size = len(a | b)
    if union_size == 0:
        # Whitespace-only inputs yield two empty sets; avoid dividing by zero.
        return 0.0
    return float(len(a & b)) / union_size



# the RAKE method

In [None]:
# RAKE( ' '.join(list( train.text.fillna(' ').values) ) ,stopwordpattern)

#RAKE(train.text[1],stopwordpattern)
# Walk through the RAKE steps for one training tweet (row label 1), printing
# every intermediate result. The statements sit at top level: an indented
# first statement is an IndentationError.

# Split text into sentences
sentenceList = split_sentences(train.text[1])

# generate candidate keywords
phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
print('wordgroups',phraseList)

# calculate individual word scores
wordscores = calculate_word_scores(phraseList)
print('wordscore',wordscores)

# generate candidate keyword scores
keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
print ('keyword',keywordcandidates)

# rank the candidates by descending score
sortedKeywords = sorted(keywordcandidates.items(), key=operator.itemgetter(1), reverse=True)
print('sort keyword',sortedKeywords)

totalKeywords = len(sortedKeywords)
print('nr keywords',totalKeywords)
print ('top 30%',sortedKeywords[0:(int(totalKeywords / 3) )])


In [None]:
# First five training rows (displayed only if this is the cell's last expression).
train[:5]

# Cosine similarity between the embedding of the bigram 'sooo sad' and v123.
# NOTE(review): the last visible assignment before this cell makes v123 a
# DataFrame of similarities plus a 'label' column rather than the three
# sentiment-word vectors — confirm which v123 is intended here.
cosine_similarity(v.T[vectorizer.vocabulary_['sooo sad']].reshape(1,-1),v123)

In [None]:
# Embeddings of the three sentiment words, one per row: neutral, positive, negative.
v123=np.concatenate((v.T[vectorizer.vocabulary_['neutral']],v.T[vectorizer.vocabulary_['positive']],v.T[vectorizer.vocabulary_['negative']])).reshape(3,-1)

for ti in range(1,10):
    # RAKE keyword candidates of this training tweet
    sentenceList = split_sentences(train.text[ti])
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
    wordscores = calculate_word_scores(phraseList)
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    print(train.selected_text[ti:ti+1].values)
    costemp=[]
    wordtemp=[]
    # Select the candidate whose embedding is most similar to the neutral,
    # positive or negative word.
    for xi in keywordcandidates:
        # Candidates missing from the vocabulary have no embedding. Skipping
        # them before either list is touched keeps the two lists aligned.
        if xi not in vectorizer.vocabulary_:
            continue
        costemp.append( np.max( cosine_similarity(v.T[vectorizer.vocabulary_[xi]].reshape(1,-1),v123)  )  )
        wordtemp.append(xi)
    if costemp:
        print(wordtemp[np.argmax(np.array(costemp))] )
    else:
        # No candidate could be scored: fall back to the whole tweet.
        print(train.text[ti])
#tfidfsp

In [None]:
# Embeddings of the three sentiment words, one per row: neutral, positive, negative.
v123=np.concatenate((v.T[vectorizer.vocabulary_['neutral']],v.T[vectorizer.vocabulary_['positive']],v.T[vectorizer.vocabulary_['negative']])).reshape(3,-1)
subm['selected_text']=' '
# Start at row 0: starting at 2 left the first two predictions as ' '.
for ti in range(len(test)):
    # RAKE keyword candidates of this test tweet
    sentenceList = split_sentences(test.text[ti])
    phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
    wordscores = calculate_word_scores(phraseList)
    keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
    costemp=[]
    wordtemp=[]
    # Select the candidate whose embedding is most similar to the neutral,
    # positive or negative word.
    for xi in keywordcandidates:
        # Candidates missing from the vocabulary have no embedding.
        if xi not in vectorizer.vocabulary_:
            continue
        costemp.append( np.max( cosine_similarity(v.T[vectorizer.vocabulary_[xi]].reshape(1,-1),v123)  )  )
        wordtemp.append(xi)
    if costemp:
        subm.iat[ti,1]=wordtemp[np.argmax(np.array(costemp))]
    else:
        # No candidate could be scored: fall back to the whole tweet.
        subm.iat[ti,1]=test.text[ti]
#tfidfsp

In [None]:
# Write the RAKE-based predictions.
subm.to_csv("submission.csv",index=False)

from sklearn.metrics.pairwise import cosine_similarity

# Embeddings of the three sentiment words, one per row: neutral, positive, negative.
v123=np.concatenate((v.T[vectorizer.vocabulary_['neutral']],v.T[vectorizer.vocabulary_['positive']],v.T[vectorizer.vocabulary_['negative']])).reshape(3,-1)
# Cosine similarity of every vocabulary term to each sentiment word (columns 0, 1, 2).
v123=pd.DataFrame(cosine_similarity(v.T,v123))

# Label a term 1/2/3 when its similarity to neutral/positive/negative exceeds
# 0.335, else 0. Columns are handled in ascending order so that, when several
# exceed the threshold, the last one wins.
v123['label']=0
for yi in range(3):
    v123.loc[v123[yi]>0.335,'label']=yi+1

labels=v123.label.values


# Score the k-means keyword extraction on the training set with the Jaccard index.
# Rows are collected in a list and turned into a DataFrame once: appending
# to a DataFrame per row copies it every time, and DataFrame.append was
# removed in pandas 2.0.
score_rows = []
count = 1
max_count = len(train)
print_every = 1000
gp = train.groupby('sentiment')
# The vectorizer is not refit below, so its feature names are fetched once.
# get_feature_names() is gone from recent scikit-learn releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
for name, group in gp:
    for query,selected_text in zip(group.text,group.selected_text):
        text = tfidf.transform([query])
        cluster = kmeans_collection[name.lower()].predict(text)
        result = get_keywords(selected_text,text,cluster,feature_names, 10,name)
        js = jaccard(selected_text,result)
        score_rows.append({'sentiment':name,'text':query, 'selected_text':selected_text, 'result':result, 'jaccard_score':js})
        if (count % print_every == 0):
            print("Train Processed:",count)
        count = count + 1
        if max_count < count:
            break
scores = pd.DataFrame(score_rows, columns = ["sentiment","text","selected_text","result","jaccard_score"])
plt.figure()
scores.sort_values(by=['jaccard_score'],inplace=True,ascending=True)
scores["jaccard_score"].plot.kde()
plt.hist(scores["jaccard_score"], color = 'blue', edgecolor = 'black')
plt.show()
print(scores["jaccard_score"].mean())

# Predict selected_text for every test tweet with the per-sentiment k-means
# models and write the result to "submission.csv".
sample_submission = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv',encoding='utf-8')
sample_submission["selected_text"] = sample_submission["selected_text"].astype(str)
# The vectorizer is not refit below, so its feature names are fetched once.
# get_feature_names() is gone from recent scikit-learn releases.
feature_names = list(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
for index in range(len(test)):
    row = test.iloc[index]
    sentiment = row['sentiment'].lower()
    text = tfidf.transform([row['text']])
    cluster = kmeans_collection[sentiment].predict(text)
    # Pass this row's sentiment instead of `name`, a variable left over from
    # an earlier loop.
    result = get_keywords(row['text'],text,cluster,feature_names, 10,sentiment)
    sample_submission.at[index,'selected_text'] = result
    if (index % print_every == 0):
        print("Result:" ,result)
        print("Test Processed:",index)

sample_submission.to_csv("submission.csv",index=False)