In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [2]:
# Load the saved Word2Vec model and keep only its keyed word vectors.
word_vectors = (
    Word2Vec
    .load("word2vec.model")
    .wv
)


In [3]:
# Partition the embedding space into two clusters; the two centroids are later
# used as the poles of the sentiment lexicon.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,  # explicit integer seed (a bare `True` only works because bool is an int subclass)
    n_init=50,
).fit(X=word_vectors.vectors)


In [4]:
# Inspect the 11 vocabulary entries closest to the first K-Means centroid.
word_vectors.similar_by_vector(
    model.cluster_centers_[0],
    topn=11,
    restrict_vocab=None,
)

[('thei', 0.7631075382232666),
 ('gammon', 0.729962944984436),
 ('decent', 0.7257193922996521),
 ('mostly', 0.7242159843444824),
 ('failing', 0.721838653087616),
 ('th', 0.7167788743972778),
 ('nah', 0.7133491039276123),
 ('peo', 0.7103697061538696),
 ('benefit', 0.7052839994430542),
 ('blackli', 0.7034202814102173),
 ('abou', 0.7015049457550049)]

In [5]:
# Centroid 0 is labelled "positive" and centroid 1 "negative".
# NOTE(review): K-Means cluster ids carry no inherent polarity - confirm this
# assignment against the nearest-word listing for each centroid.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

In [6]:
# One row per vocabulary word: the word, its embedding and its K-Means cluster id.
# `key_to_index` is used when the KeyedVectors object provides it, otherwise the
# older `vocab` mapping; either way iteration yields the vocabulary words.
vocab_words = list(getattr(word_vectors, 'key_to_index', None) or word_vectors.vocab)
words = pd.DataFrame({'words': vocab_words})
# Index the KeyedVectors object directly rather than through its deprecated `.wv` alias.
words['vectors'] = words.words.apply(lambda word: word_vectors[word])
# Predict every word in one call instead of one `predict` call per row.
words['cluster'] = model.predict(np.stack(words.vectors.values))

  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Sign of the score: +1 for words in cluster 0, -1 for every other cluster.
words['cluster_value'] = words.cluster.map(lambda label: 1 if label == 0 else -1)
# Magnitude of the score: inverse of the distance to the nearest centroid.
words['closeness_score'] = words.vectors.apply(
    lambda vec: 1 / (model.transform([vec]).min())
)
# Signed sentiment coefficient = magnitude * sign.
words['sentiment_coeff'] = words.closeness_score.mul(words.cluster_value)

In [8]:
# Preview the first ten lexicon rows.
words.head(n=10)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,alllivesmatter,"[0.03352602, -0.043020952, -0.00232427, 0.0180...",0,1,1.054813,1.054813
1,look_like,"[-0.034277424, 0.018904887, 0.07060962, 0.0342...",0,1,1.130751,1.130751
2,attendant,"[-0.054576255, 0.07015803, -0.03585131, 0.0331...",1,-1,1.140103,-1.140103
3,realdonaldtrump,"[0.018800395, -0.10510669, 0.10309875, 0.01638...",0,1,1.096741,1.096741
4,graduate,"[0.05906425, 0.023874968, -0.005894852, 0.0327...",0,1,1.079865,1.079865
5,but,"[0.04050105, -0.044139367, 0.015760371, -0.001...",0,1,1.27164,1.27164
6,deranged,"[-0.027293492, -0.017787265, 0.09804127, 0.027...",0,1,1.170161,1.170161
7,ignorant,"[-0.024534395, -0.008489642, 0.00914253, 0.104...",0,1,1.112257,1.112257
8,incompetent,"[0.001806536, -0.037414446, 0.04319391, 0.0638...",0,1,1.098314,1.098314
9,r,"[0.017156366, 0.020579617, -0.026792856, 0.110...",0,1,1.028735,1.028735


In [9]:
# Persist the word -> sentiment coefficient lexicon (no index column).
words.to_csv(
    'sentiment_dictionary.csv',
    columns=['words', 'sentiment_coeff'],
    index=False,
)

ASSIGNING CLUSTERS

In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [11]:
# Load the cleaned tweet dataset that will be scored.
final_file = pd.read_csv(filepath_or_buffer='cleaned_datasets.csv')
# Disabled alternative input:
# tweet_file = pd.read_csv('USelectionw2v.csv')

In [12]:
# Read the lexicon back and turn it into a {word: sentiment coefficient} lookup.
# Default NA parsing is switched off and `words` is forced to str so that
# vocabulary tokens such as "null", "nan" or "na" stay as strings instead of
# being converted to float NaN keys.
sentiment_map = pd.read_csv(
    'sentiment_dictionary.csv',
    dtype={'words': str},
    keep_default_na=False,
)
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [13]:
# Work on an independent copy so the loaded dataset itself is left untouched.
file_weighting = final_file.copy(deep=True)


In [14]:
# Whitespace-tokenised tf-idf over every sentence; norm=None keeps raw tf-idf
# weights rather than normalised row vectors.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
transformed = tfidf.fit_transform(file_weighting.full_text)
# Column-position -> word lookup. Prefer `get_feature_names_out` when the
# vectorizer has it and fall back to `get_feature_names` otherwise, so the cell
# works whichever of the two methods the installed scikit-learn provides.
features = pd.Series(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

In [15]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    Build a {word: tf-idf score} dictionary for a single input sentence x.

    Inspired by a function from this article:
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of the dataframe; its `.name` (index label) is used as the row
        position in `transformed_file`, so the dataframe index has to line up
        with the matrix rows (e.g. a default RangeIndex)
    transformed_file - all sentences transformed with TfidfVectorizer
        (sparse matrix, one row per sentence)
    features - pandas Series of the words used in TfidfVectorizer, ordered by
        matrix column position

    Returns a dict mapping every word stored in that matrix row to its score.
    '''
    row_coo = transformed_file[x.name].tocoo()
    # Translate column indices to words without overwriting the sparse matrix's
    # own `col` attribute, which is meant to hold integer indices.
    row_words = features.iloc[row_coo.col].values
    return dict(zip(row_words, row_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    Replace every word of a sentence with its tf-idf score.

    x - row of dataframe, containing the sentence (`full_text`) and its index
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    Returns a list of scores, one per whitespace-separated word of
    `x.full_text`, in sentence order. A word missing from the row's tf-idf
    dictionary raises KeyError.
    '''
    scores_by_word = create_tfidf_dictionary(x, transformed_file, features)
    return [scores_by_word[f'{token}'] for token in x.full_text.split()]

In [16]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

Wall time: 7.34 s


In [17]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    Look up the sentiment score of `word` in `sentiment_dict`.

    Returns the stored score, or 0 when the word has no entry.
    '''
    return sentiment_dict.get(word, 0)

In [18]:
# Per-sentence list of sentiment coefficients, one entry per word
# (words without a lexicon entry contribute 0).
replaced_closeness_scores = file_weighting.full_text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in sentence.split()
    ]
)

In [None]:
# Variant of the scoring table that also carries a `title` sentence column and
# a `rate` label column.
# NOTE(review): this cell has no execution count and its `replacement_df` is
# rebuilt by the following cell - confirm whether it is still needed. It only
# runs when the dataset actually provides both columns, so a fresh
# Restart & Run All does not stop here on a missing column.
if {'title', 'rate'}.issubset(file_weighting.columns):
    replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting['title'], file_weighting['rate']]).T
    replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
    # Sentence score = dot product of per-word sentiment coefficients and tf-idf weights.
    replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
    replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')

In [19]:
# Assemble the scoring table: per-word sentiment coefficients, per-word tf-idf
# weights and the cleaned sentence they were computed from.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': file_weighting.full_text,
})
# Sentence score = dot product of the two per-word lists.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)
# 1 when the score is strictly positive, 0 otherwise.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')
# Carry the original tweet text and hashtags along for the export.
replacement_df['Tweet'] = final_file['Tweet']
replacement_df['Hashtags'] = final_file['hashtags']

In [21]:
# Final label: -1 when the sentence score is strictly positive, +1 otherwise.
# Deriving it from `sentiment_rate` (instead of remapping the 0/1 flag already
# stored in `prediction`) gives the same labels on the first run and keeps the
# cell idempotent - re-running it no longer collapses every label to -1.
# NOTE(review): a positive score ends up labelled -1, i.e. the opposite sign of
# the score itself - confirm this inversion is intended.
replacement_df['prediction'] = np.where(replacement_df['sentiment_rate'] > 0, -1, 1)


In [22]:
# Preview the first hundred scored tweets.
replacement_df.head(n=100)

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment_rate,prediction,Tweet,Hashtags
0,"[0, 1.0548129096757086]","[10.77741399528076, 3.840099914057079]",traceybra alllivesmatter,4.050587,-1,@traceybra @TheLondonHughes #AllLivesMatter,alllivesmatter
1,"[1.1307508195068914, 0, -1.1401028216133509]","[6.010975661696547, 9.273336598504486, 9.07266...",look_like easyjet_flight attendant,-3.546856,1,RT @beardedfinance: You look like an EasyJet f...,
2,"[0, 1.0548129096757086, 1.0548129096757086, 1....","[10.77741399528076, 26.880699398399553, 26.880...",lieve alllivesmatter alllivesmatter alllivesma...,198.478761,-1,RT @EWindt: Lieve @NUnl #AllLivesMatter #AllL...,"alllivesmatter,alllivesmatter,alllivesmatter,a..."
3,"[1.0967411557102231, 0, 1.0798650595378538, 1....","[3.6790383566899734, 9.16797608284666, 8.13835...",realdonaldtrump congrats graduate but embarras...,52.686019,-1,@realDonaldTrump Congrats to the graduates. Bu...,
4,"[-1.1048555385771512, 1.1587823894226137, 1.15...","[8.580189417944542, 5.46174799039811, 5.309353...",secretsbedard_thezogbypoll country want law_or...,34.289739,-1,RT @deplorablelori: @SecretsBedard @realDonald...,
...,...,...,...,...,...,...,...
95,"[1.0967411557102231, 1.1338393406851608, 1.122...","[3.6790383566899734, 7.558538170412559, 8.5261...",realdonaldtrump black_community need_stand pre...,47.428534,-1,RT @LittleMike1977: @realDonaldTrump “The Blac...,
96,"[1.1209469991455219, 0, 0, 1.004533223931562, ...","[12.01345874163019, 9.524651026785392, 19.0493...",cop many_victim buffalo v orlando 13 people ki...,28.343042,-1,Cops have too many victims. Buffalo vs. Orland...,
97,"[0, 0, 1.1384337409032284, 1.2052243082191785,...","[10.084266814720815, 9.16797608284666, 6.55790...",hackneyabbott deluded idiot ignoring video blm...,62.480874,-1,@HackneyAbbott You deluded idiot. You're ignor...,
98,"[0, 1.1192989212489453, 0, 1.2009581038748272,...","[10.371948887172596, 8.835680253216765, 9.1679...",berniebros amp alt left fucking w/ amp hard wo...,67.297141,-1,Why is @AOC #BernieBros &amp; the alt-left fu...,berniebros


In [23]:
# Export the tweet text, hashtags and predicted label (no index column).
replacement_df.to_csv(
    'final_prediction.csv',
    columns=['Tweet', 'Hashtags', 'prediction'],
    index=False,
)