In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [25]:
# Read the two CSV inputs: the table holding the texts (and their labels) and
# the table of words with their sentiment coefficients.
clean_data = pd.read_csv('cleaned.csv')
sentiment = pd.read_csv('sentiment_dictionary.csv')

# word -> sentiment coefficient lookup, used later to score individual tokens.
sentiment_dict = dict(
    zip(
        sentiment['words'].to_numpy(),
        sentiment['sentiment_coeff'].to_numpy(),
    )
)

In [26]:
# Work on an independent (deep) copy so the frame read from disk is preserved.
file_weight = clean_data.copy(deep=True)

In [27]:
# Texts are tokenised on whitespace; norm=None keeps the raw tf-idf weights
# instead of rescaling each document vector.
# token_pattern=None only silences scikit-learn's "token_pattern will not be
# used" warning, which is emitted whenever a custom tokenizer is supplied.
# NOTE(review): TfidfVectorizer lower-cases its input by default, whereas
# replace_tfidf_words looks tokens up without lower-casing them - presumably
# the `text` column is already lower-case; confirm.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None, token_pattern=None)
transformed = tfidf.fit_transform(file_weight.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())



In [28]:
def create_tfidf_dictionary(x, transformed, features):
    """Map every term present in one document to its tf-idf weight.

    Parameters
    ----------
    x : object with a ``name`` attribute
        ``x.name`` is used to select the row of ``transformed``. When called
        through ``DataFrame.apply(..., axis=1)`` this is the row's index
        label, so a default 0..n-1 index is assumed (label == row position).
    transformed : scipy sparse matrix
        Document-term matrix; row ``x.name`` is the document of interest.
    features : pandas.Series
        Term for each column of ``transformed`` (looked up positionally).

    Returns
    -------
    dict
        ``{term: weight}`` for the explicitly stored entries of that row.
        An all-zero row yields an empty dict.
    """
    row = transformed[x.name].tocoo()
    # Translate column positions to terms in a separate array: the strings are
    # deliberately NOT written back into ``row.col``, which is the sparse
    # matrix's integer index array.
    terms = features.iloc[row.col].values
    return dict(zip(terms, row.data))

def replace_tfidf_words(x, transformed, features):
    """Replace each whitespace-separated token of ``x.text`` by its tf-idf weight.

    The weights come from ``create_tfidf_dictionary`` for the same row, so the
    returned list has one entry per token, in token order. A token missing
    from that dictionary raises ``KeyError``.
    """
    weight_by_term = create_tfidf_dictionary(x, transformed, features)
    tokens = x.text.split()
    return [weight_by_term[token] for token in tokens]

In [29]:
%%time
replaced_scores = file_weight.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

Wall time: 400 ms


In [30]:
# Inspect the result: one list of tf-idf weights per row of file_weight.
replaced_scores

0       [4.747148362237912, 6.22305488204749, 7.203884...
1       [5.3067641501733345, 6.22305488204749, 4.09780...
2       [6.22305488204749, 6.916202062607435, 4.410676...
3       [7.203884135059216, 5.904601150928955, 7.20388...
4       [6.22305488204749, 6.22305488204749, 5.3580574...
                              ...                        
1478    [3.9717630834409947, 4.175362038682234, 5.2579...
1479    [8.947710054476461, 7.60934924316738, 4.836760...
1480    [6.916202062607435, 5.006659557722997, 3.38984...
1481    [4.970291913552122, 7.203884135059216, 6.35658...
1482    [3.859845167237009, 5.3067641501733345, 5.2579...
Length: 1483, dtype: object

In [31]:
def replace_sentiment_words(word, sent_dict):
    """Look up the sentiment coefficient of ``word``.

    Returns ``sent_dict[word]`` when the word is present; a word that raises
    ``KeyError`` on lookup is treated as neutral and scored ``0``.
    """
    try:
        return sent_dict[word]
    except KeyError:
        # Out-of-lexicon word: contributes nothing to the sentence score.
        return 0

In [32]:
# For every text, swap each whitespace-separated token for its sentiment
# coefficient (0 for tokens missing from sentiment_dict).
replaced_closeness_score = file_weight.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in sentence.split()
    ]
)

In [33]:
# Assemble the per-sentence table column by column. Building it from a dict
# avoids the row-wise construction followed by a transpose, which coerced
# every column to object dtype.
replace_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_score,
    'tfidf_score': replaced_scores,
    'sentence': file_weight.text,
    'sentiment_score': file_weight.sentiment,
})

# Sentence score = dot product of the per-token sentiment coefficients with
# the per-token tf-idf weights (both lists have one entry per token).
replace_df['sentiment_rate'] = [
    np.dot(coeffs, weights)
    for coeffs, weights in zip(replace_df['sentiment_coeff'], replace_df['tfidf_score'])
]

# Strictly positive score -> predicted class 1, anything else -> class 0.
replace_df['predict'] = (replace_df['sentiment_rate'] > 0).astype('int8')

# Binarise the labels: a value equal to 1 stays 1, every other value becomes 0.
replace_df['sentiment_score'] = replace_df['sentiment_score'].eq(1).astype('int64')

In [35]:
# Ground-truth labels and the lexicon-based predictions being compared.
y_test = replace_df.sentiment_score
predicted_classes = replace_df.predict

# Confusion matrix as returned by sklearn's confusion_matrix(y_true, y_pred).
conf_mat = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('confusion matrix')
display(conf_mat)

# Summary metrics, computed in a fixed order and shown as a one-column table.
metric_fns = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}
test_scores = tuple(metric(y_test, predicted_classes) for metric in metric_fns.values())
print('\n\nScores')
scores = pd.DataFrame({'scores': list(test_scores)}, index=list(metric_fns))
display(scores)

confusion matrix


Unnamed: 0,0,1
0,21,1211
1,3,248




Scores


Unnamed: 0,scores
accuracy,0.181389
precision,0.169979
recall,0.988048
f1_score,0.290058
