In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [2]:
# Inputs: the cleaned corpus and a table with `words` / `sentiment_coeff` columns.
clean_data = pd.read_csv('cleaned.csv')
sentiment = pd.read_csv('sentiment_dictionary.csv')

# Word -> coefficient lookup; if a word is listed twice the later row wins.
sentiment_dict = dict(
    zip(sentiment['words'].values, sentiment['sentiment_coeff'].values)
)

In [3]:
# Work on an independent (deep) copy so `clean_data` stays as loaded from disk.
file_weight = clean_data.copy(deep=True)

In [4]:
# Whitespace-only tokenisation, so the vocabulary consists of exactly the tokens
# that `text.split()` produces when per-word weights are looked up afterwards.
#   * token_pattern=None - the default regex is ignored once a tokenizer is
#     supplied, and leaving it set makes scikit-learn emit a warning.
#   * lowercase=False    - the later lookup uses the raw tokens of each text, so
#     the vectorizer must not lower-case them behind our back (no effect on
#     text that is already lower-case).
#   * norm=None          - keep raw tf * idf weights, no per-document scaling.
tfidf = TfidfVectorizer(
    tokenizer=str.split,
    token_pattern=None,
    lowercase=False,
    norm=None,
)

# Learn the vocabulary / idf weights and build the document-term matrix in one pass.
transformed = tfidf.fit_transform(file_weight.text)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old accessor on releases that predate it.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# Position j of `features` names column j of `transformed`.
features = pd.Series(_get_feature_names())



In [5]:
def create_tfidf_dictionary(x, transformed, features):
    """Map each word of one document to its TF-IDF weight.

    Parameters
    ----------
    x : pandas.Series
        One row of the corpus frame. Only ``x.name`` is read; it is used as
        the row number inside ``transformed``.
    transformed : scipy sparse matrix
        Document-term matrix; must support row indexing and ``.tocoo()``.
    features : pandas.Series
        Vocabulary, ordered so that position ``j`` names column ``j`` of
        ``transformed``.

    Returns
    -------
    dict
        ``{word: weight}`` for every entry stored in the selected row.
    """
    # NOTE: x.name is the row's index *label*. It only coincides with the
    # matrix row number when the frame carries a default 0..n-1 index.
    row = transformed[x.name].tocoo()

    # Translate column numbers into words WITHOUT writing them back into the
    # sparse object: `.col` is an integer index array, and storing string
    # labels there relies on the sparse class accepting arbitrary arrays,
    # which is not guaranteed across SciPy releases.
    words = features.iloc[row.col].values
    return dict(zip(words, row.data))

def replace_tfidf_words(x, transformed, features):
    """Return the TF-IDF weight of every token of ``x.text``, in sentence order.

    The per-document word -> weight mapping comes from
    ``create_tfidf_dictionary``; the text is split on whitespace and each
    token is replaced by its weight. A token that is missing from the
    mapping raises ``KeyError``.
    """
    weight_by_word = create_tfidf_dictionary(x, transformed, features)
    return [weight_by_word[token] for token in x.text.split()]

In [6]:
%%time
replaced_scores = file_weight.apply(lambda x: replace_tfidf_words(x, transformed, features), axis=1)

CPU times: user 428 ms, sys: 8.8 ms, total: 437 ms
Wall time: 456 ms


In [7]:
# Preview the per-document lists of TF-IDF weights (pandas truncates the repr).
replaced_scores

0       [6.803627809625917, 4.318721159837917, 5.09887...
1       [5.887337077751762, 5.417333448506026, 13.2284...
2       [6.580484258311707, 6.110480629065972, 2.49282...
3       [6.580484258311707, 6.110480629065972, 2.49282...
4       [6.580484258311707, 4.298101872635181, 3.41923...
                              ...                        
1320    [4.501042716631872, 14.182619764155396, 5.3567...
1321    [4.501042716631872, 3.307120248159437, 4.61319...
1322    [4.501042716631872, 3.307120248159437, 2.30659...
1323    [11.101729682261098, 7.4967749901858625, 11.10...
1324    [5.992697593409589, 6.110480629065972, 7.49677...
Length: 1325, dtype: object

In [8]:
def replace_sentiment_words(word, sent_dict):
    """Look up the sentiment coefficient of ``word``.

    Returns ``sent_dict[word]`` when the lookup succeeds and ``0`` when it
    raises ``KeyError``, so unknown words contribute nothing to a score.
    """
    try:
        return sent_dict[word]
    except KeyError:
        return 0

In [9]:
# For every sentence, swap each whitespace-separated token for its sentiment
# coefficient (0 for words missing from the dictionary).
replaced_closeness_score = file_weight.text.apply(
    lambda sentence: [
        replace_sentiment_words(token, sentiment_dict)
        for token in sentence.split()
    ]
)

In [10]:
# Stack the four per-document series as rows, then transpose so that each one
# becomes a column of the working frame.
replace_df = pd.DataFrame(
    data=[
        replaced_closeness_score,
        replaced_scores,
        file_weight.text,
        file_weight.sentiment,
    ]
).T
replace_df.columns = ['sentiment_coeff', 'tfidf_score', 'sentence', 'sentiment_score']

# Document score = dot product of sentiment coefficients and TF-IDF weights.
replace_df['sentiment_rate'] = replace_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_score']),
    axis=1,
)

# Strictly positive score -> class 1, everything else -> class 0.
replace_df['predict'] = replace_df['sentiment_rate'].gt(0).astype('int8')

# Binarise the reference labels: 1 stays 1, any other value becomes 0.
replace_df['sentiment_score'] = [
    int(label == 1) for label in replace_df['sentiment_score']
]

In [11]:
# Predictions and binarised reference labels, taken from the working frame.
predicted_classes = replace_df['predict']
y_test = replace_df['sentiment_score']

# Rows of the matrix are true classes, columns are predicted classes.
conf_mat = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('confusion matrix')
display(conf_mat)

# NOTE(review): the recorded run shows accuracy of about 0.19 with recall of
# about 1.0, i.e. nearly every document is predicted as class 1 - worth
# checking the sign convention of `sentiment_coeff` against the labels.
test_scores = (
    accuracy_score(y_test, predicted_classes),
    precision_score(y_test, predicted_classes),
    recall_score(y_test, predicted_classes),
    f1_score(y_test, predicted_classes),
)
print('\n\nScores')

# One row per metric, single `scores` column.
scores = pd.DataFrame(
    {'scores': list(test_scores)},
    index=['accuracy', 'precision', 'recall', 'f1_score'],
)
display(scores)

confusion matrix


Unnamed: 0,0,1
0,9,1074
1,1,241




Scores


Unnamed: 0,scores
accuracy,0.188679
precision,0.18327
recall,0.995868
f1_score,0.30957
