In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [3]:
# Load the corpus ('cleaned.csv') and the sentiment lexicon from disk.
clean_data = pd.read_csv('cleaned.csv')
sentiment = pd.read_csv('sentiment_dictionary.csv')

# Word -> sentiment coefficient lookup; for a repeated word the last row wins.
sentiment_dict = {
    word: coeff
    for word, coeff in zip(sentiment['words'].values, sentiment['sentiment_coeff'].values)
}

In [4]:
# Independent (deep) copy, so changes made to file_weight do not reach clean_data.
file_weight = clean_data.copy(deep=True)

In [6]:
# Tokenise on whitespace only; norm=None keeps the un-normalised tf * idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weight.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
features = pd.Series(
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Sparse document-term matrix with one row per entry of file_weight.text, in order.
transformed = tfidf.transform(file_weight.text)



In [7]:
def create_tfidf_dictionary(x, transformed, features):
    """Build a ``{word: tf-idf weight}`` mapping for a single document row.

    Parameters
    ----------
    x : pandas.Series
        One row of the source frame. Only ``x.name`` is read, and it is used
        as the row position inside ``transformed`` -- this assumes the frame's
        index labels coincide with row positions (e.g. a default RangeIndex).
    transformed : scipy sparse matrix
        Document-term matrix, one row per document.
    features : pandas.Series
        Vocabulary terms, positioned to match the columns of ``transformed``.

    Returns
    -------
    dict
        Maps every term that has a stored entry in the row to its weight.
    """
    vec_coo = transformed[x.name].tocoo()
    # Translate column indices to terms WITHOUT writing them back into
    # ``vec_coo.col``: that attribute is the matrix's integer index array, and
    # recent SciPy releases coerce anything assigned to it to the index dtype,
    # which fails for word strings.
    words = features.iloc[vec_coo.col].values
    return dict(zip(words, vec_coo.data))

def replace_tfidf_words(x, transformed, features):
    """Return the tf-idf weight of each whitespace-separated token of ``x.text``.

    Weights are looked up in the mapping built by ``create_tfidf_dictionary``
    for the same row and returned in token order. A token that is missing
    from that mapping raises ``KeyError``.
    """
    weights_by_word = create_tfidf_dictionary(x, transformed, features)
    return [weights_by_word[f'{token}'] for token in x.text.split()]

In [8]:
%%time
# Row-wise pass: every row is turned into the list of tf-idf weights of its tokens.
replaced_scores = file_weight.apply(replace_tfidf_words, axis=1, args=(transformed, features))

Wall time: 332 ms


In [9]:
# Preview the per-row lists of tf-idf weights (pandas truncates the long output).
replaced_scores

0       [4.655640812638202, 6.709764546333749, 7.22059...
1       [5.061105920746367, 3.5070181033954313, 6.9329...
2       [6.239760917088013, 6.709764546333749, 4.44800...
3       [7.220590170099739, 5.754253101306312, 7.22059...
4       [6.9329080976479585, 6.121977881431629, 3.2377...
                              ...                        
1503    [2.5988906821604374, 5.141148628419903, 6.9329...
1504    [6.239760917088013, 6.9329080976479585, 5.3234...
1505    [12.994453410802187, 7.220590170099739, 9.8336...
1506    [5.8342958089798485, 4.951906628781375, 15.252...
1507    [3.5070181033954313, 7.220590170099739, 5.9213...
Length: 1508, dtype: object

In [10]:
def replace_sentiment_words(word, sent_dict):
    """Look up the sentiment coefficient of ``word`` in ``sent_dict``.

    Returns the stored value when the word is present and ``0`` when the
    lookup raises ``KeyError``.
    """
    try:
        return sent_dict[word]
    except KeyError:
        # Word is not in the lexicon: it contributes nothing to the score.
        return 0

In [12]:
# For every sentence, swap each whitespace-separated token for its sentiment
# coefficient from sentiment_dict (0 when the token is not in the lexicon).
replaced_closeness_score = file_weight.text.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

In [13]:
# One row per sentence: token sentiment coefficients, token tf-idf weights,
# the sentence itself and its label. All four inputs derive from file_weight.
replace_df = pd.concat(
    [replaced_closeness_score, replaced_scores, file_weight.text, file_weight.sentiment],
    axis=1,
    keys=['sentiment_coeff', 'tfidf_score', 'sentence', 'sentiment_score'],
)

# Sentence score = dot product of the coefficient vector and the tf-idf vector.
replace_df['sentiment_rate'] = [
    np.array(coeffs) @ np.array(weights)
    for coeffs, weights in zip(replace_df['sentiment_coeff'], replace_df['tfidf_score'])
]

# A strictly positive score is predicted as class 1, everything else as class 0.
replace_df['predict'] = replace_df['sentiment_rate'].gt(0).astype('int8')

# Binarise the labels: exactly 1 stays 1, any other value becomes 0.
# NOTE(review): the recorded confusion matrix shows no positive ground-truth
# rows after this step -- confirm how positives are encoded in cleaned.csv.
replace_df['sentiment_score'] = replace_df['sentiment_score'].eq(1).astype('int64')

In [17]:
# Ground truth and predictions shared by every metric below.
predicted_classes = replace_df.predict
y_test = replace_df.sentiment_score

# labels=[0, 1] pins the matrix to 2x2 (rows: actual, columns: predicted)
# even if one of the classes never occurs in either vector.
conf_mat = pd.DataFrame(confusion_matrix(y_test, predicted_classes, labels=[0, 1]))
print('confusion matrix')
display(conf_mat)

# zero_division=0 states explicitly what to report when a metric is undefined
# (e.g. no positive samples): 0, without an undefined-metric warning.
test_scores = (
    accuracy_score(y_test, predicted_classes),
    precision_score(y_test, predicted_classes, zero_division=0),
    recall_score(y_test, predicted_classes, zero_division=0),
    f1_score(y_test, predicted_classes, zero_division=0),
)
print('\n\nScores')
# One labelled row per metric in a single 'scores' column.
scores = pd.DataFrame(
    data=list(test_scores),
    index=['accuracy', 'precision', 'recall', 'f1_score'],
    columns=['scores'],
)
display(scores)

confusion matrix


Unnamed: 0,0,1
0,1505,3
1,0,0




Scores


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,scores
accuracy,0.998011
precision,0.0
recall,0.0
f1_score,0.0
