-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_final_analysis_data.py
41 lines (29 loc) · 1.75 KB
/
create_final_analysis_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
def main():
    """Create the final predictions dataset, preferring human labels over machine predictions.

    Reads the machine predictions for every review from
    'NA Reviews Data Sentiment.csv' and the human-labelled training rows from
    'training_data.csv', joins the training labels onto the predictions by
    'Id (Review)', and writes 'NA Reviews Data Best Sentiment.csv'.

    In the output file the 'Predicted Sentiment' column holds the training
    (ground truth) label for reviews that have one, and the machine
    prediction for every other review.
    """
    id_column = 'Id (Review)'

    # Machine predictions for entire dataset
    predictions = pd.read_csv('NA Reviews Data Sentiment.csv')
    # Training data with ground truth labels
    training = pd.read_csv('training_data.csv')
    # Only keep needed columns from training data
    training = training[[id_column, 'Sentiment']]

    # Join datasets; reviews without a training row get NaN in 'Sentiment'.
    # NOTE(review): a review id repeated in the training file would repeat
    # that review in the output -- confirm ids are unique there.
    joined = predictions.join(training.set_index(id_column), on=id_column)

    # Normalise training labels (surrounding whitespace, case) so they match
    # the upper-case format of the prediction labels.
    human_label = joined['Sentiment'].str.strip().str.upper()

    # Use the training label itself when available, and the machine
    # prediction otherwise. Taking the label directly (rather than flipping
    # the prediction when the two disagree) stays correct when a label is
    # outside POSITIVE/NEGATIVE or when the prediction is missing.
    joined['Best Sentiment'] = human_label.fillna(joined['Predicted Sentiment'])

    # Only include needed columns
    joined = joined[[id_column, 'User Id', 'name (vehicle)', 'Location Id',
                     'Created At (Review)', 'Connectors_site', 'Networks_site',
                     'Check-in Rating (alias)', 'Review', 'Best Sentiment']]
    joined = joined.rename(columns={'Best Sentiment': 'Predicted Sentiment'})
    joined.to_csv('NA Reviews Data Best Sentiment.csv', index=False)
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()