<a href="https://colab.research.google.com/github/samlawson1/news/blob/TENSORFLOW_MODEL/tensorflow_model_train/Label_Text_For_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install afinn

Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53430 sha256=27d6617267647f40d2e9370d2dd807e1496ace9f8b116fbb37c23d27a667e8e2
  Stored in directory: /root/.cache/pip/wheels/b0/05/90/43f79196199a138fb486902fceca30a2d1b5228e6d2db8eb90
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [2]:
import pandas as pd
from afinn import Afinn

In [6]:
#Using afinn package to label text for TensorFlow model
afinn = Afinn(language='en')
#Afinn df with keywords with a positive or negative value
afinn_wl_url = ('https://raw.githubusercontent.com'
                '/fnielsen/afinn/master/afinn/data/AFINN-111.txt')

afinn_wl_df = pd.read_csv(afinn_wl_url,
                          header=None,
                          sep='\t',
                          names=['term', 'value'])
print(len(afinn_wl_df))
afinn_wl_df.head()

2477


Unnamed: 0,term,value
0,abandon,-2
1,abandoned,-2
2,abandons,-2
3,abducted,-2
4,abduction,-2


Training File: [Million Headlines from Kaggle](https://www.kaggle.com/datasets/therohk/million-headlines?select=abcnews-date-text.csv)

In [11]:


training_file = 'abcnews-date-text.csv'

#process in chunks - csv file has over 1 Million Rows

data_scored = []
for chunk in pd.read_csv(training_file, chunksize = 50000):
  chunk['AfinnScore'] = chunk['headline_text'].apply(afinn.score)
  #Label the data - -1 = No Score, 1 = positive scores, 0 = negative scores, -1
  chunk['Label'] = [-1 if s == None else 1 if s > 0 else 0 for s in chunk['AfinnScore']]
  data_scored.append(chunk)

data_scored = pd.concat(data_scored)
data_scored.head()

Unnamed: 0,publish_date,headline_text,AfinnScore,Label
0,20030219,aba decides against community broadcasting lic...,0.0,0
1,20030219,act fire witnesses must be aware of defamation,-2.0,0
2,20030219,a g calls for infrastructure protection summit,0.0,0
3,20030219,air nz staff in aust strike for pay rise,-1.0,0
4,20030219,air nz strike to affect australian travellers,-1.0,0


In [14]:
#positive and negative scores
data_scored['Label'].value_counts()

Label
0    1025154
1     219030
Name: count, dtype: int64

In [16]:
#Get equal distribution of positive and negative labels
#200K total rows for train/test data

import random
pos_i = list(data_scored.loc[data_scored['Label'] == 1].index)
neg_i = list(data_scored.loc[data_scored['Label'] == 0].index)

#random shuffle

random.shuffle(pos_i)
random.shuffle(neg_i)

print(pos_i[:10])
print(neg_i[:10])

[600698, 25634, 296790, 979864, 564166, 1092598, 401535, 1038535, 960106, 807329]
[682520, 409441, 143141, 891481, 529440, 855147, 515179, 845321, 261876, 452067]


In [23]:
#Keep first 100K random indexes in each list
subset = 100000

keep_pos = pos_i[:subset]
keep_neg = neg_i[:subset]

#combine the 2 lists - 200K total
keepers = keep_pos + keep_neg
#sort
keepers = sorted(keepers)
print(len(keepers))
print(keepers[:10])

200000
[11, 13, 14, 16, 20, 28, 30, 32, 46, 49]


In [26]:
#filter dataset

train_test_data = data_scored.iloc[keepers]
train_test_data['Label'].value_counts()

Label
1    100000
0    100000
Name: count, dtype: int64

In [30]:
#Check dtypes
train_test_data.dtypes

publish_date       int64
headline_text     object
AfinnScore       float64
Label              int64
dtype: object

In [31]:
#Save to Google Drive
import os
my_drive_dir = r'drive/MyDrive'

data_folder = os.path.join(my_drive_dir, 'tf_text_analysis')
if not os.path.exists(data_folder):
  os.mkdir(data_folder)

file = 'train_test_data.csv'
file_out = os.path.join(data_folder, file)
train_test_data.to_csv(file_out, index = False)