# Prerequisites

- Python 3.10.4

> Warning: Installation from conda environment may take few minutes

Configuring conda environment

```cmd
conda activate ca2_env
conda install matplotlib
conda install pandas
conda install nltk
conda install scikit-learn
conda install jsonpickle
conda install -c conda-forge textblob
conda install -c conda-forge scrapy
conda install -c conda-forge pycountry
conda install -c conda-forge wordcloud
conda install -c conda-forge langdetect
pip install emoji
```

References:
https://towardsdatascience.com/step-by-step-twitter-sentiment-analysis-in-python-d6f650ade58d
CCT College; David Text Processing Class



In [1]:
# Import Libraries
from textblob import TextBlob
import pandas as pd
import numpy as np
import nltk
import re
import string
import nltk
import re
import emoji
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

# IFA Market and prices - Web scrapage

## Step 1: Configure spider
Configure spider `\scrapy\quotesbot\ifa-css.py` with the list of urls


```python
urls=[  'https://www.ifa.ie/markets-and-prices/grain-price-update-20th-may/',
        'https://www.ifa.ie/markets-and-prices/potato-market-update-18th-may/',
        'https://www.ifa.ie/markets-and-prices/pig-market-update-18th-may-2/',
        'https://www.ifa.ie/markets-and-prices/weekly-cattle-prices-18th-may/',
        ...
        'https://www.ifa.ie/markets-and-prices/grain-market-update-3rd-feb/',
        'https://www.ifa.ie/markets-and-prices/weekly-cattle-prices-3rd-feb/',
        'https://www.ifa.ie/markets-and-prices/potato-market-update-3rd-feb/']
```

## Step 2: Check css selectors.
The following selectors has been configured.

```python
    yield {
        'title': quote.css(".entry-title::text").extract_first(),
        'time': quote.css(".entry-date::text").extract_first(),
        'text': quote.css(".single-content > p::text").getall()
    }
```
## Step 3: Run scrapy
Run `scrapy crawl toscrape-css -o market-prices.json` to generate markets and prices articles. See full list of URLS below.



In [2]:
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmsry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rmsry\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stop = stopwords.words('english')
# Store the string.punctuation into an object punct
punct = string.punctuation

def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

In [4]:
market_price_df = pd.read_json("../data/ifa.ie/market-prices.json")
market_price_df.sample(5)

Unnamed: 0,title,time,text
43,Weekly Cattle Prices 9th March,9 March 2022,[Prices reported as quoted or paid to IFA Memb...
34,Weekly Cattle Prices 6th April,6 April 2022,[Prices reported as quoted or paid to IFA Memb...
163,Potato Market Update 3rd Feb,3 February 2021,[Household consumption and retail sales remain...
15,Pig Market Update 4th May,4 May 2022,[Irish pig price was unchanged this week with ...
124,Beef & Sheep Update 25th June,25 June 2021,[IFA Livestock Chairman Brendan Golden said be...


In [5]:
# Create a copy of original text
market_price_df['text'] = market_price_df['text'].apply(lambda x: str(x))
market_price_df['text_original'] = market_price_df['text']

# Title of article
market_price_df["title"] = market_price_df.title.astype(str)
market_price_df["tags"] = market_price_df.title.apply(lambda x: x.split(" Update ")[0][:15])

# Original Word count
market_price_df['word_count'] = market_price_df['text'].apply(lambda x: len(str(x).split(" ")))

# Average word length
market_price_df['avg_word'] = market_price_df['text'].apply(lambda x: avg_word(x))

# Create column with stop words on text
market_price_df['stopwords'] = market_price_df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
market_price_df[['text','stopwords']].head()

# Number of numers in text
market_price_df['numeric_values'] = market_price_df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
market_price_df[['text','numeric_values']].head()

# Highlight upper case words
market_price_df['upper'] = market_price_df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
market_price_df[['text','upper']].head()

# Convert text to lower
market_price_df['text'] = market_price_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
market_price_df['text'].head()

# Remove stop words
stop = stopwords.words('english')
market_price_df['text'] = market_price_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
market_price_df['text'].head()

# remove 10 most frequent words
freq = pd.Series(' '.join(market_price_df['text']).split()).value_counts()[:10]
freq = list(freq.index)
market_price_df['text'] = market_price_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
market_price_df['text'].head()

# remove 10 less frequet words
freq = pd.Series(' '.join(market_price_df['text']).split()).value_counts()[-10:]
freq = list(freq.index)
market_price_df['text'] = market_price_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
market_price_df['text'].head()

# Remove special characters
market_price_df = market_price_df.replace(r'[^A-Za-z0-9 ]+', '', regex=True)

# Get sentiment using TextBlob
market_price_df['sentiment'] = market_price_df['text'].apply(lambda x: TextBlob(x).sentiment[0] )
market_price_df.query("abs(sentiment) > 0.5 ")

Unnamed: 0,title,time,text,text_original,tags,word_count,avg_word,stopwords,numeric_values,upper,sentiment


# 2 Boards.ie CAP 

## Step 1: Configure spider
Configure spider `\scrapy\quotesbot\boards-css.py` with the list of urls. Discussion regarding CAP reforms since 2013.


```python
urls=[  'https://www.boards.ie/discussion/2058237577/new-cap-for-2023/p1',
        'https://www.boards.ie/discussion/2058214934/new-cap/p1',
        'https://www.boards.ie/discussion/2057938257/post-2020-cap/p1',
        'https://www.boards.ie/discussion/2057780562/cap-payments-reference-years-2000-2002/p1',
        'https://www.boards.ie/discussion/2057336158/the-new-cap/p1',
        'https://www.boards.ie/discussion/2057111876/cap-reform/p1',
        'https://www.boards.ie/discussion/2056998752/cap-2013/p1']
```

## Step 2: Check css selectors.
The following selectors has been configured.

```python
    yield {
            'title': response.css("#Item_0 > h1::text").extract_first(),
            'time_1': li.css("div.postbit-header::text").extract_first(),
            'time': li.css("a.Permalink::text").extract_first(),
            'text': li.css("div.Item-Body > div.Message > p::text").getall(),
            'text_1': li.css("div.Item-Body > div.Message::text").getall()
        }
```


## Step 3: Run scrapy
Run `scrapy crawl boards-css -o market-prices.json` to generate cap discussion json.

In [6]:
cap_df = pd.read_json("../data/boards.ie/boards-cap.json")
cap_df.head(5)

Unnamed: 0,title,time_1,time,text,text_1
0,Post 2020 CAP,\n15-12-2018 1:18pm,,[],[\nJust wondering what the room thinks of the ...
1,Post 2020 CAP,\n,15-12-2018 1:34pm,"[\n, \n, \n]","[\n, \n, \nThe entitlements belong to your mot..."
2,Post 2020 CAP,\n,15-12-2018 1:37pm,[],[\nIt's a ridiculous scheme anyway. Pure lazin...
3,Post 2020 CAP,\n,15-12-2018 3:19pm,"[\n, \n, \n]","[\n, \n, \nCompletely agree ( though instinct ..."
4,Cap 2013,\n23-07-2013 5:29pm,,[],[\nA quote from Mairead mcguinness on the perm...


In [7]:
# merge time fields
cap_df["time"] = cap_df.apply(lambda x: x.time if x.time != None and str(x.time) > str(x.time_1) else str(x.time_1).replace("\\n",""), axis=1 )
cap_df["time"] = cap_df.time.astype(np.datetime64)
cap_df.drop("time_1",inplace=True, axis=1)

In [8]:
cap_df["text"] = cap_df.apply(lambda x: str(x.text).replace("\\n","") if len(str(x.text)) > len(str(x.text_1)) else str(x.text_1).replace("\\n",""), axis=1 )
cap_df.drop("text_1",inplace=True, axis=1)

In [9]:
cap_df.sample(10)

Unnamed: 0,title,time,text
117,New CAP,2021-10-27 20:40:00,['What peoples thought on leased land and enti...
15,Post 2020 CAP,2018-12-15 17:08:00,"['', '', ""Completely agree, it's redicolous."",..."
107,New CAP,2021-10-27 17:23:00,"['', '', ""I don't know about AF as it'd be too..."
25,CAP Payments Reference Years 2000-2002,2017-08-27 19:21:00,"['', '', ""It was well flagged that the link'd ..."
72,New CAP for 2023,2022-03-16 16:38:00,"['', '', 'Ah indeed it was a draft I got the t..."
78,New CAP for 2023,2022-03-16 18:56:00,"['', '', 'Yeah just picked it as a random refe..."
124,New CAP,2021-10-27 21:01:00,"['', '', 'The payments follow which ever is th..."
17,Cap 2013,2013-07-23 18:57:00,['Good road frontage?;) ']
166,New CAP,2021-11-13 09:57:00,"['', '', 'Depends. Dept fella said they first ..."
52,New CAP,2021-10-27 09:39:00,"['', '', ""You can on that farmers journal calc..."


In [10]:
# Handling emoji's
def replace_emoji(w:str):
   for key in emoji.unicode_codes.EMOJI_DATA:
      if key in w:
         lang = emoji.unicode_codes.EMOJI_DATA.get(key)
         w = w.replace(key,str(lang.get('en')).replace("_"," ").replace(":"," "))
   return w

def clean_word_emoji(w):
   w = re.sub(r'[\[A-Za-z\]]',"",w)
   w = w.replace("'","")
   return w

In [11]:
# Create a copy of original text
cap_df['text_original'] = cap_df['text'].astype("str")

# Replace emoji with name
cap_df['text'] = cap_df['text'].apply(lambda x: replace_emoji(x))

# Original Word count
cap_df['word_count'] = cap_df['text'].apply(lambda x: len(str(x).split(" ")))

# Average word length
cap_df['avg_word'] = cap_df['text'].apply(lambda x: avg_word(x))

# Create column with stop words
stop = stopwords.words('english')
cap_df['stopwords'] = cap_df['text'].apply(lambda x: len([x for x in x.split() if x in stop]))
cap_df[['text','stopwords']].head()

# Number of numers in text
cap_df['numeric_values'] = cap_df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
cap_df[['text','numeric_values']].head()

# Highlight upper case words
cap_df['upper'] = cap_df['text'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
cap_df[['text','upper']].head()

# Convert text to lower
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
cap_df['text'].head()

# Remove stop words after text formatting
stop = stopwords.words('english')
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
cap_df['text'].head()

# Remove puntuation
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in punct))
cap_df['text'].head()

# remove 10 most frequent words
freq = pd.Series(' '.join(cap_df['text']).split()).value_counts()[:10]
freq = list(freq.index)
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
cap_df['text'].head()

# remove 10 less frequet words
freq = pd.Series(' '.join(cap_df['text']).split()).value_counts()[-10:]
freq = list(freq.index)
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
cap_df['text'].head()

# Remove special characters
cap_df = cap_df.replace(r'[^A-Za-z0-9 ]+', '', regex=True)

# Second pass stop words after text formatting
stop = stopwords.words('english')
cap_df['text'] = cap_df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
cap_df['text'].head()

0    wondering room thinks following theory widowed...
1    entitlements belong mother leased current tena...
2    ridiculous scheme anyway pure laziness let sit...
3    completely agree though instinct tells phil ho...
4    quote mairead mcguinness permanent pasture asp...
Name: text, dtype: object

In [12]:
# Get sentiment using TextBlob
cap_df['sentiment'] = cap_df['text'].apply(lambda x: TextBlob(x).sentiment[0] )


In [13]:
cap_df.query("sentiment > 0.3 ")

Unnamed: 0,title,time,text,text_original,word_count,avg_word,stopwords,numeric_values,upper,sentiment
7,CAP Payments Reference Years 20002002,2017-08-27 18:57:00,chosen privie information others lucky,I would think the chosen few were privie to th...,16,4.6875,7,0,1,0.333333
8,CAP Payments Reference Years 20002002,2017-08-27 19:02:00,afairm fairly well known going reference years...,Afairm it was fairly well known there were g...,21,4.47619,8,0,0,0.7
17,Cap 2013,2013-07-23 18:57:00,good road frontage,Good road frontage,4,5.75,0,0,0,0.7
28,New CAP,2021-10-26 19:43:00,happy im farmer whats new,Not happy but then Im a farmer so whats new,13,3.615385,4,0,0,0.468182
31,New CAP,2021-10-26 19:47:00,convergence heading right direction done thoug...,Convergence heading in the right direction m...,53,4.698113,18,0,4,0.324416
33,New CAP,2021-10-26 20:00:00,10k seems top new environmental scheme payment...,10k seems to be the top of the new environment...,87,4.321839,38,0,4,0.347273
62,cap reform,2013-12-28 13:43:00,im guessing coincide sps 14 application period...,Im guessing it will coincide with the sps 14...,18,5.055556,5,1,0,0.6
74,New CAP for 2023,2022-03-16 17:16:00,ok 25 figure using calc got saying 25 suckler ...,OK so is the 25 figure youre using in that cal...,23,3.695652,11,2,1,0.5
75,New CAP for 2023,2022-03-16 17:18:00,reference number ie best 3 years averagedlets ...,Is it not the reference number ie your best 3 ...,38,5.026316,16,2,0,1.0
87,New CAP for 2023,2022-03-16 20:38:00,ah ok thanks,ah ok thanks,5,4.0,0,0,0,0.35


## Generate negative sentiment with values from ifa dataset as reference. 

Dataset in IFA are market briefings and articles describing facts or events on farming sector, it's clear that the sentiment should be neutral without any intention to translate / influence the reader or to induce an opinion. This give us a good reference of what neutral sentiment should be.

On the other hand, boards.ie post and dicussions regarding CAP are opinions and discussion on the topic. As a referenced to produce thresholds to determine the sentiment of them thresholds calculated from IFA articles will be used.

In [14]:
negative_sentiment_threshold = round(market_price_df.sentiment.describe().reset_index().query("index == '25%'")["sentiment"].values[0],2)
positive_sentiment_threshold = round(market_price_df.sentiment.describe().reset_index().query("index == '75%'")["sentiment"].values[0],2)

print(f"positve thestholds: \t({positive_sentiment_threshold},1]")
print(f"neutral: \t\t[{negative_sentiment_threshold},{positive_sentiment_threshold}]")
print(f"negatvie: \t\t[-1,{negative_sentiment_threshold})")


positve thestholds: 	(0.1,1]
neutral: 		[-0.06,0.1]
negatvie: 		[-1,-0.06)


In [15]:
def sentiment_class(sentiment:float):
    if(sentiment > positive_sentiment_threshold):
        return "positive"
    elif(sentiment >= negative_sentiment_threshold):
        return "neutral"
    else:
        return "negative"

cap_df['sentiment_class'] = cap_df['sentiment'].apply(lambda x: sentiment_class(x))

In [16]:
cap_df.sentiment_class.value_counts()

neutral     74
positive    69
negative    34
Name: sentiment_class, dtype: int64

# Data Prep

In [17]:
columns = ["text","word_count","avg_word","stopwords","numeric_values","upper"]
X = cap_df[columns]
y = cap_df["sentiment_class"]
# Map class to integer 0, 1, 2
y = y.apply(lambda x: ['negative', 'neutral', 'positive'].index(x))

## NLP methods

1. Count Vectorizer
2. TF-IDF

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate an object cv by calling a method named as CountVectorizer()
cv = CountVectorizer(max_features = 3000)

# Train the dataset by calling a fit_transform() method
X_fin = cv.fit_transform(X.text).toarray()

# Display the rows and colums
X_fin.shape

# Instantiate an object model by calling a method MultinomialNB()
model = MultinomialNB()

# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_fin, y, test_size = 0.3)

# Train the model by calling a method fit()
model.fit(X_train,y_train)

# Call predict() method
y_pred = model.predict(X_test)

In [19]:
# Instantiate a mthod named as Cla
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

# Display the results for train and test
print(f"Train Data Predict R2: {train_r2}")
print(f"Test Data Predict R2: {test_r2}")

# Instantiate a mthod named as Cla
cf_cv = classification_report(y_test, y_pred)

Train Data Predict R2: 0.967479674796748
Test Data Predict R2: 0.4444444444444444


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an object 'tf' by calling a method TfidfVectorizer()
tfidf = TfidfVectorizer(max_features = 3000)

# Train the dataset by calling a method fit_tranform() 
X_tfidf = tfidf.fit_transform(X.text).toarray()

# Instantiate an object model by calling a method MultinomialNB()
model_tdidf = MultinomialNB()

# Split the dataset into training and testing parts
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.3)

# Train the model by calling a method fit()
model_tdidf.fit(X_train, y_train)

# Call predict() method
y_pred = model_tdidf.predict(X_test)


# Instantiate a mthod named as Cla
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

# Display the results for train and test
print(f"Train Data Predict R2: {train_r2}")
print(f"Test Data Predict R2: {test_r2}")

# Instantiate a mthod named as Cla
cf_tdidf = classification_report(y_test, y_pred)

Train Data Predict R2: 0.7804878048780488
Test Data Predict R2: 0.8703703703703703


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Display the values of an object cf
print(cf_cv)
# Display the values of an object cf
print(cf_tdidf)

              precision    recall  f1-score   support

           0       0.40      0.31      0.35        13
           1       0.48      0.52      0.50        21
           2       0.43      0.45      0.44        20

    accuracy                           0.44        54
   macro avg       0.44      0.43      0.43        54
weighted avg       0.44      0.44      0.44        54

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.59      0.38      0.47        26
           2       0.30      0.65      0.41        17

    accuracy                           0.39        54
   macro avg       0.30      0.34      0.29        54
weighted avg       0.38      0.39      0.35        54

