## CA2 ##
## IRELAND MEAT PRODUCTION / CAP PERFORMANCE COMPARED TO OTHER COUNTRIES

1. Step 1 : Exploratory Data Analysis
2. Step 2 : Statistics
3. Step 3 : ML Model around our data
4. Step 4 : Optimization with Python

In [1]:
#import of library for exploration of data
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt

# We can suppress the warnings for a better reading
import warnings
warnings.filterwarnings('ignore')



# 1. Exploratory Data Analysis

In [2]:
# Source data exported from FAOSTAT:
# https://www.fao.org/faostat/en/#data/QV

# Filters applied on the FAOSTAT site before export:
#   Country = Ireland, France, Spain, USA
#   Year    = 2018, 2019, 2020 (2021 and 2022 were not available yet)

# Load the exported CSV. The path is relative to the notebook's working
# directory, so the "faostat" folder must sit next to the notebook.

ag_production_value = "faostat/irl_fr_sp_usa.csv"
ag_prod_value_df = pd.read_csv(ag_production_value)

# Quick overview of the first rows
ag_prod_value_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'faostat/irl_fr_sp_usa.csv'

In [None]:
ag_prod_value_df.info()

In [None]:
#View rows and column count
ag_prod_value_df.shape

In [None]:
# Check columns name if there is any irrelevant spelling
ag_prod_value_df.columns.values

In [None]:
# Standardise the column names so they are easier to work with: a single
# leading capital, underscores instead of spaces and no parentheses.
# Columns whose name already has that form need no entry in the mapping.
column_renames = {
    "Domain Code": "Domain_code",
    "Area Code (M49)": "Area_code_m49",
    "Element Code": "Element_code",
    "Item Code (CPC)": "Item_code_cpc",
    "Year Code": "Year_code",
    "Flag Description": "Flag_description",
}
ag_prod_value_df.rename(columns=column_renames, inplace=True)

In [None]:
# Let's view Unique Value in axes 0 -> In each Column
ag_prod_value_df.nunique(axis = 0)

In [None]:
# Columns that hold a single value will not help the analysis and can be removed.
# Before removing anything, print the distinct values of the candidate columns
# (Domain_code, Domain, Flag, Flag_description) together with the two Area columns.
unique_value_reports = [
    ("##### Domain_code: #####", 'Domain_code'),
    ("\n##### Domain : #####", 'Domain'),
    ("\n##### Area_code_m49 : #####", 'Area_code_m49'),
    ("##### Area: #####", 'Area'),
    ("\n##### Flag : #####", 'Flag'),
    ("\n##### Flag_descritpion : #####", 'Flag_description'),
]

for header, column in unique_value_reports:
    print(header)
    print(pd.unique(ag_prod_value_df[column]))



In [None]:
# Drop the columns that were reported as holding a single value
constant_columns = ['Domain_code', 'Domain', 'Flag', 'Flag_description']
ag_prod_value_df = ag_prod_value_df.drop(columns=constant_columns)

# Overview after the removal
ag_prod_value_df.head()

In [None]:
# Look at the distinct values of Element, Element_code and Unit as well
for column in ('Element', 'Element_code', 'Unit'):
    print(f"\n##### {column} : #####")
    print(pd.unique(ag_prod_value_df[column]))


# Definitions and standards used in FAOSTAT:
# I$  = international dollar
# SLC = standard local currency


In [None]:
## Simplification: the Unit is treated as the US dollar, and for Ireland/France/Spain
## the SLC is taken at parity (1 euro = 1 dollar).
## The columns Element, Element_code and Year_code are therefore not needed any more.
redundant_columns = ['Element', 'Element_code', 'Year_code']
ag_prod_value_df = ag_prod_value_df.drop(columns=redundant_columns)
ag_prod_value_df.head()

In [None]:
## Check values in Item
print("\n##### Item : #####")
print(pd.unique(ag_prod_value_df['Item']))

In [None]:
## We will focus on the production value of meat and milk.
## Let's simplify the dataframe by creating a new one that contains only the meat and milk rows.

In [None]:
# Keep only the rows whose Item mentions meat or milk. The three spellings
# matched are 'Meat', 'meat' and 'milk', combined into one regular expression.
# .copy() makes meat_milk_df an independent DataFrame: later cells add columns
# to it, and writing to a slice of ag_prod_value_df triggers pandas'
# SettingWithCopyWarning (hidden here by the global warning filter).
is_meat_or_milk = ag_prod_value_df['Item'].str.contains('Meat|meat|milk')
meat_milk_df = ag_prod_value_df.loc[is_meat_or_milk].copy()

In [None]:
#Quick Overview of the dataframe
meat_milk_df.head()

In [None]:
meat_milk_df.shape

In [None]:
#Let's check that we have all values in which we are focusing
## Check values in Item
print("\n##### Item : #####")
print(pd.unique(meat_milk_df['Item']))

In [None]:
# It is more useful to categorise and classify these items by creating 3 new columns.
# Category column -> classifies each row by type: MEAT or MILK


In [None]:
# Category column: 'MILK' when the item name contains 'milk', 'MEAT' otherwise
is_milk = meat_milk_df['Item'].str.contains('milk')
meat_milk_df['Category'] = np.where(is_milk, 'MILK', 'MEAT')


In [None]:
#Quick Overview
meat_milk_df.head()

In [None]:
# Animal column: derive the animal from a keyword found in the item name.
# Each pair is (keyword searched in Item, label written to Animal).
# np.select keeps the FIRST matching condition, so the order of the pairs matters.
# The former separate 'goats' entry could never be selected because 'goat'
# already matches every item containing 'goats'; it has been removed.
# NOTE(review): these are plain substring tests, so 'pig' would also match an
# item containing 'pigeons' — confirm no such item is present in the data.
animal_keywords = [
    ('Horse', 'Horse'),
    ('cattle', 'Cattle'),
    ('chickens', 'Chicken'),
    ('ducks', 'Duck'),
    ('geese', 'Geese'),
    ('goat', 'Goat'),
    ('pig', 'Pig'),
    ('rabbits', 'Rabbit'),
    ('sheep', 'Sheep'),
    ('turkeys', 'Turkey'),
    ('Game', 'Game'),
]

item_names = meat_milk_df['Item']
conditions = [item_names.str.contains(keyword) for keyword, _ in animal_keywords]
choices = [animal for _, animal in animal_keywords]

# Items matching no keyword get an empty string
meat_milk_df['Animal'] = np.select(conditions, choices, default="")

In [None]:
meat_milk_df.head()

In [None]:
#Remind quickly available values
pd.unique(meat_milk_df['Animal'])

In [None]:
# Animal_group column: a coarser classification of the Animal column.
# Each group lists the Animal labels it contains.
animal_groups = {
    'Cattle': ['Cattle'],
    'Poultry': ['Chicken', 'Duck', 'Geese', 'Rabbit', 'Turkey'],
    'Sheep': ['Sheep', 'Goat'],
    'Other': ['Horse', 'Pig', 'Game'],
}

# Invert the table into the Animal -> Animal_group lookup used by map()
animal_group_dictionnary = {}
for group, animals in animal_groups.items():
    for animal in animals:
        animal_group_dictionnary[animal] = group

# Animal values that are not in the lookup become NaN
meat_milk_df['Animal_group'] = meat_milk_df['Animal'].map(animal_group_dictionnary)



In [None]:
#Quick check of new DF
meat_milk_df.head()

In [None]:
#Review of result 
pd.unique(meat_milk_df['Animal_group'])

In [None]:
#Check if we need to remove rows with nan values 
print(meat_milk_df.isnull().sum())

In [None]:
#Last Check to view if we are ready to plot all these and start our analysis
meat_milk_df.count()

In [None]:
#Check the shape of the DF
meat_milk_df.shape

### Our Conclusion : 
Our dataframe doesn't contain any null values, so it is ready to be plotted and we can start working with it.

## Visualization of our dataframe for Ireland

In [None]:
# Keep the Irish rows only, then total Value for every
# (Year, Animal, Animal_group, Category) combination.
ireland_rows = meat_milk_df[meat_milk_df['Area'] == 'Ireland']
ireland_keys = ['Year', 'Animal', 'Animal_group', 'Category']
ir_mm_df = ireland_rows.groupby(ireland_keys, as_index=False)[['Value']].sum()


In [None]:
type(meat_milk_df)

In [None]:
ir_mm_df.head(10)

In [None]:
# Altair is used for the interactive charts of this notebook
import altair as alt

# First interactive overview for Ireland: one point per row of ir_mm_df,
# with Animal on the x axis and Value on the y axis.
# Hovering over a point shows its Animal, Category, Value and Year.
alt.Chart(ir_mm_df).mark_point().encode(
    alt.X('Animal'),
    alt.Y('Value'),
    tooltip=['Animal', 'Category', 'Value', 'Year']
).properties(
    width=800,
    height=300
).configure_point(
    size=100)


In [None]:
# Interactive visualization: production value by animal, one year at a time.
# A slider bound to the Year field chooses which year is displayed.
# Despite its name, select_country selects a YEAR.
# The initial value is the number 2018, the same type as the numeric bounds of
# the slider. The former string '2018' would presumably not match the Year
# values of ir_mm_df, leaving the chart empty until the slider is moved.
select_country = alt.selection_single(
    name='select', fields=['Year'], init={'Year': 2018},
    bind=alt.binding_range(min=2018, max=2020, step=1)
)
alt.Chart(ir_mm_df).mark_point(filled=True).encode(
    alt.X('Animal', scale=alt.Scale(zero=False)),
    alt.Y('Value', scale=alt.Scale(zero=False)),
    tooltip=['Animal', 'Category', 'Value', 'Year'],
    color='Category',
).add_selection(select_country).transform_filter(select_country).properties(
    width=600,
    height=300,
    title = 'Ireland Meat Milk Production Value From 2018 to 2020'
).configure_point(
    size=100)

## Our Conclusion :
We can see that cattle has the highest production value, followed by pig and sheep, over the last 3 years.

## 2 - STATISTICS -> CAP PERFORMANCE ON MEAT MILK PRODUCTION VALUE

For simplicity we study the performance over the last 3 years available in our dataframe: 2018, 2019 and 2020. We compare Ireland's performance with two of the top EU countries in agricultural production (France and Spain), using data provided by the agridata website. The CAP provides income support for farmers in each EU member country. The income support amount is calculated as follows:
Number of CAP beneficiaries (for the year) multiplied by the amount per CAP beneficiary (for the year).

The public website does not offer a dataset suited to this analysis: the available datasets are bulk exports containing far more data than we need. I therefore build the dataframe manually, which saves time and lines of code.

The source of these numbers are from the visual dashboard of agridata.

In [None]:
# Build the cap_df dataframe by hand from the figures shown on the agridata website.
# One record per (year, entity):
#   (Year, Entity, Beneficiary_number, Amount_per_beneficiary)
cap_records = [
    (2018, 'EU', 6158770, 6480),
    (2018, 'France', 325810, 22990),
    (2018, 'Ireland', 123300, 10330),
    (2018, 'Spain', 663190, 7460),
    (2019, 'EU', 6064820, 6610),
    (2019, 'France', 321110, 23370),
    (2019, 'Ireland', 122580, 10540),
    (2019, 'Spain', 654400, 7620),
    (2020, 'EU', 5996360, 6620),
    (2020, 'France', 316120, 23400),
    (2020, 'Ireland', 122450, 10540),
    (2020, 'Spain', 644500, 7590),
]

cap_df = pd.DataFrame(
    cap_records,
    columns=['Year', 'Entity', 'Beneficiary_number', 'Amount_per_beneficiary'],
)


In [None]:
#Quick Overview
cap_df.info()

In [None]:
#Quick Overview
cap_df.head()

In [None]:
cap_df['Total_amount'] = cap_df['Amount_per_beneficiary']*cap_df['Beneficiary_number']

In [None]:
cap_df.head()

Our Conclusion : The cap_df dataframe now contains, for the last 3 years, the CAP figures for Ireland, France and Spain, including the total amount of farming income support paid to each country. For our analysis I have also included the total paid by the EU for those years, in order to have a global overview of the impact of the CAP support plan.

### Descriptive Statistics
As a reminder we are in the situation where we have the production value for Ireland, Spain, France for last 3 years.
We have the CAP support amount for each for these countries and for the last 3 years.
We can now start to analyse the production value behaviour and see the impact of the CAP on a specific agriculture, here is for Meat and Milk.

1. Ireland production value of meat/milk on the last 3 years.
2. Central tendency: mean, mode, median.
3. Variation Measures

In [None]:
#Reminder of what we have as in our Meat/Milk production value for Ireland in our dataframe.
ir_mm_df.head()

In [None]:
# Total Meat/Milk production value per year.
# Only 'Value' is summed: the other columns of ir_mm_df (Animal, Animal_group,
# Category) hold text, and summing them produces nothing meaningful.
ir_mm_df = ir_mm_df.groupby("Year")[["Value"]].sum()
ir_mm_df.head()

In [None]:
# Mean, median, variance and standard deviation of the yearly Meat/Milk values.
yearly_values = ir_mm_df['Value']

ir_mm_mean = yearly_values.mean()
ir_mm_var = yearly_values.var()
# The standard deviation is the square root of the variance
ir_mm_deviation = math.sqrt(ir_mm_var)

print("## The Mean for Meat Milk Production Value is : ", ir_mm_mean)
print("## The Median for Meat Milk Production Value is : ", yearly_values.median())
print("## The Variance for Meat Milk production Value is :", ir_mm_var)
print("## The Deviation for Meat Milk Production Value is :", ir_mm_deviation)

In [None]:
#Let's plot Ireland Meat/Milk Production Value for the last 3 Years
import plotly.express as px
px.box(ir_mm_df, y='Value', width=600, height=800, title='Ireland Meat/Milk Production Value Last 3 years')

### Inferential Statistics

If we look at Ireland's Meat/Milk production value over the last 3 years, we see a constant increase. Using the standard deviation calculated above, we can see that this value increases in a roughly linear way, with a standard deviation of:



In [None]:
# For the last 3 years 
ir_mm_deviation

As a reminder the cap value distributed by country in EU between France, Spain, Ireland is as follow :

In [None]:
#Quick Overview
cap_df.head(16)

Looking at the figures recorded for Ireland in this dataframe, Amount_per_beneficiary increased by 210 euros between 2018 and 2019
(about 2% of its 2018 value) and kept the same value in 2020. We can also notice that the number of beneficiaries decreased from 123300 to 122450.
This has not prevented Ireland from increasing its Meat/Milk production value, which is an interesting scenario.



Our Conclusion : It seems that the amount per beneficiary has probably reached the value that gives the best performance in the Irish market. Is this the case? Let's check it with a hypothesis test.

#### Hypothesis Test ## T-Test One Population
Can the production value for Ireland keep increasing with the same deviation over the coming years? In other words, will the CAP amount per beneficiary enable Ireland to increase its Meat/Milk production over the next three years if the amount allocated per beneficiary stays the same, assuming that the number of beneficiaries has now stabilised and will not change significantly in that period?

Following values are known :

1. s, the sample standard deviation of the Meat/Milk production value for Ireland -> ir_mm_deviation = 1092384
2. n, the number of observations in this dataset -> 3
3. x̅, the mean of the last 3 years for Ireland -> ir_mm_mean = 50721621
4. α, the significance level, set at 5%

Our Scenario :
1. -> H0 -> the scenario where μ = 50721621
2. -> H1 -> the scenario where μ ≠ 50721621


In [None]:
#Import the library
import scipy.stats as stats

# Define the variable 
X = ir_mm_df['Value']; X

In [None]:
# One-sample t-test on the yearly production values X.
# H0 : mu = 50721621   (the value held in ir_mm_mean)
# H1 : mu != 50721621
# Call shape: stats.ttest_1samp(sample, hypothesised mean under H0)
# NOTE(review): ir_mm_mean is the mean of X itself, so the statistic is 0 and
# the p-value is 1 by construction. A hypothesised mean coming from an
# independent source is needed for this test to be informative.
stats.ttest_1samp(X,ir_mm_mean)

#### Our Conclusion : 
We obtain a t-statistic equal to zero and a p-value equal to 1, so we cannot reject H0. Note that this outcome is expected by construction: the hypothesised mean is the mean of the sample itself, so the test always returns t = 0 and p = 1. On its own it therefore does not tell us whether the production value will stay at its average.

#### Confidence Interval

Calculation of the Confidence Interval as a reminder with following values :
1. Number of sample data we have in this dataset  -> 3
2. x̅ as the mean for the las 3 years for Ireland -> ir_mm_mean = 50721621 
3. We will use a confidence level at 95%

In [None]:
#Let's firts calculate the degre of freedom which is sample size -1
degree_of_freedom = len(ir_mm_df['Value']) -1
degree_of_freedom

In [None]:
#import standard error library
from scipy.stats import sem

In [None]:
#Calculate the standard error
standard_error = sem(ir_mm_df['Value'])

In [None]:
#We set the confidence level
confidence_level = 0.95

In [None]:
#import t library 
from scipy.stats import t

In [None]:
#Calulating our Confidence Interval
t.interval(confidence_level, degree_of_freedom, loc=ir_mm_mean, scale=standard_error)

#### Our Conclusion : 
Our Confidence interval has been calculated and we obtained an upper value of 53435255 and a lower value of 48007987. 

### Other country performance against Ireland

Ireland, France, Spain are all members of EU and receive different amount of CAP. The CAP is mainly an income support for farmers. If we look only at revenue production of these countries to measure their performance we will not get accurate analysis aligned with the reality.

This is why i'm adding this CAP coefficent in our study. 
I've defined this coefficient number = Total_amount(for Year n for specific country) / Total_amount(for Year n for Total EU)



In [None]:
#Quick Overview
cap_df.head(12)

In [None]:
## Adding the Cap_coefficient column:
##   Total_amount of the entity / Total_amount of the EU for the same year.
# Look the EU total up by Year instead of relying on the EU row appearing
# before the country rows of each year, as the former row-by-row loop did.
eu_rows = cap_df.loc[cap_df['Entity'] == 'EU']
eu_total_by_year = eu_rows.set_index('Year')['Total_amount']

# EU rows are divided by their own total and therefore get a coefficient of 1
cap_df['Cap_coefficient'] = cap_df['Total_amount'] / cap_df['Year'].map(eu_total_by_year)

In [None]:
cap_df.head(12)

In [None]:
# Keep the Ireland, Spain and France rows of meat_milk_df, then total Value
# for every (Year, Area, Animal, Animal_group, Category) combination.
eu_countries = ['Ireland', 'Spain', 'France']
eu_rows = meat_milk_df[meat_milk_df['Area'].isin(eu_countries)]
eu_keys = ['Year', 'Area', 'Animal', 'Animal_group', 'Category']
eu_mm_df = eu_rows.groupby(eu_keys, as_index=False)[['Value']].sum()

In [None]:
eu_mm_df.rename(columns = {"Value" :"Production_value", "Area" : "Entity"}, inplace = True)
eu_mm_df.head()

In [None]:
#Grouping data rows
eu_performance_df = eu_mm_df.groupby(['Year','Entity'], as_index=False)[['Production_value']].sum()
eu_performance_df.head()

In [None]:
# Merge cap_df with eu_performance_df on (Year, Entity) to attach the
# Production_value column to the CAP figures.
# merge() defaults to an inner join: only the (Year, Entity) pairs present in
# both dataframes are kept.
# NOTE(review): eu_performance_df is built from Ireland/Spain/France rows only,
# so the 'EU' rows of cap_df are presumably dropped here — confirm this is intended.
# NOTE(review): cap_df is overwritten, so running this cell twice merges the
# already-merged dataframe again.
cap_df = cap_df.merge(eu_performance_df, on=['Year','Entity'])

In [None]:
#Quick Overview
cap_df.head()

In [None]:
# Normality check of the variable "Production_value" with a normal probability plot.
stats.probplot(cap_df.Production_value, plot=plt)
# Display the figure probplot has just drawn on.
# (plt.figure() would open a new, empty figure instead.)
plt.show()

Our Conclusion : The points of the probability plot lie close to a straight line for each country, which suggests that our data are approximately normally distributed.

### Shapiro wilk Test

In [None]:
#Shapiro wilk test
stats.shapiro(cap_df.Production_value[cap_df.Entity == "France"])

In [None]:
#Shapiro wilk test
stats.shapiro(cap_df.Production_value[cap_df.Entity == "Spain"])

In [None]:
#Shapiro wilk test
stats.shapiro(cap_df.Production_value[cap_df.Entity == "Ireland"])

Our Conclusion : The p-values are greater than 0.05 for all countries, so the Shapiro-Wilk test does not reject the normality of our data, which is consistent with the graph obtained above. Keep in mind that each country has only 3 observations, so the test has very little power.

### Standard Deviation 

In [None]:
#Calculation for France/Ireland/Spain
france = cap_df.Production_value[cap_df.Entity == "France"]
ireland = cap_df.Production_value[cap_df.Entity == "Ireland"]
spain = cap_df.Production_value[cap_df.Entity == "Spain"]


In [None]:
print("Standard Deviation for France is :", france.std(),"for Ireland is :", ireland.std(), "for Spain is :", spain.std())

Our Conclusion : The standard deviation is the highest for Spain and lowest for France.

### Test of Homogeneity of Variance

In [None]:
#Homogeneity of variance: Levene's test
from scipy.stats import levene
levene(france, spain, ireland, center = 'mean')

Our Conclusion : We obtain a p-value greater than 0.05, so Levene's test is not significant: we cannot reject the hypothesis that the variances are equal.

### One-way ANOVA ( Analysis of Variance between 2 or more groups)

Analyzing variance between 3 groups France, Spain, Ireland.

In [None]:
# ONE-WAY ANOVA: does the mean Production_value differ between entities?
from statsmodels.formula.api import ols
import statsmodels.api as sm

model = ols('Production_value~Entity', data = cap_df).fit()
# The keyword is `typ`, not `type`: anova_lm silently ignores an unknown
# `type=` argument and falls back to its default Type I table.
aov = sm.stats.anova_lm(model, typ=2)
print(aov)


p < 0.05: there is a significant difference between the entities.

In [None]:
# ANOVA with two explanatory terms: Entity and Cap_coefficient
model2 = ols('Production_value~Entity+Cap_coefficient', data = cap_df).fit()
# The keyword is `typ`, not `type`: with `type=2` the argument is silently
# ignored and a Type I (sequential) table is returned. With two terms in the
# model that table is generally different from the Type II table requested here.
aov2 = sm.stats.anova_lm(model2, typ=2)
print(aov2)

## 3 - ML ON CAP PERFORMANCE

Supervised ML - based on CRISP-DM - Business Case/Need
We want to see whether we can predict the CAP coefficient that best enables an EU country to improve its agricultural production value, specifically for meat/milk. As we are looking for a number and/or a class, supervised ML is the most appropriate methodology.

1. Choice of ML model : Supervised ML -> classsification -> Test + Training + Prediction 
2. Sentiment analysis :
3. train and test supervised model
4. Plotting ML model and comparing their performance

## Choice of ML Model : Supervised ML and Classification

In [None]:
#As reminder available dataframe : cap_df : EU
cap_df.head()

## Classification and Classification Rule
#We are in a classification situation, so we add a new column to classify the result.
#We apply the following rule to classify the Cap_coefficient as Good or Bad:
#Performance_score = Production_value / Total_amount 
1. If Performance_score > Cap_coefficient -> is Good
2. If Performance_score < Cap_coefficient -> is Bad

In [None]:
#Let's create Performance_score to measure the performance
cap_df['Performance_score'] = cap_df['Production_value']/cap_df['Total_amount']

In [None]:
cap_df.head()

In [None]:
cap_df.shape

In [None]:
#Let's create then the Performance Class
cap_df['Performance_class'] = np.where(cap_df.Performance_score > cap_df.Cap_coefficient, "Good", "Bad" )

In [None]:
cap_df.head(9)

Our Conclusion : Based on our classification criteria for the last 3 years, Ireland is performing well compared with Spain and France. The CAP coefficient does not seem to deliver enough performance in France and Spain.


In [None]:
#Checking the distribution of our classification
cap_df['Performance_class'].value_counts(normalize=True)

In [None]:
# The model needs numeric inputs, so the string column Entity is replaced by
# integer codes produced by LabelEncoder.
# Only Entity is encoded here; Performance_class is left as strings.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

cap_df['Entity'] = le.fit_transform(cap_df['Entity'])
cap_df.head()

## TEST - TRAINING  - PREDICTION

In [None]:
# Loading the scikit-learn tools used for training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Features X: every column except the target. Target y: Performance_class.
# NOTE(review): X still contains Performance_score and Cap_coefficient, and
# Performance_class is defined by comparing exactly these two columns, so the
# label can be read straight off the features (target leakage) — confirm
# whether they should be excluded from X.
X = cap_df.drop(['Performance_class'], axis=1)
y = cap_df['Performance_class']

# Split the data into 70% training / 30% test with test_size = 0.3;
# random_state = 0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)

# Display the number of rows and columns of each part
X.shape, y.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# We can see here our distribution of 70% and 30% between training and test data.

In [None]:
#Using the logistic regression algorythm
logreg = OneVsRestClassifier(LogisticRegression())
logreg.fit(X_train, y_train)

In [None]:
#Let's do prediction for the test value provided
y_pred = logreg.predict(X_test)
y_pred

In [None]:
#Let's evaluate our model, we will use classification report for that
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [None]:
#We can check also the accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

Our Conclusion : Accuracy score is at the highest level.  

### Logistic Regression

In [None]:
#Applying Cross Validation 
from sklearn.model_selection import cross_val_score
cv_score_logistic_reg = cross_val_score(LogisticRegression(max_iter=1000),X,y,cv=5)

In [None]:
print(cv_score_logistic_reg)

Our Conclusion : Cross-validation with 5 folds confirms the model at its highest score.

### GridSearchCV

In [None]:
#Applying GrydSearchCv Method to find the best 
from sklearn.model_selection import GridSearchCV



## SENTIMENT ANALYSIS

### Meat Supply Balance 2021
source data :  https://www.cso.ie/en/releasesandpublications/ep/p-msb/meatsupplybalance2021/


#### PRE - PROCESSING

In [None]:
# requests downloads the page from the server; BeautifulSoup parses the HTML it returns.
import requests
from bs4 import BeautifulSoup
url_source = 'https://www.cso.ie/en/releasesandpublications/ep/p-msb/meatsupplybalance2021/'
# The timeout (in seconds) stops the notebook from waiting forever if the server does not answer.
get_url = requests.get(url_source, timeout=30)
# Stop here with an HTTPError on a 4xx/5xx answer instead of parsing an error page later on.
get_url.raise_for_status()

In [None]:
#Check the connection with hosting server. 200 Required to confirm that the connection is made.
print(get_url)

In [None]:
#read the data with BeautifulSoup  
stream = BeautifulSoup(get_url.text, "html.parser")

In [None]:
#Check type of stream
type(stream)

In [None]:
#Check the lenght
len(stream)

In [None]:
#Extracting text in an array
page_text = stream.text
print(page_text)

In [None]:
# Check the type of page_text 
type(page_text)

#### NLP PROCESSING 

In [None]:
#Import Necessary library for NLP
import os
import nltk
import nltk.corpus

In [None]:
#Donwloading required library data
nltk.download('punkt')

In [None]:
#Tokenization of the content available in page_text
from nltk.tokenize import word_tokenize
ai_tokens = word_tokenize(page_text)

In [None]:
#quick overview of our tokenization
ai_tokens

In [None]:
#Check the number of tokens we have
len(ai_tokens)

In [None]:
#Standardisation of the text to limit the tokenization number
from nltk.probability import FreqDist
fdist = FreqDist()

In [None]:
# Frequency count of the lower-cased tokens.
# fdist is rebuilt from scratch so that running this cell again does not add
# every count a second time.
fdist = FreqDist(word.lower() for word in ai_tokens)
fdist

In [None]:
# Check of Number of unqiue word 
len(fdist)

In [None]:
# The twenty most frequent tokens
fdist_top20 = fdist.most_common(20)
# Former (misleading) name kept so that code still referring to it keeps working
fdist_top10 = fdist_top20
fdist_top20

In [None]:
# Pattern matching the characters to strip from the tokens: the punctuation
# marks - . ? ! , : ; ( ) the '|' character and the digits 0-9.
# No stop-word list is involved here: the pattern only targets these characters.
import re
punctuation = re.compile(r'[-.?!,:;()|0-9]')

In [None]:
# Strip the punctuation/digit characters from every token and keep only the
# tokens that still contain something afterwards.
stripped_tokens = (punctuation.sub("", token) for token in ai_tokens)
post_punctuation = [token for token in stripped_tokens if token]
        
    

In [None]:
post_punctuation

In [None]:
#Checking the lenght after removing punctuation
len(post_punctuation)

In [None]:
## POS ## Part-of-speech tagging of the tokens kept in post_punctuation
# Download only the resources needed for tagging and named-entity chunking.
# Calling nltk.download() without an argument opens the interactive downloader,
# which blocks a non-interactive run of the notebook.
# NOTE(review): resource names depend on the installed NLTK version (recent
# releases use e.g. 'averaged_perceptron_tagger_eng') — confirm before running.
for resource in ('averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

In [None]:
# Tag the whole token list in a single call so that each word is tagged within
# its sequence; tagging one-element lists treats every word in isolation.
for tagged_token in nltk.pos_tag(post_punctuation):
    print(tagged_token)


In [None]:
## NER ## Named Entity Recognition and Classification
from nltk import ne_chunk

# ne_chunk works on (token, tag) pairs, so POS-tag the flat token list first.
# The list is passed as is: wrapping it in another list would hand pos_tag a
# list of lists instead of a list of words.
NE_tag = nltk.pos_tag(post_punctuation)
NE_ner = ne_chunk(NE_tag)
print(NE_ner)