In [25]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
from nltk.tokenize import word_tokenize
import statsmodels.api as sm
from statsmodels.formula.api import ols
from ISLP.models import (ModelSpec as MS,
                         summarize)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from ipywidgets import IntProgress
from IPython.display import display

# special matplotlib argument for improved plots
from matplotlib import rcParams

In [5]:
def regression_graph(y,x,df,title):
    """Show a scatter plot of ``y`` against ``x`` with a fitted regression line.

    Parameters
    ----------
    y, x
        Forwarded to ``sns.regplot`` as its ``y`` and ``x`` arguments.
    df
        Forwarded to ``sns.regplot`` as ``data``.
    title
        Text drawn above the plot.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.

    Notes
    -----
    ``sns.set(style="whitegrid")`` is called on every invocation, so the
    seaborn styling it applies also affects plots drawn afterwards.
    """
    # Marker and line appearance, kept apart from the plotting call
    point_style = {'s': 50, 'alpha': 0.6, 'color': 'b'}
    fit_line_style = {'color': 'red', 'linewidth': 2}

    sns.set(style="whitegrid")

    plt.figure(figsize=(10, 6))
    sns.regplot(x=x, y=y, data=df, fit_reg=True,
                scatter_kws=point_style,
                line_kws=fit_line_style)

    plt.title(title, fontsize=20)

    # Same tick font size on both axes
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=12)

    # Dashed, slightly transparent grid
    plt.grid(True, linestyle='--', alpha=0.7)

    plt.show()

# Import Data

In [6]:
# Read the two CSV inputs; both paths are relative to the working directory.
df_model, df_ndc = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/model_data.csv', '../data/ndc_files.csv')
)

# Exploratory Data Analysis
* Table 1 
* Table 2
* Figure 1

In [None]:
# Summary statistics of df_model, printed as LaTeX table source.
summary_stats = df_model.describe()
print(summary_stats.to_latex())

In [None]:
# Pairwise correlations of the numeric columns, printed as LaTeX table source.
numeric_corr = df_model.corr(numeric_only=True)
print(numeric_corr.to_latex())

In [None]:
# Columns shown in the histogram grid
hist_columns = ['population', 'income', 'democracy', 'health_expenditure',
                'coal_rents', 'temperature_change', 'air_pollution']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(25, 10), squeeze=False)

# Flatten axes array for easy iteration
axes = axes.flatten()

# One histogram per column, each on its own axes
for ax, col in zip(axes, hist_columns):
    df_model[col].plot.hist(ax=ax, alpha=0.7, bins=20)
    ax.set_title(f'Histogram of {col}')
    ax.set_xlabel(f'Value of {col}')
    ax.set_ylabel('Frequency')

# The 3x3 grid has more panels than there are columns; hide the leftover
# axes so the figure does not end with empty frames.
for ax in axes[len(hist_columns):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

# Univariate Regression

In [7]:
# Linear regression of income on air_pollution; the summary table is the
# cell's displayed value.
m = ols(formula='income ~ air_pollution', data=df_model).fit()
m.summary()

  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)
  return _pandas_is_categorical_dtype(dt)


0,1,2,3
Dep. Variable:,income,R-squared:,0.138
Model:,OLS,Adj. R-squared:,0.133
Method:,Least Squares,F-statistic:,27.66
Date:,"Sat, 03 Aug 2024",Prob (F-statistic):,4.23e-07
Time:,09:44:21,Log-Likelihood:,-303.01
No. Observations:,175,AIC:,610.0
Df Residuals:,173,BIC:,616.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,9.4055,0.185,50.750,0.000,9.040,9.771
air_pollution,-0.0332,0.006,-5.259,0.000,-0.046,-0.021

0,1,2,3
Omnibus:,1.5,Durbin-Watson:,1.715
Prob(Omnibus):,0.472,Jarque-Bera (JB):,1.156
Skew:,0.051,Prob(JB):,0.561
Kurtosis:,3.385,Cond. No.,52.3


In [None]:
# Scatter plus fitted line for the same pair of variables as the regression above
regression_graph(
    y='income',
    x='air_pollution',
    df=df_model,
    title='Income vs Air Pollution',
)

# Run Additional Analysis:
* Air Pollution and Democracy
* Income and Democracy

# Logistic Regression

In [11]:
# Binomial GLM (logistic regression): response is whether a row's 'eu' value
# equals 'EU'; predictors are population, income and air_pollution.
allvars = df_model[['population', 'income','air_pollution']]

# Build the design matrix from the predictor columns and cast it to float
design = MS(allvars)
X = design.fit_transform(allvars).astype(float)

# Boolean response: True where eu == 'EU'
y = df_model['eu'].eq('EU')

glm = sm.GLM(endog=y, exog=X, family=sm.families.Binomial())
results = glm.fit()

# Coefficient table is the cell's displayed value
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-13.4285,3.625,-3.705,0.0
population,0.2423,0.123,1.972,0.049
income,1.0149,0.272,3.728,0.0
air_pollution,-0.0804,0.04,-1.993,0.046


# T-SNE

In [None]:
# Keep complete rows only. Doing this first means the label encoder, the
# feature matrix and the columns written back below all refer to exactly the
# same rows. NOTE: this rebinds df_model for every later cell, as the original
# cell did; .copy() makes the result an independent frame before columns are
# added to it.
df_model = df_model.dropna().copy()

# Convert the categorical variable 'eu' to integer codes
le = LabelEncoder()
df_model['eu_num'] = le.fit_transform(df_model['eu'])

# Features to be used in t-SNE
features = ['population', 'income', 'democracy', 'health_expenditure',
       'coal_rents', 'temperature_change', 'air_pollution','total_sentences', 'total_words',
       'health_percent' ]
# df_model has no missing values at this point, so X lines up row-for-row
# with it and no second dropna is needed.
X = df_model[features]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply t-SNE (fixed random_state so the embedding is repeatable)
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_scaled)

# Add t-SNE results to the DataFrame
df_model['tsne-2d-one'] = X_tsne[:, 0]
df_model['tsne-2d-two'] = X_tsne[:, 1]

# One colour per distinct 'eu' label, so the palette always matches the
# number of hue levels actually present.
custom_palette = sns.color_palette('husl', df_model['eu'].nunique())

# Plot t-SNE results
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='tsne-2d-one', y='tsne-2d-two',
    hue='eu',
    palette=custom_palette,
    data=df_model,
    alpha=0.8
)
plt.title('t-SNE of NDC Countries')
plt.xlabel('t-SNE EU 1')
plt.ylabel('t-SNE EU 2')
plt.legend(title='EU Countries', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Deep Learning Sentiment Analysis

In [18]:
from transformers import AutoTokenizer, pipeline

# Checkpoint name shared by the classification pipeline and the tokenizer
_SENTIMENT_CHECKPOINT = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Load the pre-trained sentiment pipeline and the matching tokenizer
sentiment_analysis = pipeline(task="sentiment-analysis", model=_SENTIMENT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:

# Define a function to split the text into chunks
def split_text_into_chunks(text, tokenizer, max_length):
    """Split ``text`` into consecutive chunks built from tokenizer tokens.

    The text is tokenized with ``tokenizer.tokenize`` and the tokens are
    packed greedily, in order. A chunk is closed as soon as adding the next
    token would push its size past ``max_length``.

    Parameters
    ----------
    text
        Input passed to ``tokenizer.tokenize``.
    tokenizer
        Object with a ``tokenize(text)`` method returning token strings. If
        it also has ``convert_tokens_to_string``, that method turns each
        chunk's tokens back into text; otherwise the tokens are joined with
        single spaces.
    max_length : int
        Size budget per chunk. Size is the summed ``len()`` of the token
        strings, i.e. it counts characters of the tokens, not the number of
        tokens. A single token longer than the budget gets a chunk of its own.

    Returns
    -------
    list of str
        Chunk texts in original order; empty when tokenization yields no
        tokens. No chunk is ever empty.
    """
    # Prefer the tokenizer's own detokenizer so subword markers are turned
    # back into normal text instead of leaking into the chunk.
    tokens_to_text = getattr(tokenizer, "convert_tokens_to_string", None)
    if tokens_to_text is None:
        tokens_to_text = " ".join

    chunks = []
    current_chunk = []
    current_length = 0

    for token in tokenizer.tokenize(text):
        # Flush only a non-empty chunk: an oversized token arriving while the
        # chunk is empty must not produce an empty "" chunk.
        if current_chunk and current_length + len(token) > max_length:
            chunks.append(tokens_to_text(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(token)
        current_length += len(token)

    # Emit whatever is left after the last flush
    if current_chunk:
        chunks.append(tokens_to_text(current_chunk))

    return chunks

# Define a function to perform sentiment analysis on the entire text
def analyze_sentiment(text, tokenizer, sentiment_analysis, max_length):
    """Classify ``text`` chunk by chunk and return all predictions in order.

    The text is cut up by ``split_text_into_chunks(text, tokenizer,
    max_length)``; ``sentiment_analysis`` is then called once per chunk and
    the items of each returned result are gathered into one flat list.

    Returns
    -------
    list
        Every item produced by ``sentiment_analysis`` across the chunks, in
        chunk order; empty when the text produced no chunks.
    """
    return [
        prediction
        for piece in split_text_into_chunks(text, tokenizer, max_length)
        for prediction in sentiment_analysis(piece)
    ]

# Placeholder sample input left from an example; none of the cells shown here
# pass it to the sentiment functions.
text = "Your long text goes here..."

# Chunk-size budget. NOTE(review): get_sentiment_result below passes its own
# literal 514 rather than reading this name, so editing this value alone does
# not change the chunking; confirm which of the two is meant to be used.
max_length = 514


def ave_sent(sent_results):
    """Return the arithmetic mean of the ``'score'`` values in ``sent_results``.

    Parameters
    ----------
    sent_results
        Iterable of mappings, each with a ``'score'`` entry.

    Returns
    -------
    float
        Mean of the scores, or ``nan`` when ``sent_results`` is empty (for
        example when a text produced no chunks), so one empty document does
        not abort a whole batch with ``ZeroDivisionError``.

    Notes
    -----
    Only ``'score'`` is read. NOTE(review): if each entry also carries a
    predicted label, it is ignored here, so the value is an average of scores
    irrespective of label; confirm that is the intended measure.
    """
    scores = [result['score'] for result in sent_results]
    if not scores:
        return float('nan')
    return sum(scores) / len(scores)

def get_sentiment_result(text):
    """Return the averaged sentiment score for one text.

    Runs ``analyze_sentiment`` with the module-level ``tokenizer`` and
    ``sentiment_analysis`` objects and a chunk budget of 514, then reduces the
    per-chunk results to a single number with ``ave_sent``.
    """
    per_chunk_results = analyze_sentiment(
        text,
        tokenizer,
        sentiment_analysis,
        max_length=514,
    )
    return ave_sent(per_chunk_results)

In [None]:
from tqdm.notebook import tqdm

# Register progress_apply on pandas objects, then score each entry of the
# 'text' column while showing a progress bar.
tqdm.pandas()
ndc_sentiment = df_ndc['text'].progress_apply(get_sentiment_result)
df_ndc['sentiment'] = ndc_sentiment

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/158 [00:00<?, ?it/s]