Mount you google drive account to execute in **Google Colab**

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/My\ Drive/RUN_dataset/

Mounted at /content/drive
/content/drive/My Drive/RUN_dataset


Load training set dataset and test set

In [2]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder

TitleStyle = ["objective", "subjective", "unknown"]
TitleStance = ["agree", "disagree", "unrelated"]

def load_data(file, with_features):
    df = pd.read_json(file, 'index')
    df = df.rename(columns={"VALUE_ACUERDO": "label", "TITLE": 'text_a', 'TEXT': 'text_b'})
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'].values)    
    labels = df['label']
    text_a = df['text_a']
    df['text_b'] = df['text_b'].replace(r'<PARAGRAPH>', '', regex=True)
    df['text_b'] = df['text_b'].replace('<TEXT>', '', regex=True)
    df['text_b'] = df['text_b'].replace('<BREAK>', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'\n\n\n\n', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'\n\n', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'\n', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'    ', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'   ', ' ', regex=True)
    df['text_b'] = df['text_b'].replace(r'  ', ' ', regex=True)

    text_b = df['text_b']

    reduce = [ "TitleStyleObjective",
                "TitleStyleSubjective",
                "TitleStyleUnknown",
                "TitleTitle-StanceAgree",
                "TitleTitle-StanceDisagree",
                "TitleTitle-StanceUnrelated"]
    features = ["Title",
                "Subtitle",
                "Lead",
                "Body",
                "Conclusion",
                "What",
                "WhatReliabilityReliable",
                "WhatReliabilityUnreliable",
                "WhatLack-Of-InformationYes",               
                "WhatMain-Event",
                "Who",
                "WhoReliabilityReliable",
                "WhoReliabilityUnreliable",
                "WhoLack-Of-InformationYes",                
                "When",
                "WhenReliabilityReliable",
                "WhenReliabilityUnreliable",
                "WhenLack-Of-InformationYes",                
                "Where",
                "WhereReliabilityReliable",
                "WhereReliabilityUnreliable",
                "WhereLack-Of-InformationYes",                
                "Why",
                "WhyReliabilityReliable",
                "WhyReliabilityUnreliable",
                "WhyLack-Of-InformationYes",                
                "How",
                "HowReliabilityReliable",
                "HowReliabilityUnreliable",
                "HowLack-Of-InformationYes",                
                "Quote",
                "QuoteAuthor-StanceAgree",
                "QuoteAuthor-StanceDisagree",
                "QuoteAuthor-StanceUnknown",
                "WhoRoleSubject",
                "WhoRoleTarget",
                "WhoRoleBoth",
                "Key-Expression",
                "Orthotypography",
                "Figure",
                ]

    df = pd.json_normalize(df['DATA'])
    df_1 = df[reduce]
    conditions_stance = [
        (df_1['TitleTitle-StanceAgree'] == 1),
        (df_1['TitleTitle-StanceDisagree'] == 1),
        (df_1['TitleTitle-StanceUnrelated'] == 1)
    ]
    conditions_style = [
        (df_1['TitleStyleObjective'] == 1),
        (df_1['TitleStyleSubjective'] == 1),
        (df_1['TitleStyleUnknown'] == 1)
    ]

    # create a new column and use np.select to assign values to it using our lists as arguments
    df_1['stance'] = np.select(conditions_stance, TitleStance)
    df_1['style'] = np.select(conditions_style, TitleStyle)
    df_1 = df_1.drop(columns=reduce, axis=1)

    # encode columns stance and style
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df_1['stance'].values)
    df_1['stance'] = integer_encoded
    integer_encoded = label_encoder.fit_transform(df_1['style'].values)
    df_1['style'] = integer_encoded

    
    #other features
    df_2 = df[features]
    df_2.head()
    
    if with_features: 
      df = pd.concat([text_a, text_b, labels, df_1, df_2], axis=1)
    else:
      df = pd.concat([text_a, text_b, labels], axis=1)    
    return df
    
# True if you use features
df_train = load_data("training_set.json", True)
df_test = load_data("test_set.json", True)


df_train.head()


  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,text_a,text_b,label,stance,style,Title,Subtitle,Lead,Body,Conclusion,...,Quote,QuoteAuthor-StanceAgree,QuoteAuthor-StanceDisagree,QuoteAuthor-StanceUnknown,WhoRoleSubject,WhoRoleTarget,WhoRoleBoth,Key-Expression,Orthotypography,Figure
0,Preocupación entre las personas vacunadas con ...,Tenían que empezar a pasar cosas como estás y...,1,1,2,1,0,1,1,1,...,5,0,0,5,5,5,1,4,0,0
1,La mejor dieta del mundo: Adelgazar comiendo j...,Esta más que apetecible dieta permite perder ...,1,1,2,1,1,1,1,1,...,0,0,0,0,2,3,0,1,0,3
2,Qué curioso lo del Reino Unido: Son los primer...,Más información sospechosa sobre la vacuna. C...,1,1,2,1,0,1,1,1,...,0,0,0,0,5,2,1,9,0,1
3,Colombia recibió este sábado nuevo lote de 280...,"Bogotá, 4 de abril de 2021. A través de su cu...",0,1,1,1,0,1,1,0,...,2,0,0,2,9,4,2,0,0,5
4,Cataluña citará masivamente a los mayores de 7...,En algunas zonas ya se ha comenzado a vacunar...,0,1,1,1,1,1,1,0,...,3,0,0,3,8,6,0,0,0,4


Merge Title and body text in a new column 

In [7]:
# Train dataset
df_train.insert(0,"text", list(df_train["text_a"] + " " + df_train["text_b"]))

# Test dataset
df_test.insert(0,"text", list(df_test["text_a"] + " " + df_test["text_b"]))


Preprocess text to remove emojis, symbols & pictographs, URL, etc

In [8]:
import re
def de_emojify(text):
    regrex_pattern = re.compile(pattern="["
                                        u"\U0001F600-\U0001F92F"  # emoticons
                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                        u"\U00002702-\U000027B0"
                                        u"\U000024C2-\U0001F251"
                                        u"\U0001F190-\U0001F1FF"
                                        u"\U0001F926-\U0001FA9F"                                        
                                        u"\u2640-\u2642"
                                        u"\u2600-\u2B55"
                                        u"\u200d"
                                        u"\u23cf"
                                        u"\u23e9"
                                        u"\u231a"
                                        u"\ufe0f"                                        
                                        "]+", flags=re.UNICODE)
    return regrex_pattern.sub(r'', text)

def preprocess(value):
    new_value = de_emojify(value)
    new_value = re.sub(r'http\S+', '', new_value)
    return new_value

# Train dataset
df_train["text"] = df_train["text"].str.lower() 
df_train["text"] = df_train.text.apply(preprocess)
df_train.head()

# Test dataset
df_test["text"] = df_test["text"].str.lower() 
df_test["text"] = df_test.text.apply(preprocess)

Convert text to TF-IDF vectors

In [9]:
from sklearn.feature_extraction.text import  TfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems


# create the transform
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=1000, use_idf=True,
                            norm='l2', stop_words=stopwords.words("spanish"), tokenizer=tokenize)

# tokenize and build vocab
tfidf.fit(df_train.text)


vector = tfidf.transform(df_train.text).todense()
new_cols = tfidf.get_feature_names_out()

# remove the text column as the word 'text' may exist in the words and you'll get an error
df_train = df_train.drop('text_a',axis=1)
df_train = df_train.drop('text_b',axis=1)
df_train = df_train.drop('text',axis=1)

# join the tfidf values to the existing dataframe training set
df_train = df_train.join(pd.DataFrame(vector, columns=new_cols))


vector_test = tfidf.transform(df_test.text).todense()
new_cols_test = tfidf.get_feature_names_out()
# remove the text column as the word 'text' may exist in the words and you'll get an error
df_test = df_test.drop('text_a',axis=1)
df_test = df_test.drop('text_b',axis=1)
df_test = df_test.drop('text',axis=1)


# join the tfidf values to the existing dataframe test set
df_test = df_test.join(pd.DataFrame(vector_test, columns=new_cols))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  "The parameter 'token_pattern' will not be used"
  % sorted(inconsistent)


Training the Logistic Regression model and computing metrics

In [10]:
from sklearn.metrics import precision_score, accuracy_score,f1_score,recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression


models = [LogisticRegression(random_state=0)]

# Train
y_train = df_train.iloc[:,0]
x_train = df_train.iloc[:,1:]

# Test 
y_test = df_test.iloc[:,0]
x_test = df_test.iloc[:,1:]
  
for model in models:                
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)
      
  print(model)
  print(accuracy_score(y_test, y_pred.round()))
  print(f1_score(y_test, y_pred.round(), average='macro'))
  print(confusion_matrix(y_test, y_pred.round()))

LogisticRegression(random_state=0)
0.95
0.949874686716792
[[ 9  1]
 [ 0 10]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
