In [None]:
# Mount Google Drive inside the Colab VM; force_remount=True asks for a fresh mount
# on every run of this cell.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Move into the project folder so the relative "dataset.json" path used by load_data resolves.
# NOTE(review): the path is relative, so this presumably expects the cell to start in /content.
%cd drive/My\ Drive/paper_RUN_AS/

Mounted at /content/drive
/content/drive/My Drive/paper_RUN_AS


In [None]:
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Category names for the title annotations. Order matters: load_data() passes each list
# to np.select as the choice list, so entry i is paired with condition i there
# (Objective/Subjective/Unknown and Agree/Disagree/Unrelated respectively).
TitleStyle = ["objective", "subjective", "unknown"]
TitleStance = ["agree", "disagree", "unrelated"]

def load_data(file, with_features):
    """Read the annotated dataset from JSON and flatten it into one DataFrame.

    The file is parsed with ``orient='index'``. The columns ``VALUE_ACUERDO``,
    ``TITLE`` and ``TEXT`` are renamed to ``label``, ``text_a`` and ``text_b``.
    Newlines are stripped from both text columns; in the body the ``<TEXT>`` and
    ``<PARAGRAPH>`` tags are removed and ``<BREAK>`` becomes a single space.

    Args:
        file: Path or buffer accepted by ``pd.read_json``.
        with_features: When true, the nested ``DATA`` column is expanded and the
            encoded title ``stance``/``style`` plus the annotation columns listed
            in ``features`` are appended. When false, ``DATA`` is not read at all.

    Returns:
        DataFrame with ``text_a``, ``text_b`` and ``label`` and, if requested,
        ``stance``, ``style`` and the annotation feature columns, all sharing the
        index of the parsed JSON.
    """
    # `orient` is passed by keyword: it is keyword-only in current pandas releases.
    raw = pd.read_json(file, orient='index')
    raw = raw.rename(columns={"VALUE_ACUERDO": "label", "TITLE": 'text_a', 'TEXT': 'text_b'})
    labels = raw['label']
    text_a = raw['text_a'].replace(r'\n', '', regex=True)
    text_b = (
        raw['text_b']
        .replace(r'<TEXT>', '', regex=True)
        .replace(r'<BREAK>', ' ', regex=True)
        .replace(r'<PARAGRAPH>', '', regex=True)
        .replace(r'\n', '', regex=True)
    )

    if not with_features:
        return pd.concat([text_a, text_b, labels], axis=1)

    # One-hot style title flags that get collapsed into the two columns stance/style.
    title_flag_columns = ["TitleStyleObjective",
                          "TitleStyleSubjective",
                          "TitleStyleUnknown",
                          "TitleTitle-StanceAgree",
                          "TitleTitle-StanceDisagree",
                          "TitleTitle-StanceUnrelated"]

    features = ["Title",
                "Subtitle",
                "Lead",
                "Body",
                "Conclusion",
                "What",
                "WhatReliabilityReliable",
                "WhatReliabilityUnreliable",
                "WhatLack-Of-InformationYes",
                "WhatMain-Event",
                "Who",
                "WhoReliabilityReliable",
                "WhoReliabilityUnreliable",
                "WhoLack-Of-InformationYes",
                "When",
                "WhenReliabilityReliable",
                "WhenReliabilityUnreliable",
                "WhenLack-Of-InformationYes",
                "Where",
                "WhereReliabilityReliable",
                "WhereReliabilityUnreliable",
                "WhereLack-Of-InformationYes",
                "Why",
                "WhyReliabilityReliable",
                "WhyReliabilityUnreliable",
                "WhyLack-Of-InformationYes",
                "How",
                "HowReliabilityReliable",
                "HowReliabilityUnreliable",
                "HowLack-Of-InformationYes",
                "Quote",
                "QuoteAuthor-StanceAgree",
                "QuoteAuthor-StanceDisagree",
                "QuoteAuthor-StanceUnknown",
                "WhoRoleSubject",
                "WhoRoleTarget",
                "WhoRoleBoth",
                "Key-Expression",
                "Orthotypography",
                "Figure",
                ]

    # Expand the nested annotation dicts and give the result the same index as the
    # text columns, so the final concat pairs rows by document even when the JSON
    # keys are not 0..n-1 in file order.
    annotations = pd.json_normalize(raw['DATA'].tolist())
    annotations.index = raw.index

    title_flags = annotations[title_flag_columns]
    conditions_stance = [
        (title_flags['TitleTitle-StanceAgree'] == 1),
        (title_flags['TitleTitle-StanceDisagree'] == 1),
        (title_flags['TitleTitle-StanceUnrelated'] == 1)
    ]
    conditions_style = [
        (title_flags['TitleStyleObjective'] == 1),
        (title_flags['TitleStyleSubjective'] == 1),
        (title_flags['TitleStyleUnknown'] == 1)
    ]

    # Rows where no flag equals 1 receive the placeholder "0". Older NumPy produced the
    # same value by coercing the implicit integer default; newer releases refuse to mix
    # string choices with an integer default, hence the explicit string.
    stance = np.select(conditions_stance, TitleStance, default="0")
    style = np.select(conditions_style, TitleStyle, default="0")

    # Integer-encode both categories. LabelEncoder numbers the values that actually occur
    # in sorted order, so the codes depend on which categories are present in the file.
    # Building a fresh frame (instead of assigning into a slice of `annotations`) avoids
    # pandas' SettingWithCopyWarning.
    title_features = pd.DataFrame(
        {
            'stance': LabelEncoder().fit_transform(stance),
            'style': LabelEncoder().fit_transform(style),
        },
        index=annotations.index,
    )

    # Remaining annotation columns are passed through unchanged.
    other_features = annotations[features]

    return pd.concat([text_a, text_b, labels, title_features, other_features], axis=1)

# Load the dataset together with the title stance/style codes and the annotation features.
df = load_data("dataset.json", with_features=True)
df


  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,text_a,text_b,label,stance,style,Title,Subtitle,Lead,Body,Conclusion,...,Quote,QuoteAuthor-StanceAgree,QuoteAuthor-StanceDisagree,QuoteAuthor-StanceUnknown,WhoRoleSubject,WhoRoleTarget,WhoRoleBoth,Key-Expression,Orthotypography,Figure
0,Los ejercicios que debes hacer cuando tienes o...,Aunque hay quienes piensan que es solo una afe...,1,0,0,1,0,1,1,1,...,0,0,0,0,1,2,1,2,1,0
1,¿Es recomendable utilizar dos mascarillas a la...,"Según los expertos, si son homologadas solo e...",1,0,0,1,1,1,1,1,...,0,0,0,0,6,1,1,0,0,0
2,160 MÉDICOS REUNIDOS EN LONDRES DESAPRUEBAN VA...,Por Mente Alternativa Publicado 29 de mayo de...,0,0,1,1,0,1,1,0,...,0,0,0,0,6,2,5,1,1,1
3,53 muertos en Gibraltar en 10 días después de ...,Fuente original: https://healthimpactnews.com...,0,2,1,1,0,1,1,1,...,8,0,2,6,10,11,0,7,13,7
4,¡Cómo limpiar tus riñones!,COMO LIMPIAR TUS RIÑONES LIMPIA TUS RIÑONES P...,0,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,3,10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,¿Realmente es más eficaz llevar dos mascarillas?,"04/04/2021-20:03:57""Siempre y cuando se coloqu...",1,0,0,1,1,1,1,1,...,2,1,0,1,4,2,2,0,0,0
76,Los principales efectos de las vacunas detecta...,La mayoría de las notificaciones corresponden ...,1,0,0,1,1,1,1,1,...,1,0,0,1,5,1,0,0,0,7
77,Los enjuagues bucales podrían reducir el riesg...,Una investigación asegura que el uso podría b...,1,0,0,1,1,1,1,1,...,1,1,0,0,4,1,0,0,1,0
78,Esto es lo que le pasa a tu cuerpo si bebes al...,Un estudio de la Asociación Americana del Cor...,1,0,0,1,1,1,1,1,...,1,0,0,1,5,2,1,0,0,2


In [None]:
# Concatenate title and body into a single "text" column placed first in the frame.
# A "text" column left over from an earlier run of this cell is dropped first, so the
# cell can be re-executed without DataFrame.insert raising on the duplicate name.
if "text" in df.columns:
    df = df.drop(columns="text")
df.insert(0, "text", list(df["text_a"] + " " + df["text_b"]))
df.head()

Unnamed: 0,text,text_a,text_b,label,stance,style,Title,Subtitle,Lead,Body,...,Quote,QuoteAuthor-StanceAgree,QuoteAuthor-StanceDisagree,QuoteAuthor-StanceUnknown,WhoRoleSubject,WhoRoleTarget,WhoRoleBoth,Key-Expression,Orthotypography,Figure
0,Los ejercicios que debes hacer cuando tienes o...,Los ejercicios que debes hacer cuando tienes o...,Aunque hay quienes piensan que es solo una afe...,1,0,0,1,0,1,1,...,0,0,0,0,1,2,1,2,1,0
1,¿Es recomendable utilizar dos mascarillas a la...,¿Es recomendable utilizar dos mascarillas a la...,"Según los expertos, si son homologadas solo e...",1,0,0,1,1,1,1,...,0,0,0,0,6,1,1,0,0,0
2,160 MÉDICOS REUNIDOS EN LONDRES DESAPRUEBAN VA...,160 MÉDICOS REUNIDOS EN LONDRES DESAPRUEBAN VA...,Por Mente Alternativa Publicado 29 de mayo de...,0,0,1,1,0,1,1,...,0,0,0,0,6,2,5,1,1,1
3,53 muertos en Gibraltar en 10 días después de ...,53 muertos en Gibraltar en 10 días después de ...,Fuente original: https://healthimpactnews.com...,0,2,1,1,0,1,1,...,8,0,2,6,10,11,0,7,13,7
4,¡Cómo limpiar tus riñones! COMO LIMPIAR TUS R...,¡Cómo limpiar tus riñones!,COMO LIMPIAR TUS RIÑONES LIMPIA TUS RIÑONES P...,0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,3,10,0


In [None]:
import re
def de_emojify(text):
    """Return ``text`` with every run of characters from the emoji class removed.

    The character class is assembled from the ranges listed below. Note that the
    range U+24C2..U+1F251 is very wide: besides pictographs it also covers other
    scripts located in that span (CJK ideographs, for example), which are removed too.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F92F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
        "\U0001F190-\U0001F1FF",
        "\U0001F926-\U0001FA9F",
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",
    )
    # One bracketed class followed by "+" so consecutive matches are removed as a single run.
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub("", text)

def preprocess(value):
    """Clean one document: remove emoji first, then anything that starts with ``http``.

    The URL pattern deletes ``http`` followed by all non-whitespace characters up to
    the next whitespace.
    """
    without_emoji = de_emojify(value)
    return re.sub(r'http\S+', '', without_emoji)

# Lower-case every document, then strip emoji and URLs from it.
lowercased_text = df["text"].str.lower()
df["text"] = lowercased_text.apply(preprocess)
df.head()

Unnamed: 0,text,text_a,text_b,label,stance,style,Title,Subtitle,Lead,Body,...,Quote,QuoteAuthor-StanceAgree,QuoteAuthor-StanceDisagree,QuoteAuthor-StanceUnknown,WhoRoleSubject,WhoRoleTarget,WhoRoleBoth,Key-Expression,Orthotypography,Figure
0,los ejercicios que debes hacer cuando tienes o...,Los ejercicios que debes hacer cuando tienes o...,Aunque hay quienes piensan que es solo una afe...,1,0,0,1,0,1,1,...,0,0,0,0,1,2,1,2,1,0
1,¿es recomendable utilizar dos mascarillas a la...,¿Es recomendable utilizar dos mascarillas a la...,"Según los expertos, si son homologadas solo e...",1,0,0,1,1,1,1,...,0,0,0,0,6,1,1,0,0,0
2,160 médicos reunidos en londres desaprueban va...,160 MÉDICOS REUNIDOS EN LONDRES DESAPRUEBAN VA...,Por Mente Alternativa Publicado 29 de mayo de...,0,0,1,1,0,1,1,...,0,0,0,0,6,2,5,1,1,1
3,53 muertos en gibraltar en 10 días después de ...,53 muertos en Gibraltar en 10 días después de ...,Fuente original: https://healthimpactnews.com...,0,2,1,1,0,1,1,...,8,0,2,6,10,11,0,7,13,7
4,¡cómo limpiar tus riñones! como limpiar tus r...,¡Cómo limpiar tus riñones!,COMO LIMPIAR TUS RIÑONES LIMPIA TUS RIÑONES P...,0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,3,10,0


In [None]:
from sklearn.feature_extraction.text import  TfidfVectorizer
import nltk
# Download the 'punkt' tokenizer data; nltk.word_tokenize is called in tokenize() below.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')
from nltk.corpus import stopwords
# Download the stop-word lists; the Spanish list is passed to TfidfVectorizer below.
nltk.download('stopwords')

from nltk.stem.porter import PorterStemmer

def tokenize(text):
    """Split ``text`` into word tokens and return the list of their Porter stems.

    Used as the ``tokenizer`` callable of the TF-IDF vectorizer.

    NOTE(review): ``nltk.word_tokenize`` is called with its default language and
    ``PorterStemmer`` is an English stemmer, while the stop-word list used with this
    tokenizer is Spanish. Switching to a Spanish tokenizer/stemmer would change the
    extracted features, so it is left as is here - confirm whether it is intended.
    """
    # A single stemmer instance serves all tokens; creating one per token is wasted work.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]



# TF-IDF over uni- to tri-grams, capped at the 1000 most frequent terms, L2-normalised.
# token_pattern=None states explicitly that the custom tokenizer replaces the default
# regex, which silences the "token_pattern will not be used" warning.
# NOTE(review): the Spanish stop words are not passed through tokenize(), so they are
# compared unstemmed against stemmed tokens (scikit-learn warns about this) - confirm.
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=1000, use_idf=True,
                        norm='l2', stop_words=stopwords.words("spanish"),
                        tokenizer=tokenize, token_pattern=None)

# Learn the vocabulary and vectorise in one pass; toarray() yields a plain ndarray
# rather than the np.matrix returned by todense().
vector = tfidf.fit_transform(df.text).toarray()

new_cols = tfidf.get_feature_names_out()

# Give the TF-IDF frame the same index as df so the join pairs rows by document.
tfidf_df = pd.DataFrame(vector, columns=new_cols, index=df.index)

# Drop the raw text columns: they are not model inputs, and a term such as "text"
# in the vocabulary would otherwise clash with the column of the same name.
df = df.drop(columns=['text_a', 'text_b', 'text'])

# Join the TF-IDF values to the remaining columns. rsuffix only renames a term that
# collides with an existing column (e.g. "label" or "style") instead of raising.
df = df.join(tfidf_df, rsuffix='_tfidf')

df.head()



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


  "The parameter 'token_pattern' will not be used"
  % sorted(inconsistent)


Unnamed: 0,label,stance,style,Title,Subtitle,Lead,Body,Conclusion,What,WhatReliabilityReliable,...,última,último,útil,‘,’,“,”,"” ,",” .,•
0,1,0,0,1,0,1,1,1,10,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0,0,1,1,1,1,1,16,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1,1,0,1,1,0,15,7,...,0.0,0.0,0.0,0.0,0.0,0.061269,0.060292,0.0,0.0,0.0
3,0,2,1,1,0,1,1,1,27,11,...,0.0,0.0,0.0,0.084608,0.0,0.084129,0.068989,0.0,0.01862,0.0
4,0,0,1,1,0,1,1,0,6,0,...,0.0,0.0,0.045475,0.0,0.0,0.0,0.0,0.0,0.0,0.640069


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.metrics import precision_score, accuracy_score,f1_score,recall_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import numpy as np
# Seed NumPy's global RNG for anything that still draws from it.
np.random.seed(8)


# Candidate classifiers compared in the cross-validation below.
# Every estimator with a stochastic component gets an explicit random_state so the
# scores do not depend on the global RNG state or on the order cells were executed.
models = [
          SVC(kernel="linear", C=0.025),
          RandomForestClassifier(random_state=1, max_depth=5),
          # The default iteration cap stopped lbfgs before convergence in the recorded
          # run (ConvergenceWarning), so the cap is raised.
          LogisticRegression(random_state=0, max_iter=1000),
          DecisionTreeClassifier(max_depth=5, random_state=1),
          MLPClassifier(alpha=1, max_iter=1000, random_state=1),
          AdaBoostClassifier(random_state=1),
          GaussianNB(),
          ]


# Shuffled 5-fold cross-validation of every model in `models`; for each model the mean
# accuracy and mean macro-F1 over the folds are printed.
# NOTE(review): the TF-IDF vocabulary and IDF weights were fitted on the full dataset
# before this split, so validation documents influence the features - confirm this is acceptable.
is_cross_validation = True
if is_cross_validation:
    n = 5
    kf = KFold(n_splits=n, random_state=1, shuffle=True)

    # Pick the target by name rather than by column position, and convert to arrays
    # once instead of once per fold.
    y_all = df["label"].values
    x_all = df.drop(columns="label").values

    for model in models:
        accs = []
        f1s = []
        for train_index, val_index in kf.split(x_all):
            x_train, y_train = x_all[train_index], y_all[train_index]
            x_test, y_test = x_all[val_index], y_all[val_index]

            # fit() retrains the estimator from scratch on each fold.
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test)

            accs.append(accuracy_score(y_test, y_pred))
            f1s.append(f1_score(y_test, y_pred, average='macro'))

        # Label the scores so each pair can be attributed to its model.
        print(f"{type(model).__name__}: accuracy={np.mean(accs)}, macro_f1={np.mean(f1s)}")


0.9375
0.9250267379679145
0.9125
0.8983600713012478


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.9125
0.8752896631854676
0.95
0.9482013411425176
0.925
0.9124777183600713
0.95
0.9458110516934047
0.6875
0.5709611403089665
