## Imports

In [1]:
from datetime import datetime

import nltk
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from nltk.corpus import stopwords
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

In [2]:
# Enable tqdm's pandas integration for the rest of the notebook.
# NOTE(review): no progress_apply/progress_map call is visible in this notebook — confirm it is still needed.
tqdm.pandas()

In [3]:
import sys
sys.path.append('../src/')
from models.ClassificationPipeline import ClassificationPipeline 

In [4]:
# Fetch the NLTK stop-word lists; stopwords.words('portuguese') is used later
# when the TF-IDF vectorizer is configured.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/semcovici/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def get_classification_report(y_test, y_pred):
    '''Return sklearn's classification report as a DataFrame sorted by f1-score.

    The report is requested as a dictionary, turned into a DataFrame and
    transposed so that each report entry becomes one row, then ordered from
    the highest to the lowest ``f1-score``.

    Parameters
    ----------
    y_test : true labels, forwarded to ``classification_report``.
    y_pred : predicted labels, forwarded to ``classification_report``.

    Returns
    -------
    pandas.DataFrame
        One row per report entry, sorted by ``f1-score`` in descending order.

    Source: https://stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-tab-delimited-format
    '''
    metrics_by_entry = classification_report(y_test, y_pred, output_dict=True)
    return (
        pd.DataFrame(metrics_by_entry)
        .transpose()
        .sort_values(by=['f1-score'], ascending=False)
    )

## definitions

In [6]:
# Seed shared by the train/test splits and the XGBoost classifier so runs are repeatable.
random_seed = 42

In [7]:
# Folder locations. Every consumer builds file paths by plain string
# concatenation (path + file name), so each value MUST end with a separator.
path_raw_data = '../data/raw/'
path_processed_data = '../data/processed/'
# Fixed: the trailing '/' was missing, which produced files named
# '../reports/classification_reportclassification_report_*.xlsx'.
# NOTE(review): the folder '../reports/classification_report/' must exist before saving.
path_results_cr = '../reports/classification_report/'

In [8]:
# Corpus identifiers; each one is interpolated into the input file name
# 'train_r3_{corpus}_top_mentioned_timelines.csv'.
list_corpus = ['ig','bo', 'cl', 'co', 'gl', 'lu']

## data 

### Classification by corpus

In [9]:
def process_classification_by_corpus(
    text_vect,
    sampling,
    scaling,
    classifier,
    list_corpus,
    selection = None
):
    '''Train and evaluate one classification pipeline per corpus.

    For each corpus name the function reads
    ``train_r3_{corpus}_top_mentioned_timelines.csv`` from the module-level
    ``path_raw_data`` folder, encodes ``Polarity`` ('against' -> 0, 'for' -> 1),
    makes a stratified 80/20 train/test split seeded with the module-level
    ``random_seed``, trains a ``ClassificationPipeline`` on the training part
    and builds a classification report on the held-out part.

    Parameters
    ----------
    text_vect : forwarded to ``ClassificationPipeline`` as ``vectorizer``.
    sampling : forwarded as ``sampling`` (callers in this notebook pass None).
    scaling : forwarded as ``scaling``.
    classifier : forwarded as ``estimator``.
    list_corpus : iterable of str
        Corpus identifiers used to build the input file names.
    selection : optional
        Forwarded as ``selection``; defaults to None.

    Returns
    -------
    pandas.DataFrame
        The per-corpus reports stacked vertically, with the report entry name
        in ``class`` and the corpus identifier in ``corpus``. The row index of
        each individual report is kept as-is. An empty DataFrame is returned
        when ``list_corpus`` is empty.

    Raises
    ------
    ValueError
        If ``Polarity`` holds a value other than 'against' or 'for' (including
        missing values), which would otherwise turn into NaN labels silently.

    NOTE(review): the same component objects are handed to every pipeline;
    whether ``ClassificationPipeline`` clones them is not visible here.
    '''
    label_encoding = {'against': 0, 'for': 1}

    # Collect the reports and concatenate once at the end instead of growing
    # a DataFrame inside the loop.
    reports = []
    for corpus in list_corpus:

        print(f'##### START of {corpus} - {datetime.now()} #####')

        ### read data
        print('Reading data')
        # A previous version read path_processed_data + f'train_r3_{corpus}_filtered.csv'.
        data = pd.read_csv(
            path_raw_data + f'train_r3_{corpus}_top_mentioned_timelines.csv',
            sep = ';',
            encoding='utf-8-sig'
            )

        print('Formating data')
        X = data['Texts']
        y = data.Polarity
        # encode label
        y_encoded = y.map(label_encoding)
        unmapped = y_encoded.isna()
        if unmapped.any():
            # Fail early with a clear message rather than training on NaN labels.
            raise ValueError(
                f'Unexpected Polarity values in corpus {corpus!r}: '
                f'{sorted(y[unmapped].astype(str).unique())}'
            )
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y_encoded,
            test_size=0.2,
            random_state=random_seed,
            stratify=y_encoded
            )

        # defining model
        clf_pipe = ClassificationPipeline(
            vectorizer=text_vect,
            sampling = sampling,
            scaling =scaling,
            selection = selection,
            estimator= classifier
        )

        print('Training model')
        clf_pipe.train(X_train, y_train)

        print('Predict test')
        # predict() returns (labels, probabilities); only the labels are scored here.
        y_pred, _ = clf_pipe.predict(X_test)

        df_classification_report = get_classification_report(y_test, y_pred)

        df_classification_report = df_classification_report.reset_index().rename(columns = {"index": "class"})

        df_classification_report['corpus'] = corpus

        reports.append(df_classification_report)

        print(f'##### END of {corpus} - {datetime.now()} #####\n\n\n\n\n')

    # pd.concat refuses an empty list, so keep the empty-frame result for no corpora.
    if not reports:
        return pd.DataFrame({})

    return pd.concat(reports)

#### without selection

In [12]:
##########################
# Definitions 
##########################

# Baseline run: default TF-IDF features, no sampling/scaling, and a dummy
# classifier as the reference score for the other experiments.
text_vect = TfidfVectorizer()
sampling = None
scaling = None

classifier = DummyClassifier()

##########################
# Process 
##########################

df_results = process_classification_by_corpus(
    text_vect,
    sampling,
    scaling,
    classifier,
    list_corpus
)

# display() renders the frame itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
display(df_results[df_results['class'] == 'macro avg'].sort_values('f1-score'))

# path_results_cr is concatenated as-is, so it must end with a path separator.
df_results.to_excel(path_results_cr + 'classification_report_dummy_clf.xlsx', index = False)

##### START of ig - 2024-04-08 23:22:18.246503 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=  29.7s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### END of ig - 2024-04-08 23:23:04.126244 #####





##### START of bo - 2024-04-08 23:23:04.126307 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=   9.5s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### END of bo - 2024-04-08 23:23:18.796619 #####





##### START of cl - 2024-04-08 23:23:18.796670 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=  47.4s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### END of cl - 2024-04-08 23:24:31.984457 #####





##### START of co - 2024-04-08 23:24:31.984529 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=  55.4s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### END of co - 2024-04-08 23:26:00.742554 #####





##### START of gl - 2024-04-08 23:26:00.742609 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=  22.6s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


##### END of gl - 2024-04-08 23:26:35.547197 #####





##### START of lu - 2024-04-08 23:26:35.547277 #####
Reading data
Formating data
Training model
[Pipeline] ........ (step 1 of 5) Processing vectorizer, total=  16.5s
[Pipeline] .......... (step 2 of 5) Processing sampling, total=   0.0s
[Pipeline] ........... (step 3 of 5) Processing scaling, total=   0.0s
[Pipeline] ......... (step 4 of 5) Processing selection, total=   0.0s
[Pipeline] ......... (step 5 of 5) Processing estimator, total=   0.0s
Predict test
##### END of lu - 2024-04-08 23:27:00.359828 #####







  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,class,precision,recall,f1-score,support,corpus
3,macro avg,0.250725,0.5,0.333977,345.0,cl
3,macro avg,0.262195,0.5,0.344,164.0,lu
3,macro avg,0.271552,0.5,0.351955,464.0,co
3,macro avg,0.281944,0.5,0.360568,360.0,ig
3,macro avg,0.295547,0.5,0.371501,247.0,gl
3,macro avg,0.433628,0.5,0.464455,113.0,bo


None


#### with selection

In [11]:
##########################
# Definitions 
##########################

text_vect = TfidfVectorizer(
            stop_words = stopwords.words('portuguese'),
            lowercase = True,
            ngram_range = (1,3)
            )
sampling = None

# NOTE(review): SelectPercentile's `percentile` is expressed in percent (0-100),
# so .3 keeps 0.3% of the features, not 30% — confirm this is the intended value.
percentile = .3
selection = SelectPercentile(percentile=percentile)
scaling = MaxAbsScaler()

classifier = XGBClassifier(
            random_state = random_seed,
            verbosity = 0,
            device = 'cuda',
            tree_method = 'hist'
            )

##########################
# Process 
##########################

# Fixed: `selection` was created above but never handed to the function, so
# this "with selection" run silently trained without any feature selection.
df_results = process_classification_by_corpus(
    text_vect = text_vect,
    sampling = sampling,
    scaling = scaling,
    classifier = classifier,
    list_corpus = list_corpus,
    selection = selection
)

display(df_results[df_results['class'] == 'macro avg'].sort_values('f1-score'))

# Build the file-name tag outside the f-string: reusing the same quote type
# inside a replacement field is a SyntaxError before Python 3.12.
percentile_tag = str(percentile).replace('.', ',')
df_results.to_excel(path_results_cr + f'classification_report_tfidf_SelectPercentile_{percentile_tag}.xlsx', index = False)

##### START of ig - 2024-04-08 19:27:03.033742 #####
Reading data
Formating data
Training model


### classification with all corpora 

In [None]:
# Join all corpora into one frame, tagging every row with its corpus identifier.
# The frames are collected in a list and concatenated once, instead of growing
# df_all inside the loop starting from an empty DataFrame.
corpus_frames = []

for corpus in tqdm(list_corpus):
    
    data = pd.read_csv(
        path_raw_data + f'train_r3_{corpus}_top_mentioned_timelines.csv', 
        sep = ';', 
        encoding='utf-8-sig'
        )
    
    data['corpus'] = corpus
    
    corpus_frames.append(data)

# pd.concat refuses an empty list; keep an empty frame when there are no corpora.
# The original row index of each file is kept, so index values repeat across corpora.
df_all = pd.concat(corpus_frames) if corpus_frames else pd.DataFrame({})

In [None]:
# Show the concatenated frame as a quick visual check of the join.
df_all

#### (without indication of corpus)

In [None]:
# Single model trained on the texts of all corpora together (no corpus feature).
# text_vect, sampling, scaling and classifier are not defined in this cell:
# they are whatever the most recently executed experiment cell left behind.
print('Formating data')
X, y = df_all['Texts'], df_all.Polarity
# Map the two stance labels to integers.
y_encoded = y.map({'against': 0, 'for': 1})
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=random_seed,
)

# Assemble the pipeline from the shared components.
clf_pipe = ClassificationPipeline(
    estimator=classifier,
    scaling=scaling,
    sampling=sampling,
    vectorizer=text_vect,
)

print('Training model')
clf_pipe.train(X_train, y_train)

print('Predict test')
y_pred, y_pred_proba = clf_pipe.predict(X_test)

# Report with the entry name exposed as a regular "class" column.
df_classification_report = (
    get_classification_report(y_test, y_pred)
    .reset_index()
    .rename(columns={"index": "class"})
)

In [None]:
# Show the report computed in the previous cell.
df_classification_report

In [None]:
# Save the report; path_results_cr is concatenated as-is, so it must end with a path separator.
df_classification_report.to_excel(path_results_cr + 'classification_report_tfidf_all_corpora_without_corpus_tag.xlsx', index = False)

#### (with indication of corpus)

In [None]:
# Single model trained on all corpora, with the corpus identifier as an extra feature.
# text_vect, sampling, scaling and classifier are not defined in this cell:
# they are whatever the most recently executed experiment cell left behind.
print('Formating data')
X = df_all[['Texts', 'corpus']]
y = df_all.Polarity
# encode label
y_encoded = y.map({'against': 0, 'for': 1})
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y_encoded, 
    test_size=0.2, 
    random_state=random_seed, 
    stratify=y_encoded
    )

# Fixed: 'corpus' holds strings ('ig', 'bo', ...), so passing it through next to
# the sparse TF-IDF block fails when ColumnTransformer stacks the outputs
# (every block must be numeric). One-hot encode it instead.
preprocessor = ColumnTransformer(
    transformers=[
        ('Texts', text_vect , 'Texts'),
        ('corpus', OneHotEncoder(handle_unknown='ignore'), ['corpus'])
    ]
)

# defining model
clf_pipe = ClassificationPipeline(
    vectorizer=preprocessor,
    sampling = sampling,
    scaling =scaling, 
    estimator= classifier
)

print('Training model')
clf_pipe.train(X_train, y_train)

print('Predict test')
y_pred, y_pred_proba = clf_pipe.predict(X_test)

df_classification_report = get_classification_report(y_test, y_pred)

df_classification_report = df_classification_report.reset_index().rename(columns = {"index": "class"})

In [None]:
# Save the report; path_results_cr is concatenated as-is, so it must end with a path separator.
df_classification_report.to_excel(path_results_cr + 'classification_report_tfidf_all_corpora_with_corpus_tag.xlsx', index = False)

### With a feature indicating the number of texts

In [None]:
# data['n_comments'] = data.Texts.apply(lambda x: len(x.split(' # ')))
# data

# X = data[['Texts','n_comments']]
# y = data.Polarity
# y_encoded = y.map({'against': 0, 'for': 1})
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


# text_vect = TfidfVectorizer(
#             stop_words = stopwords.words('portuguese'),
#             lowercase = True,
#             ngram_range = (1,3),
#             max_features=30000
            
#             )
# sampling = RandomOverSampler(random_state=random_seed)
# scaling = MaxAbsScaler()

# classifier = XGBClassifier(
#             random_state = 42,
#             verbosity = 3,
#             # device = 'cuda',
#             # tree_method = 'hist'
#             )


# preprocessor = ColumnTransformer(
#     transformers=[
#         ('Texts', text_vect , 'Texts'),
#         ('n_comments', 'passthrough',['n_comments'])
#     ]
# )

# clf_pipe = ClassificationPipeline(
#     vectorizer=preprocessor,
#     sampling = sampling,
#     scaling =scaling, 
#     estimator= classifier
# )

# clf_pipe.train(X_train, y_train)

# y_pred, y_pred_proba = clf_pipe.predict(X_test)

# df_classification_report = get_classification_report(y_test, y_pred)

# df_classification_report