In [1]:
# import libraries

import os
import pickle

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Root folder that holds the TSV splits and receives the pickled artifacts.
file_path = 'C:\\Users\\GLaDOS\\Documents\\SJ\\SDP\\Tech\\tf-idf_model_train\\'

# Sub-folder (relative to file_path) shared by every data split.
data_dir = 'data_cut_for_deal_status\\'

# Path prefixes of the data splits; "<number>.tsv" must be appended.
train_set_augmented = data_dir + 'train_set_augmented_'
train_set_crop = data_dir + 'train_set_crop_'
val_set = data_dir + 'val_set_'

# File-name prefixes of the saved artifacts;
# "<train-set kind>_<number>.pkl" must be appended.
model_file, vector_file = 'tfidf_logreg_201128_', 'tfidf_vector_201128_'

In [3]:
# Load split 0 (augmented training set and its validation set) for a first look.
first_train_tsv = file_path + train_set_augmented + "0.tsv"
first_val_tsv = file_path + val_set + "0.tsv"

train = pd.read_csv(first_train_tsv, sep='\t')
val = pd.read_csv(first_val_tsv, sep='\t')

In [4]:
# Preview the first rows of the training frame loaded above.
train.head()

Unnamed: 0,body,label
0,Phase 1 (8MW) of the Formosa 1 project began o...,0
1,Formosa 1 is an offshore wind farm developed n...,0
2,The first unit of 1320MW coal-fired thermal Pa...,0
3,The first unit (660 MW) of newly-constructed P...,0
4,The Formosa 1 offshore wind project in Taiwan ...,0


In [5]:
# Count how many training rows carry each label value (class balance check).
train['label'].value_counts()

1    1070
0     806
Name: label, dtype: int64

In [6]:
# Preview the first rows of the validation frame loaded above.
val.head()

Unnamed: 0,body,label
0,Saudi Arabia-headquartered utilities company A...,0
1,"Saudi-based Acwa Power, a major developer of p...",0
2,Tata Power has an installed hydro power capaci...,0
3,The project will generate around 450 GWh of cl...,0
4,"Adjaristsqali Georgia (AGL), a joint venture b...",0


In [7]:
# Count how many validation rows carry each label value (class balance check).
val['label'].value_counts()

0    202
1     53
Name: label, dtype: int64

## Train models for augmented texts

In [8]:
# Train and evaluate one TF-IDF + logistic-regression model per augmented
# data split (splits 0-4). Each iteration writes two files under `file_path`:
# the fitted vectorizer and the fitted classifier, both tagged "aug_<i>".
for i in range(0, 5):

    # read the input as pandas dataframes
    train = pd.read_csv(file_path + train_set_augmented + "%d.tsv" % i, sep='\t')
    val = pd.read_csv(file_path + val_set + "%d.tsv" % i, sep='\t')

    # Drop only rows that lack a text or a label: a missing value in any
    # other column (e.g. the stray "Unnamed: 0" index column) must not
    # discard a usable example. Assigning the result avoids in-place mutation.
    train = train.dropna(subset=['body', 'label'])
    val = val.dropna(subset=['body', 'label'])

    # training targets (y) and raw texts
    label = list(train['label'])
    body_texts = list(train['body'])

    print("Report_Augmented %d" % i)
    vectorized = TfidfVectorizer(
        min_df=0.0,
        analyzer='word',
        sublinear_tf=True,
        ngram_range=(1, 3),
        max_features=5000
    )

    # training features (x); the vocabulary is learned from the training
    # texts only
    matrix = vectorized.fit_transform(body_texts)

    # save the fitted TfidfVectorizer
    joblib.dump(vectorized, file_path + vector_file + "aug_%d.pkl" % i)

    # fit a class-weighted logistic regression on the TF-IDF features
    model = LogisticRegression(class_weight="balanced")
    model.fit(matrix, label)

    # save the model
    joblib.dump(model, file_path + model_file + "aug_%d.pkl" % i)

    # model evaluation: reuse the fitted vectorizer on the validation texts
    val_label = list(val['label'])
    test_body_texts = list(val['body'])
    val_matrix = vectorized.transform(test_body_texts)

    # Predict once and reuse the predictions for both metrics; calling
    # model.score() and model.predict() separately runs inference twice.
    val_pred = model.predict(val_matrix)

    print("Accuracy: %f" % accuracy_score(val_label, val_pred))

    print("Classification Report_%d" % i)
    print(classification_report(val_label, val_pred))

Report_Augmented 0
Accuracy: 0.941176
Classification Report_0
              precision    recall  f1-score   support

           0       0.95      0.98      0.96       202
           1       0.91      0.79      0.85        53

    accuracy                           0.94       255
   macro avg       0.93      0.89      0.91       255
weighted avg       0.94      0.94      0.94       255

Report_Augmented 1
Accuracy: 0.878431
Classification Report_1
              precision    recall  f1-score   support

           0       0.91      0.95      0.92       202
           1       0.75      0.62      0.68        53

    accuracy                           0.88       255
   macro avg       0.83      0.78      0.80       255
weighted avg       0.87      0.88      0.87       255

Report_Augmented 2
Accuracy: 0.776471
Classification Report_2
              precision    recall  f1-score   support

           0       0.94      0.76      0.84       202
           1       0.48      0.83      0.61        

## Train models for cropped texts

In [9]:
# Train and evaluate one TF-IDF + logistic-regression model per cropped
# data split (splits 0-4). Each iteration writes two files under `file_path`:
# the fitted vectorizer and the fitted classifier, both tagged "crop_<i>".
for i in range(0, 5):

    # read the input as pandas dataframes
    train = pd.read_csv(file_path + train_set_crop + "%d.tsv" % i, sep='\t')
    val = pd.read_csv(file_path + val_set + "%d.tsv" % i, sep='\t')

    # Drop only rows that lack a text or a label: a missing value in any
    # other column (e.g. a stray "Unnamed: 0" index column) must not
    # discard a usable example. Assigning the result avoids in-place mutation.
    train = train.dropna(subset=['body', 'label'])
    val = val.dropna(subset=['body', 'label'])

    # training targets (y) and raw texts
    label = list(train['label'])
    body_texts = list(train['body'])

    print("Report_Crop %d" % i)
    vectorized = TfidfVectorizer(
        min_df=0.0,
        analyzer='word',
        sublinear_tf=True,
        ngram_range=(1, 3),
        max_features=5000
    )

    # training features (x); the vocabulary is learned from the training
    # texts only
    matrix = vectorized.fit_transform(body_texts)

    # save the fitted TfidfVectorizer
    joblib.dump(vectorized, file_path + vector_file + "crop_%d.pkl" % i)

    # fit a class-weighted logistic regression on the TF-IDF features
    model = LogisticRegression(class_weight="balanced")
    model.fit(matrix, label)

    # save the model
    joblib.dump(model, file_path + model_file + "crop_%d.pkl" % i)

    # model evaluation: reuse the fitted vectorizer on the validation texts
    val_label = list(val['label'])
    test_body_texts = list(val['body'])
    val_matrix = vectorized.transform(test_body_texts)

    # Predict once and reuse the predictions for both metrics; calling
    # model.score() and model.predict() separately runs inference twice.
    val_pred = model.predict(val_matrix)

    print("Accuracy: %f" % accuracy_score(val_label, val_pred))

    print("Classification Report_%d" % i)
    print(classification_report(val_label, val_pred))

Report_Crop 0
Accuracy: 0.972549
Classification Report_0
              precision    recall  f1-score   support

           0       0.99      0.98      0.98       202
           1       0.93      0.94      0.93        53

    accuracy                           0.97       255
   macro avg       0.96      0.96      0.96       255
weighted avg       0.97      0.97      0.97       255

Report_Crop 1
Accuracy: 0.803922
Classification Report_1
              precision    recall  f1-score   support

           0       0.96      0.78      0.86       202
           1       0.52      0.89      0.65        53

    accuracy                           0.80       255
   macro avg       0.74      0.83      0.76       255
weighted avg       0.87      0.80      0.82       255

Report_Crop 2
Accuracy: 0.666667
Classification Report_2
              precision    recall  f1-score   support

           0       0.97      0.60      0.74       202
           1       0.38      0.92      0.54        53

    accurac