In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

### Data Preprocessing

#### Data load

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
file_path = '/content/drive/MyDrive/boaz/base_project/datasets/'

In [4]:
train_data = pd.read_csv(file_path + 'i1e0_train.csv', encoding='UTF-8')
val_data = pd.read_csv(file_path + 'i1e0_validation.csv', encoding='UTF-8')
test_data = pd.read_csv(file_path + 'i1e0_test.csv', encoding='UTF-8')

In [5]:
train_data = train_data.groupby('type').sample(frac=0.05)
train_data.reset_index(drop = True, inplace = True)

test_data = test_data.groupby('type').sample(frac=0.05)
test_data.reset_index(drop = True, inplace = True)

val_data = val_data.groupby('type').sample(frac=0.05)
val_data.reset_index(drop = True, inplace = True)

In [6]:
train_data.shape[0]

4705

### Preprocessing

#### Train, Validation, Test Split

In [7]:
X_train, X_test, X_val = train_data['posts'], test_data['posts'], val_data['posts']
y_train, y_test, y_val = train_data['i1e0'], test_data['i1e0'], val_data['i1e0']

In [8]:
X_train.shape, X_test.shape

((4705,), (1569,))

In [9]:
# TF-IDF Vectorizer 생성

from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(data, tfidf_vec_fit):
    X_tfidf = tfidf_vect_fit.transform(data)
    words = tfidf_vect_fit.get_feature_names()
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [10]:
tmp_data = pd.concat([X_train, X_test, X_val])
 
tfidf_vect = TfidfVectorizer(analyzer = 'word')
tfidf_vect_fit = tfidf_vect.fit(tmp_data)

X_train = vectorize(X_train,tfidf_vect_fit)
X_test = vectorize(X_test,tfidf_vect_fit)
X_val= vectorize(X_val,tfidf_vect_fit)



### Modeling

#### Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [19]:
nb_model = GaussianNB()

In [20]:
nb_model.fit(X_train, y_train)

GaussianNB()

In [21]:
pred = nb_model.predict(X_test)

In [22]:
f1 = f1_score(y_test, pred)
print('f1 score :', f1)

f1 score : 0.7121535181236675


In [23]:
# Gaussian NB의 경우 별도의 하이퍼라미터가 존재하지 않음
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.68      0.87      0.77       762
           1       0.83      0.62      0.71       807

    accuracy                           0.74      1569
   macro avg       0.76      0.75      0.74      1569
weighted avg       0.76      0.74      0.74      1569

