In [69]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

import nltk
import re
import regex

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize


In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AW\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AW\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AW\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
train = pd.read_csv('raw_train.csv')
test = pd.read_csv('raw_test.csv')


In [44]:
train.head()

Unnamed: 0,type,posts
0,INFP,'One stereotype I disagree with is that INFPs ...
1,INTP,'The fridge and the buzzing of my roommates ph...
2,INFP,"'The thing is, the mbti is so much more than d..."
3,INFP,'Almost never. The only other results I got ot...
4,ENFP,'She was curious of how many others didn't mat...


In [45]:
test.head()

Unnamed: 0.1,Unnamed: 0,posts,ID
0,5443,'Captain America: ISFJ Iron Man: ENTP Thor: ES...,1
1,4886,'Is a X-Files fan. (What else is there to say?...,2
2,7127,'Thank you!|||This exactly. I think my SO is a...,3
3,3206,"'As stressful as school is, I'm happy to say t...",4
4,3528,Orthodox Iconoclast Yummy Donuts do you guys h...,5


### 1. Pre-processing data

In [46]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6506 entries, 0 to 6505
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    6506 non-null   object
 1   posts   6506 non-null   object
dtypes: object(2)
memory usage: 101.8+ KB


In [47]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2169 entries, 0 to 2168
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2169 non-null   int64 
 1   posts       2169 non-null   object
 2   ID          2169 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 51.0+ KB


In [24]:
train.duplicated().sum()

0

In [None]:
train.duplicated().sum()

In [54]:
test.isna().sum()

Unnamed: 0    0
posts         0
ID            0
dtype: int64

In [55]:
test.isnull().sum()

Unnamed: 0    0
posts         0
ID            0
dtype: int64

In [78]:
def preprocess_text(text):
    document = text.lower()
    document = document.replace("’",'')
    document = document.replace("'",'')

    document = regex.sub(r'\.+', ".", document)
    new_sentence =''
    for sentence in sent_tokenize(document):
        
        sentence = regex.sub(r'http\S+', '', sentence)
        sentence = regex.sub(r'[A-Za-z0-9]*@[A-Za-z]*\.?[A-Za-z0-9]*', '', sentence)
        
        # Tokenize the text into individual words
        words = word_tokenize(sentence)

        # Remove punctuation
        words = [word for word in words if word not in string.punctuation]

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        # Join the words back into a single string
        sentence = ' '.join(words)
        new_sentence = new_sentence+ sentence + '. '                    
    document = new_sentence  
    #print(document)
     ###### DEL excess blank space
    document = regex.sub(r'\s+', ' ', document).strip()
    return document


   

In [79]:
train.cleaned = train.posts.apply(lambda x: preprocess_text(x))
test.cleaned = test.posts.apply(lambda x: preprocess_text(x))

In [80]:
train.head()

Unnamed: 0,type,posts,cleaned
0,INFP,'One stereotype I disagree with is that INFPs ...,one stereotype disagree infps one giant cause....
1,INTP,'The fridge and the buzzing of my roommates ph...,fridge buzzing roommate phone.|||have ever don...
2,INFP,"'The thing is, the mbti is so much more than d...",thing mbti much test read forget. come site li...
3,INFP,'Almost never. The only other results I got ot...,almost never. result got 9 either 6 ot 5. im s...
4,ENFP,'She was curious of how many others didn't mat...,curious many others didnt match skill likely p...


In [81]:
test.head()

Unnamed: 0.1,Unnamed: 0,posts,ID,cleaned
0,5443,'Captain America: ISFJ Iron Man: ENTP Thor: ES...,1,captain america isfj iron man entp thor esxp b...
1,4886,'Is a X-Files fan. (What else is there to say?...,2,x-files fan. else say. |||sorry say think your...
2,7127,'Thank you!|||This exactly. I think my SO is a...,3,thank |||this exactly. think infp. cant seem a...
3,3206,"'As stressful as school is, I'm happy to say t...",4,stressful school im happy say may next year il...
4,3528,Orthodox Iconoclast Yummy Donuts do you guys h...,5,orthodox iconoclast yummy donut guy source num...


### 2. Tranformation

In [83]:
X_train = train['cleaned']
y_train = train['type']

X_test = test['cleaned']

In [86]:
X_train.shape

(6506,)

In [87]:
y_train.shape

(6506,)

In [88]:
X_test.shape

(2169,)

In [None]:
X_test.shape

In [95]:
count = CountVectorizer()
count.fit(X_train)
bag_of_words_train = count.transform(X_train)
bag_of_words_train

<6506x88826 sparse matrix of type '<class 'numpy.int64'>'
	with 2853366 stored elements in Compressed Sparse Row format>

In [96]:
bag_of_words_test = count.transform(X_test)
bag_of_words_test

<2169x88826 sparse matrix of type '<class 'numpy.int64'>'
	with 930142 stored elements in Compressed Sparse Row format>

In [98]:
X_train_T = bag_of_words_train.toarray()
X_train_T

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [100]:
X_train_T.shape

(6506, 88826)

In [101]:
X_test_T = bag_of_words_test.toarray()
X_test_T

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [102]:
X_test_T.shape

(2169, 88826)

### 3. Build Model

In [104]:
nb = MultinomialNB()
model = nb.fit(X_train_T, y_train)


In [105]:
y_pred = model.predict(X_test_T)

In [125]:
y_pred

array(['INFJ', 'INTP', 'INFP', ..., 'INFP', 'INFP', 'INFP'], dtype='<U4')

In [106]:
print('Score Scikit learn - train: ', model.score(X_train_T, y_train))

Score Scikit learn - train:  0.6885951429449738


### 4. Evalutaion model

In [107]:
solution = pd.read_csv("solution.csv")
solution.head()

Unnamed: 0,Id,Category
0,1,INFJ
1,2,INTJ
2,3,ENTJ
3,4,ISFP
4,5,ENTP


In [108]:
test_sub = test[['ID','cleaned']]

In [110]:
test_sub.head()

Unnamed: 0,ID,cleaned
0,1,captain america isfj iron man entp thor esxp b...
1,2,x-files fan. else say. |||sorry say think your...
2,3,thank |||this exactly. think infp. cant seem a...
3,4,stressful school im happy say may next year il...
4,5,orthodox iconoclast yummy donut guy source num...


In [111]:
test_sub.rename({"ID":"Id"}, axis="columns", inplace=True)
test_sub = pd.merge(test_sub, solution, on="Id")
y_test = test_sub['Category']
y_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_sub.rename({"ID":"Id"}, axis="columns", inplace=True)


0    INFJ
1    INTJ
2    ENTJ
3    ISFP
4    ENTP
Name: Category, dtype: object

In [112]:
print('Score Scikit learn: ', model.score(X_test_T, y_test))

Score Scikit learn:  0.38589211618257263


In [124]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

In [116]:
print(classification_report(y_pred, y_test, digits=4))

              precision    recall  f1-score   support

        ENFJ     0.0000    0.0000    0.0000         0
        ENFP     0.0485    0.5714    0.0894        14
        ENTJ     0.0000    0.0000    0.0000         0
        ENTP     0.0376    0.4667    0.0697        15
        ESFJ     0.0000    0.0000    0.0000         0
        ESFP     0.0000    0.0000    0.0000         0
        ESTJ     0.0000    0.0000    0.0000         0
        ESTP     0.0000    0.0000    0.0000         0
        INFJ     0.5077    0.4422    0.4727       450
        INFP     0.8690    0.3452    0.4941      1153
        INTJ     0.2036    0.6404    0.3089        89
        INTP     0.5979    0.3750    0.4609       448
        ISFJ     0.0000    0.0000    0.0000         0
        ISFP     0.0000    0.0000    0.0000         0
        ISTJ     0.0000    0.0000    0.0000         0
        ISTP     0.0000    0.0000    0.0000         0

    accuracy                         0.3859      2169
   macro avg     0.1415   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [117]:
from sklearn.metrics import accuracy_score
print('Accuracy is ', accuracy_score(y_test, y_pred)*100,'%')

Accuracy is  38.589211618257266 %


- Nhận xét: Cả training và testing đều có score thấp
- Model có độ chính xác không cao