In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.manifold import TSNE
import re

**1. Phân tích dữ liệu**

In [None]:
# Locations of the competition's train/test CSV files.
train_data_path = "/kaggle/input/quora-insincere-questions-classification/train.csv"
test_data_path = "/kaggle/input/quora-insincere-questions-classification/test.csv"

# Read both files into DataFrames in a single pass over the two paths.
train_data, test_data = map(pd.read_csv, (train_data_path, test_data_path))
train_data

In [None]:
# Show the test DataFrame as this cell's output.
test_data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(train_data_path)
df['target'].value_counts().plot.bar(title='Target')
plt.show()

In [None]:
# Class balance of the target column: absolute counts next to percentages.
target_col = train_data['target']
value_counts = target_col.value_counts()
percent = target_col.value_counts(normalize=True).mul(100).round(1)
value_counts_percentage = percent.astype(str) + '%'
pd.concat([value_counts, value_counts_percentage], keys=['Counts', 'Percentage'], axis=1)

In [None]:
# Histograms of question length: number of words (left) and number of
# characters (right). Questions with 80 or more words / 200 or more
# characters are left out of the respective histogram.
questions = train_data['question_text']
word_counts = (len(q.split()) for q in questions)
word_length_list = [n for n in word_counts if n < 80]
char_length_list = [n for n in map(len, questions) if n < 200]

fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)
panels = zip(
    axs,
    (word_length_list, char_length_list),
    ('Words in Questions', 'Length of Questions'),
)
for ax, lengths, title in panels:
    ax.hist(lengths, bins=25)
    ax.set_title(title)
plt.show()

**2. Xử lý dữ liệu**

In [None]:
from nltk.tokenize import word_tokenize
# English stop words from the NLTK stopwords corpus, held as a list.
stop_words = [*stopwords.words('english')]

# Punctuation marks and special symbols that clean_text isolates from words.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


# Separate punctuation from the surrounding text
def clean_text(x):
    """Return ``str(x)`` with every symbol from ``puncts`` padded by spaces.

    The symbols are not deleted: each occurrence is replaced by itself with
    one space on either side, so it becomes a separate whitespace-delimited
    token.
    """
    text = str(x)
    for mark in puncts:
        padded = ' ' + mark + ' '
        text = text.replace(mark, padded)
    return text

# Remove digits
def clean_numbers(x):
    """Replace every run of ASCII digits in ``x`` with a single space.

    The pattern matches runs of any length, so single digits and the last
    digit of an odd-length number are removed as well (a fixed two-digit
    pattern would leave them behind).

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        ``x`` with each digit run replaced by one space.
    """
    return re.sub(r'[0-9]+', ' ', x)

# Remove stop words
# Frozen snapshot of the stop-word list taken when this cell runs; gives
# constant-time membership tests instead of scanning the list per token.
_STOP_WORD_SET = frozenset(stop_words)

def remove_stopwords(x):
    """Tokenize ``x`` and drop every token that is a stop word.

    Tokens are compared in lowercase because the entries of ``stop_words``
    are lowercase; a case-sensitive comparison would keep capitalized stop
    words such as the "What" or "How" that open a question. Tokens that are
    kept retain their original casing.

    Parameters
    ----------
    x : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(x)
    kept = [tok for tok in tokens if tok.lower() not in _STOP_WORD_SET]
    return " ".join(kept)

In [None]:
def data_clean(x):
  """Run the full cleaning pipeline on one piece of text.

  The steps are applied in order: clean_text, clean_numbers,
  remove_stopwords. The output of each step is the input of the next.
  """
  for step in (clean_text, clean_numbers, remove_stopwords):
    x = step(x)
  return x

In [None]:
# Clean the question text of both the train and the test set
for frame in (train_data, test_data):
    frame['question_text_cleaned'] = frame['question_text'].apply(data_clean)
display(train_data.head(), test_data.head())

In [None]:
# Show the first rows of the train and test frames.
for frame in (train_data, test_data):
    display(frame.head())

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

**3. Vector hoá dữ liệu**

In [None]:
# Shared TF-IDF vectorizer over word 1-grams, 2-grams and 3-grams.
tfidf = TfidfVectorizer(ngram_range=(1, 3))

def predict_linearSVC(X_train,y_train,X_test):
    """Fit TF-IDF + LinearSVC on the training texts and predict for X_test.

    The module-level ``tfidf`` vectorizer is refit on ``X_train`` on every
    call, and a fresh ``LinearSVC`` is trained each time.

    Parameters
    ----------
    X_train : iterable of str
        Training texts.
    y_train : array-like
        Labels for ``X_train``.
    X_test : iterable of str
        Texts to predict labels for.

    Returns
    -------
    The value returned by ``LinearSVC.predict`` for the vectorized ``X_test``.
    """
    # fit_transform learns the vocabulary and builds the training matrix in
    # one pass, instead of tokenizing the training texts twice.
    X_train_vec = tfidf.fit_transform(X_train)
    X_test_vec = tfidf.transform(X_test)
    svm = LinearSVC()
    svm.fit(X_train_vec, y_train)
    return svm.predict(X_test_vec)

**4. Mô hình**

In [None]:
# Hold out 20% of the labelled data for evaluation and train on the other 80%
X = train_data['question_text_cleaned']
y = train_data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)


In [None]:
# Train on the training split and score the predictions on the held-out split.
predict = predict_linearSVC(X_train, y_train, X_test)
# scikit-learn metrics take (y_true, y_pred); pass them in that order so the
# call matches classification_report below.
print('F1 score :', f1_score(y_test, predict), '\n')
print(classification_report(y_test, predict))

**5. Tạo submission**

In [None]:
# Cleaned questions of the competition test set.
pred_data = test_data.question_text_cleaned

# predict_linearSVC fits the vectorizer and classifier again on the training
# split before predicting labels for the test questions.
predict = predict_linearSVC(X_train=X_train, y_train=y_train, X_test=pred_data)

In [None]:
# Build the submission table (question id + predicted label) and write it
# to submission.csv without the index column.
qid_frame = pd.DataFrame({'qid': test_data['qid'].values})
submit = qid_frame.assign(prediction=predict)
submit.to_csv('submission.csv', index=False)

In [None]:
# Show the submission DataFrame as this cell's output.
submit