In [1]:
import os.path
import pickle
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB

nltk.download('stopwords')


def process_text(df):
    df['Summary'].fillna('', inplace=True)
    df['Text'].fillna('', inplace=True)
    df['Summary'] = df['Summary'].apply(stem_sent)
    df['Text'] = df['Text'].apply(stem_sent)
    return df


def stem_sent(text):
    if not text:
        return

    stemmer = SnowballStemmer('english')
    text = text.lower()

    words = word_tokenize(text)
    result = []
    for w in words:
        # remove all punctuation
        if w in string.punctuation or w in stopwords.words('english'):
            continue
        result.append(stemmer.stem(w))
    return ' '.join(result)


def calc_counter(text, vocab_counter):
    counter = Counter(text.split())
    vocab_counter += counter


def one_hot(text, vocabulary):
    vec = [0] * (len(vocabulary) + 1)
    counter = Counter(text.split())
    for k, v in counter.items():
        if k in vocabulary:
            vec[vocabulary[k]] = v
        else:
            vec[0] += v
    return vec


def convert2Id(text, id_dict):
    if text in id_dict:
        return id_dict[text]
    else:
        return 0



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\97661\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Load files into DataFrames
print('loading data...')
if os.path.exists('./data/X_train_stem.csv'):
    X_train = pd.read_csv("./data/X_train_stem.csv", index_col=0)
    X_submission = pd.read_csv("./data/X_test_stem.csv", index_col=0)
    X_train[['Summary', 'Text']] = X_train[['Summary', 'Text']].astype(str)
    X_submission[['Summary', 'Text']] = X_submission[['Summary', 'Text']].astype(str)
else:
    X_train = pd.read_csv("./data/X_train.csv", index_col=0)
    X_submission = pd.read_csv("./data/X_test.csv", index_col=0)
    X_train[['Summary', 'Text']] = X_train[['Summary', 'Text']].astype(str)
    X_submission[['Summary', 'Text']] = X_submission[['Summary', 'Text']].astype(str)
    # stem word
    print('stem data')
    X_train = process_text(X_train)
    X_train.to_csv("./data/X_train_stem.csv")
    X_submission = process_text(X_submission)
    X_submission.to_csv("./data/X_test_stem.csv")

In [None]:
date = pd.to_datetime(X_train['Time'], unit='s')
X_train['Day'] = date.dt.day
X_train['Month'] = date.dt.month
X_train['Year'] = date.dt.year

date = pd.to_datetime(X_submission['Time'], unit='s')
X_submission['Day'] = date.dt.day
X_submission['Month'] = date.dt.month
X_submission['Year'] = date.dt.year

In [None]:
from sklearn import preprocessing
import numpy as np
from sklearn.compose import make_column_transformer

column_trans = make_column_transformer(
    # (preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['ProductId', 'UserId']),
    (preprocessing.MinMaxScaler(), ['Day', 'Month', 'Year']), 
    ('passthrough', ['Helpfulness']),
    (TfidfVectorizer(stop_words={'english'}, min_df=10, ngram_range=(1,1)), 'Summary'),
    (TfidfVectorizer(stop_words={'english'}, min_df=10, ngram_range=(2,2)), 'Text'),
    n_jobs=-1
)

In [None]:
X_train.head()

In [None]:
x = X_train[:300]
column_trans.fit(x)
newdata = column_trans.transform(x)
# newdata.toarray()
newdata.shape
newdata

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
 
# steps = make_pipeline(column_trans, ComplementNB())
train_set = column_trans.fit_transform(X_train)

cv_results = cross_validate(ComplementNB(), train_set, X_train['Score'], cv=10, return_train_score=True, return_estimator=True)
print(cv_results['test_score'])
print(cv_results['train_score'])

[0.60730999 0.6003406  0.6073243  0.60290656 0.61446981 0.62774323
 0.63443361 0.64138158 0.65020429 0.65385358]
[0.75072966 0.75158037 0.7514937  0.75129911 0.75225636 0.75226431
 0.75248851 0.75170459 0.75098825 0.7504985 ]


In [None]:
print(len(cv_results))
print(cv_results['test_score'][cv_results['test_score'].argmax()])

test_set = column_trans.transform(X_submission)
optimal = cv_results['estimator'][cv_results['test_score'].argmax()]
X_submission['Score'] = optimal.predict(test_set)
X_submission[['Score']].to_csv('./data/submission.csv')

In [None]:
pre = optimal.predict(train_set)

# Plot a confusion matrix
cm = confusion_matrix(X_train['Score'], pre, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the Summary classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# 8 
tfidf = TfidfVectorizer(stop_words={'english'}, min_df=10, ngram_range=(1,2))
data = tfidf.fit_transform(X_train['Text'])

cv_results = cross_validate(ComplementNB(), data, X_train['Score'], cv=10, return_train_score=True, return_estimator=True)

print(cv_results['test_score'])
print(cv_results['train_score'])

[0.58670235 0.58074188 0.58299584 0.57623092 0.59130037 0.60785099
 0.61660215 0.61798316 0.62644094 0.62785772]
[0.73036996 0.73222323 0.73152756 0.73211372 0.7331942  0.73355754
 0.73379923 0.73266629 0.73150551 0.73122486]


In [9]:
print(len(cv_results))
optimal = cv_results['estimator'][cv_results['test_score'].argmax()]
print(cv_results['test_score'][cv_results['test_score'].argmax()])

testset = tfidf.transform(X_submission['Text'])
result = optimal.predict(testset)

5
0.6278577204067176


In [12]:
from sklearn.linear_model import SGDClassifier

svm = SGDClassifier()
svm_results = cross_validate(svm, data, X_train['Score'], cv=10, return_train_score=True, return_estimator=True)

print(svm_results['test_score'])
print(svm_results['train_score'])

[0.57410164 0.56855618 0.58014797 0.58746503 0.5807675  0.58229877
 0.58481034 0.58728614 0.58738632 0.58571193]
[0.60498863 0.6055078  0.60357583 0.60365406 0.60355865 0.60409929
 0.60286696 0.60187314 0.60227385 0.60236687]


In [13]:
from sklearn.svm import LinearSVC

svc = LinearSVC(C=1000)
svc_results = cross_validate(svm, data, X_train['Score'], cv=10, return_train_score=True, return_estimator=True)

print(svc_results['test_score'])
print(svc_results['train_score'])

[0.57413026 0.56826996 0.58087783 0.58742925 0.58032386 0.58205548
 0.58491768 0.58751512 0.58759383 0.58553305]
[0.60442892 0.60510948 0.60429058 0.60363895 0.60337976 0.60300609
 0.60341475 0.60268489 0.6029234  0.60195662]


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# pre = 

# Plot a confusion matrix
cm = confusion_matrix(X_train['Score'], y_train, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the Summary classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()