In [None]:
import collections
import functools
import os
import re, string, unicodedata
from string import punctuation

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot
from wordcloud import WordCloud, STOPWORDS
from bs4 import BeautifulSoup

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

from keras.models import Sequential
from keras.layers import Dense

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
# Parenthesised so the name list can span several lines.
# NOTE(review): plot_confusion_matrix is not available in recent scikit-learn
# releases — confirm the installed version still provides it.
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    plot_confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the dataset as JSON Lines (one JSON object per line) and preview it.
# NOTE(review): the path argument is an empty string here — it has to point at
# the dataset file before this cell can run; confirm the intended location.
df = pd.read_json('', lines = True)
df.head()

In [None]:
# Report the frame's dimensions, then its column dtypes and non-null counts.
# A bare `df.shape` that is not the last expression of a cell is evaluated and
# thrown away, so it is printed explicitly.
print(df.shape)
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics from DataFrame.describe().
df.describe()

In [None]:
# Number of distinct categories, followed by the distinct values themselves.
print(df['category'].nunique())
df['category'].unique()

In [None]:
# Number of rows in each category.
print(df.groupby('category').size())

In [None]:
import datetime

# Derive each row's calendar year from its `date` column.
df['year'] = pd.to_datetime(df['date']).dt.year
df.head()

In [None]:
# Derive each row's calendar month (1-12) from its `date` column.
df['month'] = pd.to_datetime(df['date']).dt.month
df.head()

In [None]:
# Distinct years and months present in the derived columns.
print(df['year'].unique())
print(df['month'].unique())

In [None]:
def category_merge(x):
    """Collapse overlapping category labels into a single canonical name.

    Args:
        x: A category label.

    Returns:
        The canonical label when `x` is a known alias, otherwise `x` unchanged.
    """
    aliases = {
        'THE WORLDPOST': 'WORLDPOST',
        'TASTE': 'FOOD & DRINK',
        'STYLE': 'STYLE & BEAUTY',
        'PARENTING': 'PARENTS',
        'COLLEGE': 'EDUCATION',
        'ARTS': 'ARTS & CULTURE',
        'CULTURE & ARTS': 'ARTS & CULTURE',
    }
    return aliases.get(x, x)
    
# Merge alias categories row by row, then integer-encode the merged labels.
merged_categories = df['category'].apply(category_merge)
df['category'] = merged_categories
le = LabelEncoder()
data_labels = le.fit_transform(df['category'])
le.classes_.tolist()

In [None]:
# Number of distinct raw author strings, followed by the values themselves.
print(df['authors'].nunique())
df['authors'].unique()

In [None]:
# Keep only the text before the first comma of each author string and strip
# every space from it.
df['authors'] = df['authors'].apply(lambda x: x.split(',')[0].replace(' ', ''))
df['authors'].unique()

In [None]:
# Number of rows per normalised author string.
print(df.groupby('authors').size())

In [None]:
# Pie chart: share of rows per year.
# The previous version passed the entire `df['year']` column as `marker.colors`.
# That argument expects one colour per sector; it was given one year number per
# row instead, so the explicit colours are dropped and Plotly's defaults apply.
year_counts = df['year'].value_counts()  # computed once, reused for labels and values
labels = year_counts.index
values = year_counts.values

figure = go.Figure(data = [go.Pie(labels = labels, values = values,
                              textinfo = "label+percent")])
figure.show()

In [None]:
# Pie chart: share of rows per month.
# The previous version passed the entire `df['month']` column as `marker.colors`.
# That argument expects one colour per sector; it was given one month number per
# row instead, so the explicit colours are dropped and Plotly's defaults apply.
month_counts = df['month'].value_counts()  # computed once, reused for labels and values
labels = month_counts.index
values = month_counts.values

figure = go.Figure(data = [go.Pie(labels = labels, values = values,
                               textinfo = "label+percent")])
figure.show()

In [None]:
# Pie chart: share of rows per category, with the third sector pulled outwards.
# The previous version passed the entire `df['category']` column as
# `marker.colors`. That argument expects one colour per sector; it was given one
# category name per row instead, so the explicit colours are dropped and
# Plotly's defaults apply.
category_counts = df['category'].value_counts()  # computed once, reused for labels and values
labels = category_counts.index
values = category_counts.values

figure = go.Figure(data = [go.Pie(labels = labels, values = values,
                               textinfo = "label+percent",
                               pull=[0, 0, 0.2, 0])])
figure.show()

In [None]:
# Matplotlib pie of the category distribution; only the first wedge is exploded.
plt.figure(figsize=(20,20))
category_counts = df['category'].value_counts()
sizes = category_counts.values
labels = category_counts.index
wedge_offsets = [0.2 if position == 0 else 0 for position, _ in enumerate(sizes)]
plt.pie(
    sizes,
    labels=labels,
    autopct='%.1f%%',
    shadow=True,
    pctdistance=0.85,
    labeldistance=1.05,
    startangle=20,
    explode=wedge_offsets,
)
plt.axis('equal')
plt.show()

In [None]:
# Horizontal bar chart of the first five entries of the category value counts.
top_categories = df['category'].value_counts()[:5]
sns.barplot(y=top_categories.index, x=top_categories.values, orient='h')

In [None]:
# Integer-encoded category labels produced by the LabelEncoder.
data_labels

In [None]:
# Attach the encoded labels to the frame as the `target` column.
df['target'] = data_labels
df.head()

In [None]:
# Distinct encoded label values.
df['target'].unique()

In [None]:
# Text-cleaning helper
def clean_text(text):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lowercase; remove ``[...]`` spans; remove URLs; remove
    ``<...>`` tags; remove ASCII punctuation; replace newlines with spaces;
    remove every word that contains a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is not
        collapsed.
    """
    # Raw-string patterns: '\[', '\S', '\w' and '\d' are invalid escapes in
    # ordinary string literals and trigger warnings on recent Python versions.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words; deleting it outright glued "one\ntwo" into "onetwo".
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Clean the text of every short_description and overwrite the column.
df['short_description'] = df['short_description'].apply(clean_text)

In [None]:
# Preview the frame after cleaning short_description.
df.head()

In [None]:
# Build a word cloud from all short_description texts joined by spaces.
corpus_text = " ".join(df['short_description'])
cloud_options = dict(
    width=1600,
    height=800,
    margin=0,
    max_words=500,
    max_font_size=150,
    min_font_size=30,
    background_color="white",
)
word_cloud = WordCloud(**cloud_options).generate(corpus_text)

# Render the cloud without axes.
plt.figure(figsize=(10, 16))
plt.imshow(word_cloud, interpolation="gaussian")
plt.title('WordCloud of short_description', fontsize = 40)
plt.axis("off")
plt.show()

In [None]:
# Compare four NLTK tokenizers on one example sentence.
print()
text = "I love you, don't you"

tokenizer1 = nltk.tokenize.WhitespaceTokenizer()
tokenizer2 = nltk.tokenize.TreebankWordTokenizer()
tokenizer3 = nltk.tokenize.WordPunctTokenizer()
tokenizer4 = nltk.tokenize.RegexpTokenizer(r'\w+')

print("Example Text: ", text)
for caption, demo_tokenizer in (
    ("Tokenization by whitespace: ", tokenizer1),
    ("Tokenization by words using Treebank Word Tokenizer: ", tokenizer2),
    ("Tokenization by punctuation: ", tokenizer3),
    ("Tokenization by regular expression: ", tokenizer4),
):
    print(caption, demo_tokenizer.tokenize(text))

In [None]:
# Split every short_description into word tokens (runs of \w characters).
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['short_description'] = df['short_description'].apply(tokenizer.tokenize)

In [None]:
# Blank line, heading, then the first tokenised rows.
print('\nTokenized string:')
df['short_description'].head()

In [None]:
# Fetch the NLTK 'stopwords' resource (read later through stopwords.words('english')).
nltk.download('stopwords')

In [None]:
@functools.lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built on first use."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Filter English stopwords out of a sequence of tokens.

    Args:
        text: Iterable of word tokens (already tokenised, not a raw string).

    Returns:
        A list of the tokens that are not in NLTK's English stopword list, in
        their original order.
    """
    # The previous version evaluated stopwords.words('english') once per token
    # and scanned the resulting list linearly; fetch the set once and use
    # constant-time membership tests instead.
    stop_set = _english_stopwords()
    return [word for word in text if word not in stop_set]

In [None]:
# Drop English stopwords from every token list.
df['short_description'] = df['short_description'].apply(remove_stopwords)

In [None]:
# Preview the frame after stopword removal.
df.head()

In [None]:
# Fetch the NLTK 'wordnet' resource (presumably for the WordNetLemmatizer demo that follows).
nltk.download('wordnet')

In [None]:
# Demonstrate stemming versus lemmatisation on one example sentence.
# Note: this rebinds the notebook-level name `tokenizer`.
text = "How is the Josh"

tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

stemmer = nltk.stem.PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in tokens]
print("Stemming the sentence: ", " ".join(stemmed_tokens))

lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
print("Lemmatizing the sentence: ", " ".join(lemmatized_tokens))

In [None]:
def combine_text(list_of_text):
    """Join an iterable of strings into one string separated by single spaces."""
    return str.join(' ', list_of_text)

# Turn every token list back into a single space-separated string.
df['short_description'] = df['short_description'].apply(combine_text)
df.head()

In [None]:
def text_preprocessing(text, stop_words=None):
    """Clean, tokenise and stopword-filter one raw text string.

    Args:
        text: Raw input string.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stopword list. Pass a prebuilt collection when calling this
            in a loop so the list is not fetched again on every call.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # One lookup table per call; the previous version re-evaluated
    # stopwords.words('english') and scanned the list for every token.
    stop_set = frozenset(stop_words)

    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(clean_text(text))
    # Named `kept_tokens`: the old local `remove_stopwords` shadowed the
    # function of the same name.
    kept_tokens = [word for word in tokens if word not in stop_set]
    return ' '.join(kept_tokens)

In [None]:
# Fit a bag-of-words vocabulary on short_description and build the count matrix.
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(df['short_description'])

# Show the first row of the count matrix in dense form.
first_row = train_vectors[0]
print(first_row.todense())

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)); min_df=2 and
# max_df=0.5 bound the document frequency of the terms that are kept.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(df['short_description'])

In [None]:
# Candidate classifiers keyed by display name, each built with default parameters.
# NOTE(review): the keys "LogisiticRegression" and "MultinimialNB" are
# misspelled; they are left untouched because code outside this view may look
# them up by these exact strings.
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "MultinimialNB": MultinomialNB()
}

In [None]:
from sklearn import preprocessing
# Fit a StandardScaler on the `target` column reshaped to a single feature column.
scaler = preprocessing.StandardScaler().fit(df['target'].values.reshape(-1,1))
# NOTE(review): the transformed array is only displayed, never assigned, and
# `target` holds LabelEncoder class codes — confirm that scaling them is intended.
scaler.transform(df['target'].values.reshape(-1,1))

In [None]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

# Rows whose combined text encodes to fewer tokens than this are dropped.
MIN_WORD_COUNT = 5

# Combined text: headline followed by short_description, separated by a space.
df['text'] = df.headline + " " + df.short_description

# Fit the Keras tokenizer on the combined text and encode every row as a list
# of integer word indices. Note: this rebinds the notebook-level name `tokenizer`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X

# Sequence length per row.
df['word_length'] = df.words.apply(len)
# .copy() detaches the filtered frame from the original, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
# NOTE(review): arrays built earlier from the unfiltered frame (data_labels,
# train_vectors, train_tfidf, X) no longer line up row-for-row with `df`.
df = df[df.word_length >= MIN_WORD_COUNT].copy()

df.head()