In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

In [None]:
# load dataset
# Other corpora that were tried; uncomment one of them to switch datasets.
#df = pd.read_csv("datasets/20_newsgroup.csv") # src: https://www.kaggle.com/c/learn-ai-bbc
#df = pd.read_csv("datasets/news_headlines.csv") # src: https://www.kaggle.com/datasets/rmisra/news-category-dataset
# TODO: find sport article dataset or sth similar

# Semicolon-separated file; every column is cast to str right after loading.
df = pd.read_csv('datasets/nasz.csv', delimiter=';').astype(str)

#df = df.groupby('category', group_keys=False).apply(lambda x: x.sample(1000)) # stratify -- take x elements of each category
df.head()

In [None]:
# Class balance: bar chart of the number of articles per category, followed by
# the exact counts. The column is passed by keyword (`x=` / `data=`) because
# recent seaborn releases accept only `data` as a positional argument, so the
# old positional form no longer plots the category counts.
sns.countplot(x='category', data=df) # generate class plot
print(df['category'].value_counts()) # print count for each class

In [None]:
# count number of words in each article
# Split on whitespace and count the resulting tokens. Calling `.str.len()`
# directly on the text would give the number of characters, not words.
df['word_count'] = df['text'].str.split().str.len()
# NOTE(review): `distplot` is deprecated in newer seaborn releases; consider
# `histplot`/`displot` once the minimum seaborn version is settled.
sns.distplot(df['word_count']).set_title('Article length distribution')

In [None]:
# generate word cloud for each class

def create_wordcloud(words, title):
    """Draw a word cloud for one block of text.

    Args:
        words: Text from which the 500x500 word cloud is generated.
        title: Title shown above the plot.
    """
    cloud = WordCloud(width=500, height=500).generate(words)
    _, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(cloud)
    ax.set_title(title)
    plt.show()

# One word cloud per category, built from all articles of that category
# joined into a single space-separated string.
for category in df['category'].unique():
    in_category = df['category'] == category
    txt = ' '.join(df.loc[in_category, 'text'])
    create_wordcloud(txt, category)

In [None]:
# Preview the frame (now including the word_count column) before the text is cleaned
df.head()

In [None]:
def clean_text(text: str) -> str:
    """Normalize raw article text before vectorization.

    The text is lower-cased, every character that is neither a word character
    (letter, digit or underscore) nor whitespace is dropped, each run of
    whitespace (spaces, tabs, newlines, carriage returns) is collapsed into a
    single space, and leading/trailing whitespace is trimmed.

    Punctuation is removed *before* whitespace is collapsed, so deleting a
    free-standing symbol (e.g. the dash in "a - b") cannot leave a double,
    leading or trailing space behind.

    Args:
        text: Raw article text.

    Returns:
        The lower-cased, punctuation-free, single-spaced text.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation and other symbols
    text = re.sub(r'\s+', ' ', text) # collapse any run of whitespace into one space
    # removing stopwords could be added here
    return text.strip()

In [None]:
# Clean every article, then recompute the per-article word count on the
# cleaned text. Tokens are counted after a whitespace split, because
# `.str.len()` on the raw string would count characters instead of words.
df['text'] = df['text'].apply(clean_text)
df['word_count'] = df['text'].str.split().str.len()
df.head()

In [None]:
# Features are the cleaned article texts, targets are the category names.
X = df['text'].tolist()
y = df['category'].tolist()
print(y[:3])

In [None]:
# label encoding
# Fit the encoder on the category names, then replace y with the integer codes.
# The fitted encoder is kept so the codes can be mapped back to names later.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)
print(y[:3])

In [None]:
# split data into train and test sets
# 80% of the samples go to training; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1410,
)
print(f'train: {len(X_train)}, test: {len(X_test)}')

In [None]:
# TFIDF
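# TF = count of specific word in article / number of words in article
# IDF = log(number of articles / number of articles containing specific word)
# TFIDF = TF * IDF -- for each article
# (scikit-learn's TfidfVectorizer uses a smoothed IDF and, with sublinear_tf=True, 1 + log(count) as TF)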

# Unigram + bigram TF-IDF features with English stop words removed.
# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary. Both results are densified.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=1,
    norm='l2',
    encoding='utf-8',
    ngram_range=(1, 2),
    stop_words='english',
)
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [None]:
# Bag-of-Words
# Alternative feature representation to TF-IDF. When the notebook is run top
# to bottom, X_train / X_test already hold numeric TF-IDF matrices at this
# point, which CountVectorizer cannot tokenize. The raw-text split is therefore
# rebuilt from X with the same train_size / random_state as the original split
# (X must still be the list of article texts here).
X_train_text, X_test_text = train_test_split(X, train_size=.8, random_state=1410)

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
X_train_bow = count_vectorizer.fit_transform(X_train_text).toarray()
X_test_bow = count_vectorizer.transform(X_test_text).toarray()

# The BoW matrices are kept under their own names so the TF-IDF features are
# not silently replaced. To train the models on Bag-of-Words instead, uncomment:
# X_train, X_test = X_train_bow, X_test_bow

In [None]:
# TODO:
# Word2Vec

In [None]:
# Baseline classifiers trained on the current X_train / y_train and evaluated
# on the held-out test split.
lr_model = LogisticRegression(multi_class="multinomial")
# Seeded with the same value as the train/test split so that the reported
# metrics are reproducible between runs.
rf_model = RandomForestClassifier(random_state=1410)
knn_model = KNeighborsClassifier()

models = [lr_model, rf_model, knn_model]

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Print the estimator's class name as a header for its report
    print(type(model).__name__)
    print(classification_report(y_test, y_pred))

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default architecture. The random_state matches
# the one used for the train/test split so that weight initialisation, and
# therefore the report below, is reproducible between runs.
mlp_model = MLPClassifier(random_state=1410)

mlp_model.fit(X_train, y_train)
y_pred = mlp_model.predict(X_test)
print(type(mlp_model).__name__)
print(classification_report(y_test, y_pred))


In [None]:
import pickle

# Persist the trained MLP to disk. The context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): only the classifier is saved here; the fitted vectorizer and
# label encoder are not, so they must still be in memory when the model is used.
with open('mlp_model.pkl', 'wb') as model_file:
    pickle.dump(mlp_model, model_file)

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
def twenty_newsgroup_to_csv() -> None:
    """Export the 20 newsgroups training subset to '20_newsgroup.csv'.

    Fetches the 'train' subset with headers, footers and quotes removed, pairs
    each post with its numeric target, attaches the target's name as a 'title'
    column and a 'date' column, and writes the result to the current working
    directory.
    """
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    # Two rows (texts, targets) transposed into two columns
    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']

    # Lookup table of target names; its default index is the position in target_names
    targets = pd.DataFrame( newsgroups_train.target_names)
    targets.columns=['title']

    # Match each row's 'target' value against the lookup table's index to add 'title'
    out = pd.merge(df, targets, left_on='target', right_index=True)
    # Same timestamp for every row: the moment of export
    out['date'] = pd.to_datetime('now')
    # No index=False here, so the frame's index is written as an extra first column
    out.to_csv('20_newsgroup.csv')
# Runs the export whenever this cell is executed
twenty_newsgroup_to_csv()

In [None]:
def upload_data():
    """Convert '20_newsgroup.csv' into the two-column layout used by this notebook.

    Reads '20_newsgroup.csv' from the working directory, keeps its 'text'
    column, exposes its 'title' column under the name 'category', and writes
    the result without the index to 'datasets/20_newsgroup.csv'.
    """
    source = pd.read_csv('20_newsgroup.csv')
    converted = source[['text', 'title']].rename(columns={'title': 'category'})
    converted.to_csv('datasets/20_newsgroup.csv', index=False)
# Runs the conversion whenever this cell is executed; reads '20_newsgroup.csv' and writes 'datasets/20_newsgroup.csv'
upload_data()

In [None]:
# Reload the classifier pickled earlier in this notebook. The context manager
# closes the file handle once the model has been read.
# NOTE: pickle can execute arbitrary code on load -- only open files you created yourself.
with open('mlp_model.pkl', 'rb') as model_file:
    saved_model = pickle.load(model_file)

In [None]:
# Vectorize the full corpus with the TF-IDF vectorizer that was already fitted
# on the training split. Fitting a fresh vectorizer on all of X would learn a
# different vocabulary, so the columns (and their number) would no longer match
# the features the saved model was trained on.
# NOTE(review): assumes the saved model was trained on features from `tfidf`.
X = tfidf.transform(X).toarray()

In [None]:
# Sanity check: the full-corpus matrix and the training matrix are printed one
# per line so their shapes can be compared.
print(X.shape, X_train.shape, sep='\n')

df.head()

In [None]:
# Predict with the reloaded model. X contains the whole corpus, so most of
# these samples were also seen during training.
y_pred = saved_model.predict(X)

# Report the type of the model that actually produced the predictions (the
# reloaded one), not the in-memory `mlp_model` it was pickled from.
print(type(saved_model).__name__)
# True category names next to the predicted ones
print(label_encoder.inverse_transform(y),label_encoder.inverse_transform(y_pred))
